def main():
    """Entry point: build the TargQC summary for a bcbio project.

    The command line is parsed by ``bcbio_summary_script_proc_params`` with
    two extra option groups (a target BED file and a features BED file);
    the resulting samples are then passed to ``summarize_targqc``.
    """
    info(' '.join(sys.argv))
    info()

    bed_opt = (
        ['--bed', '--capture', '--amplicons'],
        dict(dest='bed',
             help='BED file to run targetSeq and Seq2C analysis on.'))
    features_opt = (
        ['--exons', '--exome', '--features'],
        dict(dest='features',
             help='Annotated CDS/Exons/Gene/Transcript BED file to make targetSeq exon/amplicon regions reports.'))

    cnf, bcbio_structure = bcbio_summary_script_proc_params(
        BCBioStructure.targqc_name,
        BCBioStructure.targqc_summary_dir,
        extra_opts=[bed_opt, features_opt])

    bed_fpath = adjust_path(cnf.bed)
    features_bed_fpath = adjust_path(cnf.features)

    # Fall back to one thread per sample when no thread count is configured.
    threads = cnf.threads or len(bcbio_structure.samples)

    summarize_targqc(
        cnf,
        threads,
        cnf.output_dir,
        bcbio_structure.samples,
        bed_fpath=bed_fpath,
        features_fpath=features_bed_fpath)
Esempio n. 2
0
def get_chr_lengths_from_seq(seq_fpath):
    """Return chromosome names and lengths of a reference genome.

    ``seq_fpath`` may point either to the FASTA file or to its ``.fai``
    index. The index located next to the FASTA is preferred; when it is
    missing, the FASTA itself is parsed with Biopython. If neither file can
    be verified, ``critical`` is called.

    Returns a list of ``(chrom, length)`` tuples in file order. ``length``
    is an ``int`` in both branches (the index branch used to return the raw
    string column, which was inconsistent with the FASTA branch).
    """
    if seq_fpath.endswith('.fai'):
        seq_fpath = splitext(seq_fpath)[0]
    fai_fpath = seq_fpath + '.fai'

    chr_lengths = []

    if verify_file(fai_fpath, silent=True):
        info('Reading genome index file (.fai) to get chromosome lengths')
        with open(adjust_path(fai_fpath), 'r') as handle:
            for line in handle:
                fields = line.split()
                if not fields:
                    # blank line
                    continue
                # First column is the sequence name, second is its length.
                chr_lengths.append((fields[0], int(fields[1])))
    elif verify_file(seq_fpath, silent=True):
        info('Reading genome sequence (.fa) to get chromosome lengths')
        with open(adjust_path(seq_fpath), 'r') as handle:
            # Imported lazily so Biopython is needed only when no index exists.
            from Bio import SeqIO
            for record in SeqIO.parse(handle, 'fasta'):
                chr_lengths.append((record.id, len(record.seq)))
    else:
        critical('Can\'t find ' + seq_fpath + ' and ' + seq_fpath + '.fai')
    return chr_lengths
def check_dirs_and_files(cnf, file_keys=list(), dir_keys=list()):
    """Normalize and validate the paths stored in ``cnf`` under given keys.

    For every key of ``file_keys`` / ``dir_keys`` that is present in ``cnf``
    with a non-empty value, the value is replaced by ``adjust_path(value)``
    and checked with ``verify_file`` / ``verify_dir``. File keys whose name
    contains ``bam`` or ``bed`` are additionally checked with ``verify_bam``
    / ``verify_bed``.

    Returns the list of error messages (empty when everything is fine).
    """
    def _input_file_is_valid(_key):
        cnf[_key] = adjust_path(cnf[_key])
        fpath = cnf[_key]
        if not verify_file(fpath, _key):
            return False
        if 'bam' in _key and not verify_bam(fpath):
            return False
        if 'bed' in _key and not verify_bed(fpath):
            return False
        return True

    problems = []

    for key in file_keys:
        if not (key and key in cnf and cnf[key]):
            continue
        if _input_file_is_valid(key):
            cnf[key] = adjust_path(cnf[key])
        else:
            problems.append(
                'File ' + cnf[key] + ' is empty or cannot be found')

    for key in dir_keys:
        if not (key and key in cnf and cnf[key]):
            continue
        cnf[key] = adjust_path(cnf[key])
        if verify_dir(cnf[key], key):
            cnf[key] = adjust_path(cnf[key])
        else:
            problems.append(
                'Directory ' + cnf[key] + ' is empty or cannot be found')

    return problems
def main():
    """Run the FastQC preprocessing script on a remote server over SSH.

    Parses the read paths, sample name and output directory from the command
    line, rewrites the local ``scripts/pre/fastqc.py`` command line for the
    remote installation, executes it through ``ssh.exec_command`` and relays
    the remote output (stderr is merged into stdout with ``2>&1``).

    Exits with status 1 after printing the usage when ``-1``, ``-2`` or
    ``-o`` is missing.
    """
    info(' '.join(sys.argv))
    info()

    description = 'This script runs preprocessing.'

    parser = OptionParser(description=description)
    parser.add_option('-1', dest='left_reads_fpath', help='Left reads fpath')
    parser.add_option('-2', dest='right_reads_fpath', help='Right reads fpath')
    parser.add_option('--sample', dest='sample_name', help='Sample name')
    parser.add_option('-o', dest='output_dir', help='Output directory path')
    parser.add_option('--downsample-to', dest='downsample_to', default=None, type='int',
        help='Downsample reads to avoid excessive processing times with large files. '
            'Default is 1 million. Set to 0 to turn off downsampling.')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    (opts, args) = parser.parse_args()

    if not opts.left_reads_fpath or not opts.right_reads_fpath or not opts.output_dir:
        parser.print_usage()
        # Without all three options nothing below can work; stop here
        # instead of continuing with None paths.
        sys.exit(1)

    verify_file(opts.left_reads_fpath, is_critical=False)
    left_reads_fpath = adjust_path(opts.left_reads_fpath)
    verify_file(opts.right_reads_fpath, is_critical=False)
    right_reads_fpath = adjust_path(opts.right_reads_fpath)
    output_dirpath = adjust_path(opts.output_dir)
    verify_dir(dirname(output_dirpath), description='output_dir', is_critical=True)

    left_reads_fpath, right_reads_fpath, output_dirpath =\
        map(_proc_path, [left_reads_fpath, right_reads_fpath, output_dirpath])

    ssh = connect_to_server(server_url='blue.usbod.astrazeneca.net', username='******', password='******')
    try:
        # Build the local command line, then point it at the remote install.
        fastqc_py = get_script_cmdline(None, 'python', 'scripts/pre/fastqc.py')
        fastqc_py = fastqc_py.replace(REPORTING_SUITE_PATH_CLARITY, REPORTING_SUITE_PATH_WALTHAM)
        fastqc_py = fastqc_py.replace(PYTHON_PATH_CLARITY, PYTHON_PATH_WALTHAM)

        # NOTE(review): values are interpolated into a shell command without
        # quoting; paths with spaces or shell metacharacters will break it.
        cmdl = '{fastqc_py} -1 {left_reads_fpath} -2 {right_reads_fpath} -o {output_dirpath}'
        if opts.sample_name:
            cmdl += ' --sample {opts.sample_name}'
        # "is not None" rather than truthiness: 0 is a meaningful value
        # (it turns downsampling off) and must be forwarded.
        if opts.downsample_to is not None:
            cmdl += ' --downsample-to ' + str(int(opts.downsample_to))
        cmdl = cmdl.format(
            fastqc_py=fastqc_py,
            left_reads_fpath=left_reads_fpath,
            right_reads_fpath=right_reads_fpath,
            output_dirpath=output_dirpath,
            opts=opts)
        cmdl += ' 2>&1'
        info(cmdl)
        stdin, stdout, stderr = ssh.exec_command(cmdl)
        for l in stdout:
            err(l, ending='')
        info()
    finally:
        # Close the connection even when the remote run fails.
        ssh.close()
Esempio n. 5
0
def print_genes(genes, output_fpath, canon_only):
    """Write gene, transcript and exon records of ``genes`` to a file.

    Transcripts are taken from every gene (only canonical ones when
    ``canon_only`` is set) and processed in ``get_key()`` order. For each
    transcript an optional gene record is emitted first (once per gene),
    then, if the transcript has exons, the transcript itself followed by
    its exons. Records are serialized with their ``__str__``.
    """
    selected = [tr for g in genes for tr in g.transcripts
                if not canon_only or tr.is_canonical]

    genes_written = set()

    def _needs_gene_feature(tr):
        # The gene feature is skipped for all miRNA because there are
        # multi-domain miRNA located in different places under the same
        # gene name.
        gene = tr.gene
        if not all(other.biotype == 'protein_coding'
                   for other in gene.transcripts
                   if (other.is_canonical or not canon_only)):
            return False
        if gene in genes_written:
            return False
        return (len(gene.canonical_transcripts) == 1
                or len(gene.transcripts) == 1)

    regions = []
    for tr in sorted(selected, key=lambda _tr: _tr.get_key()):
        if _needs_gene_feature(tr):
            regions.append(tr.gene)
            genes_written.add(tr.gene)
        if tr.exons:
            regions.append(tr)
            regions.extend(tr.exons)

    info('Writing ' + str(len(regions)) + ' regions')
    with open(adjust_path(output_fpath), 'w') as all_out:
        all_out.writelines(r.__str__() for r in regions)
def get_transcipts_with_exons_from_features(features_file, cur_chrom=None):
    """Group Exon/CDS/UTR records of a features file by transcript.

    The file is read as tab-separated text; lines starting with ``#`` are
    skipped. Columns used (0-based): 0 - chromosome, 1 - start, 2 - stop,
    6 - feature type, 8 - transcript id. When ``cur_chrom`` is given, only
    records of that chromosome are kept.

    Returns a ``defaultdict(list)`` keyed by ``(transcript_id, chrom)``
    whose values are lists of dicts with the keys ``transcript_id``,
    ``chrom``, ``start`` and ``stop``.
    """
    wanted_types = ['Exon', 'CDS', 'UTR']
    by_transcript = defaultdict(list)

    with open_gzipsafe(adjust_path(features_file)) as in_f:
        for line in in_f:
            if line.startswith('#'):
                continue

            fields = line.strip('\n').split('\t')
            chrom = fields[0]
            if cur_chrom and chrom != cur_chrom:
                continue
            if fields[6] not in wanted_types:
                continue

            start, stop = int(fields[1]), int(fields[2])
            transcript_id = fields[8]
            by_transcript[(transcript_id, chrom)].append(dict(
                transcript_id=transcript_id,
                chrom=chrom,
                start=start,
                stop=stop))

    return by_transcript
Esempio n. 7
0
def main():
    """Convert an Ensembl GTF into an exons BED written to stdout.

    Command line: ``Ensembl.gtf [HGNC_cBio_genes.tsv] ...``. Without any
    argument the usage is printed to stderr and the script exits with
    status 1. The HGNC synonyms file is optional: when it is omitted (or
    passed as the literal ``''``), gene names are not replaced with approved
    ones.
    """
    if len(sys.argv) < 2:
        sys.stderr.write('The script writes all CDS, stop codon, and ncRNA exon regions for all known Ensembl genes, with associated gene symbols.\n')
        sys.stderr.write('When the gene name is found in HGNC, it get replaced with an approved name.\n')
        sys.stderr.write('If the gene is not charactirized (like LOC729737), this symbol is just kept as is.\n')
        sys.stderr.write('\n')
        sys.stderr.write('Usage:\n')
        sys.stderr.write('    ' + __file__ + ' Ensembl.gtf [HGNC_cBio_genes.tsv] [additional_feature_list] > Exons.bed\n')
        sys.stderr.write('\n')
        sys.stderr.write('   where HGNC_gene_synonyms.txt (from http://www.genenames.org/cgi-bin/download) is:\n')
        sys.stderr.write('     #Approved Symbol  Previous Symbols                    Synonyms                          Chromosome   Ensembl Gene ID   UCSC ID(supplied by UCSC)\n')
        sys.stderr.write('     OR7E26P           OR7E67P, OR7E69P, OR7E70P, OR7E68P  OR1-51, OR1-72, OR1-73, OR912-95  19q13.43	    ENSG00000121410   uc002qsg.3\n')
        sys.stderr.write('     ...\n')
        sys.stderr.write('\n')
        sys.stderr.write('   feature_list is by default empty, but could be transcript\n')
        sys.stderr.write('\n')
        sys.stderr.write('   and UCSC_knownGene.txt (from http://genome.ucsc.edu/cgi-bin/hgTables) is:\n')
        sys.stderr.write('     #hg19.knownGene.name  hg19.knownGene.chrom  hg19.knownGene.strand  hg19.knownGene.txStart  hg19.knownGene.txEnd  hg19.knownGene.exonCount  hg19.knownGene.exonStarts  hg19.knownGene.exonEnds  hg19.kgXref.geneSymbol\n')
        sys.stderr.write('     uc001aaa.3	          chr1	                +	                   11873                   14409                 3                         11873,12612,13220,	      12227,12721,14409,	   DDX11L1\n')
        sys.stderr.write('     ...\n')
        # The next three lines used to lack the trailing newline, which glued
        # the example records together in the printed usage.
        sys.stderr.write('   or Ensembl.gtf (ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz)\n')
        sys.stderr.write('     1  pseudogene            gene        11869  14412  .  +  .  gene_id "ENSG00000223972"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene";\n')
        sys.stderr.write('     1  processed_transcript  transcript  11869  14409  .  +  .  gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana";\n')
        sys.stderr.write('     ...\n')
        sys.stderr.write('\n')
        sys.stderr.write('   Writes to Exons.bed\n')
        sys.stderr.write('\n')
        sys.stderr.write('See more info in http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Making+the+full+list+of+UCSC+exons+with+approved+HUGO+gene+symbols\n')
        sys.exit(1)

    # if is_local():
    #     sys.stderr.write('Local: will run only for chr21\n')
    #     sys.stderr.write('\n')

    input_fpath = adjust_path(sys.argv[1])

    # The synonyms file is optional, so argv[2] may be absent. The literal
    # '' (two quote characters) is also treated as "not provided"; it is
    # checked on the raw argument, before any path normalization.
    hgnc_arg = sys.argv[2] if len(sys.argv) > 2 else None
    hgnc_fpath = None
    if hgnc_arg and hgnc_arg != "''":
        hgnc_fpath = adjust_path(hgnc_arg)

    approved_gene_by_name = None
    if hgnc_fpath and hgnc_fpath != "''":
        sys.stderr.write('Synonyms file provided ' + hgnc_fpath + '\n')
        approved_gene_by_name = read_hgnc_genes(hgnc_fpath)
    else:
        sys.stderr.write('No synonyms file provided, skipping approving\n')

    out = sys.stdout
    with open(input_fpath) as inp:
        # The first line of the input is discarded before processing.
        inp.readline()
        _proc_ensembl(inp, out, approved_gene_by_name)
Esempio n. 8
0
def proc_args(argv):
    """Parse the Seq2C command line and prepare the run configuration.

    Positional arguments are either a single sample-to-BAM table (any single
    argument not ending with ``.bam``) or a list of BAM files. The output
    directory is created, the project name defaults to the output directory
    name, and one ``TargQC_Sample`` is built per input BAM.

    NOTE: ``argv`` is accepted for interface compatibility but is not used;
    ``parser.parse_args()`` reads ``sys.argv`` directly.

    Returns ``(cnf, samples, target_bed, output_dir)`` where ``target_bed``
    is ``None`` when no ``--bed`` was given.
    """
    info(' '.join(sys.argv))
    info()

    # The usage example must name the real option, "--controls"
    # (it used to be misspelled as "--contols").
    description = 'This script generates target QC reports for each BAM provided as an input. ' \
                  'Usage: ' + basename(__file__) + ' sample2bam.tsv --bed target.bed --controls sample1:sample2 -o results_dir'
    parser = OptionParser(description=description, usage=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('-o', dest='output_dir', metavar='DIR', default=join(os.getcwd(), 'seq2c'))
    parser.add_option('--bed', dest='bed', help='BED file to run Seq2C analysis')
    parser.add_option('-c', '--controls', dest='controls', help='Optional control sample names for Seq2C. For multiple controls, separate them using :')
    parser.add_option('--seq2c-opts', dest='seq2c_opts', help='Options for the final lr2gene.pl script.')
    parser.add_option('--no-prep-bed', dest='prep_bed', help=SUPPRESS_HELP, action='store_false', default=True)

    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    if len(args) == 0:
        parser.print_usage()
        sys.exit(1)
    if len(args) == 1 and not args[0].endswith('.bam'):
        # A single non-BAM argument is a table mapping sample names to BAMs.
        sample_names, bam_fpaths = read_samples(verify_file(args[0], is_critical=True, description='Input sample2bam.tsv'))
        bam_by_sample = OrderedDict()
        for s, b in zip(sample_names, bam_fpaths):
            bam_by_sample[s] = b
    else:
        bam_by_sample = find_bams(args)

    # Without a BED file the run is configured as whole-genome.
    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)
    check_genome_resources(cnf)

    cnf.output_dir = adjust_path(cnf.output_dir)
    verify_dir(dirname(cnf.output_dir), is_critical=True)
    safe_mkdir(cnf.output_dir)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'Seq2C'
    set_up_dirs(cnf)

    samples = [
        source.TargQC_Sample(name=s_name, dirpath=join(cnf.output_dir, s_name), bam=bam_fpath)
            for s_name, bam_fpath in bam_by_sample.items()]
    info('Samples: ')
    for s in samples:
        info('  ' + s.name)
    samples.sort(key=lambda _s: _s.key_to_sort())

    target_bed = verify_bed(cnf.bed, is_critical=True) if cnf.bed else None

    if not cnf.only_summary:
        # The qsub runner is required unless only the summary is requested.
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner: critical('Error: qsub-runner is not provided is sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    return cnf, samples, target_bed, cnf.output_dir
Esempio n. 9
0
def get_bed_targqc_inputs(cnf, bed_fpath=None):
    """Resolve the BED, features and gene-list inputs for TargQC.

    ``bed_fpath``, when given, is validated with ``verify_bed`` (critical on
    failure). The features path comes from ``cnf.features`` or, as a
    fallback, ``cnf.genome.features``. The custom genes list is taken from
    ``cnf.genes`` when set.

    Returns ``(bed_fpath, features_bed_fpath, genes_fpath)``; ``genes_fpath``
    is ``None`` when no custom genes list is configured.
    """
    if bed_fpath:
        bed_fpath = verify_bed(
            bed_fpath, description='Input BED file', is_critical=True)
        info('Using amplicons/capture panel ' + bed_fpath)

    features_bed_fpath = adjust_path(cnf.features or cnf.genome.features)
    if features_bed_fpath:
        info('Features: ' + features_bed_fpath)

    if not cnf.genes:
        return bed_fpath, features_bed_fpath, None

    genes_fpath = adjust_path(cnf.genes)
    info('Custom genes list: ' + genes_fpath)
    return bed_fpath, features_bed_fpath, genes_fpath
 def _verify_input_file(_key):
     """Normalize ``cnf[_key]`` in place and check that the file is usable.

     ``cnf`` comes from the enclosing scope. Keys containing ``bam`` / ``bed``
     are additionally checked with ``verify_bam`` / ``verify_bed``.
     Returns True when every applicable check passes, False otherwise.
     """
     cnf[_key] = adjust_path(cnf[_key])
     fpath = cnf[_key]
     if not verify_file(fpath, _key):
         return False
     if 'bam' in _key and not verify_bam(fpath):
         return False
     return not ('bed' in _key and not verify_bed(fpath))
Esempio n. 11
0
def main():
    """Filter a vcf2txt result file and write the requested outputs.

    Paths come from ``get_args()``: the main output is always written; the
    all-transcripts, FM-format and rejected-mutations outputs are written
    only when the corresponding ``cnf`` attribute is set. The filtering
    itself is done by ``Filtration.do_filtering``.
    """
    cnf, vcf2txt_res_fpath = get_args()

    info('-' * 70)
    info('Writing to ' + cnf.output_file)
    if cnf.all_transcripts_output_file:
        info('Writing info for all transcripts to ' +
             cnf.all_transcripts_output_file)
    if cnf.fm_output_file:
        info('Writing in FM format to ' + cnf.fm_output_file)
    if cnf.rejected_output_file:
        info('Writing rejected mutations to ' + cnf.rejected_output_file)

    f = Filtration(cnf)

    # Every opened handle is registered here so that all of them, including
    # the rejected-mutations file, are closed even if filtering raises.
    handles = []

    def _open(fpath, mode='r'):
        handle = open(fpath, mode)
        handles.append(handle)
        return handle

    def _open_optional_output(fpath):
        return _open(adjust_path(fpath), 'w') if fpath else None

    try:
        input_f = _open(verify_file(vcf2txt_res_fpath))
        output_f = _open(adjust_path(cnf.output_file), 'w')
        rejected_output_f = _open_optional_output(cnf.rejected_output_file)
        fm_output_f = _open_optional_output(cnf.fm_output_file)
        all_transcripts_output_f = _open_optional_output(
            cnf.all_transcripts_output_file)

        info()
        info('-' * 70)
        info('Running filtering...')
        f.do_filtering(input_f, output_f, fm_output_f,
                       all_transcripts_output_f, rejected_output_f)
    finally:
        for handle in handles:
            handle.close()

    info()
    if cnf.rejected_output_file:
        info('Rejected mutations saved to ' + cnf.rejected_output_file)
    info('Saved to ' + cnf.output_file)
Esempio n. 12
0
def sort_bed(cnf, input_bed_fpath, output_bed_fpath=None):
    """Sort a BED file by reference chromosome order, then start, end.

    Chromosome order is taken from ``get_chr_lengths_from_seq`` for
    ``cnf.genome.seq``; chromosomes absent from the reference get order -1.
    Blank lines are dropped and ``#`` comment lines are copied to the top of
    the output in their original order. When ``cnf.reuse_intermediate`` is
    set and a valid output already exists, it is reused.

    Returns the path of the sorted BED file.
    """
    input_bed_fpath = verify_bed(input_bed_fpath)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(cnf, input_bed_fpath, 'sorted')

    class Region(SortableByChrom):
        """One BED record together with its reference chromosome order."""

        def __init__(self, chrom, start, end, other_fields, chrom_ref_order):
            SortableByChrom.__init__(self, chrom, chrom_ref_order)
            self.start = start
            self.end = end
            self.chrom_ref_order = chrom_ref_order
            self.other_fields = tuple(other_fields)

        def get_key(self):
            return (self.chrom_ref_order, self.start, self.end,
                    self.other_fields)

    regions = []
    chr_lengths = get_chr_lengths_from_seq(cnf.genome.seq)
    chr_order = dict(
        (name, idx) for idx, (name, _) in enumerate(chr_lengths))

    info('Sorting regions in ' + input_bed_fpath)
    can_reuse = (cnf.reuse_intermediate and isfile(output_bed_fpath)
                 and verify_bed(output_bed_fpath))
    if can_reuse:
        info(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    with open(input_bed_fpath) as bed_in, \
            file_transaction(cnf.work_dir, output_bed_fpath) as tx:
        with open(tx, 'w') as out:
            for raw_line in bed_in:
                stripped = raw_line.strip()
                if not stripped:
                    continue
                if stripped.startswith('#'):
                    # Comments go straight to the output, ahead of the
                    # sorted records.
                    out.write(raw_line)
                    continue

                fs = stripped.split('\t')
                chrom = fs[0]
                start, end = int(fs[1]), int(fs[2])
                regions.append(Region(
                    chrom, start, end, fs[3:], chr_order.get(chrom, -1)))

            for region in sorted(regions, key=lambda r: r.get_key()):
                columns = [region.chrom, str(region.start), str(region.end)]
                columns.extend(region.other_fields)
                out.write('\t'.join(columns) + '\n')

    info('Sorted ' + str(len(regions)) + ' regions, saved to ' +
         output_bed_fpath + '\n')
    return output_bed_fpath
def determine_run_cnf(opts, is_wgs=False, is_targetseq=False):
    """Pick the run configuration file and store it in ``opts.run_cnf``.

    An explicit ``opts.run_cnf`` wins (after path normalization); otherwise
    a default is chosen from ``defaults``: WGS, then deep-seq when
    ``is_targetseq`` is set, then exome. The chosen file is verified
    (critical on failure) and its path is returned.
    """
    if opts.run_cnf:
        chosen = adjust_path(opts.run_cnf)
    elif is_wgs:
        chosen = defaults['run_cnf_wgs']
    elif is_targetseq:
        chosen = defaults['run_cnf_deep_seq']
    else:
        chosen = defaults['run_cnf_exome_seq']
    opts.run_cnf = chosen

    verify_file(opts.run_cnf, is_critical=True)
    debug('Using run configuration ' + opts.run_cnf)
    return opts.run_cnf
Esempio n. 14
0
def main():
    """Entry point: run FastQC preprocessing for one pair of read files.

    Reads the left/right read paths, the sample name and the output
    directory from the command line, runs ``run_fastq`` inside the work
    directory context and reports where the results were saved.
    """
    info(' '.join(sys.argv))
    info()

    parser = OptionParser(description='This script runs preprocessing.')
    parser.add_option('-1', dest='left_reads_fpath', help='Left reads fpath')
    parser.add_option('-2', dest='right_reads_fpath', help='Right reads fpath')
    parser.add_option('--sample', dest='sample_name', help='Sample name')
    parser.add_option('-o', dest='output_dir', help='Output directory path')
    downsample_help = (
        'Downsample reads to avoid excessive processing times with large files. '
        'Default is 1 million. Set to 0 to turn off downsampling.')
    parser.add_option('--downsample-to', dest='downsample_to', default=None,
                      type='int', help=downsample_help)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    opts, args = parser.parse_args()
    logger.is_debug = opts.debug

    cnf = Config(opts.__dict__, determine_sys_cnf(opts),
                 determine_run_cnf(opts))

    left_reads_fpath = verify_file(opts.left_reads_fpath, is_critical=True)
    right_reads_fpath = verify_file(opts.right_reads_fpath, is_critical=True)

    if opts.output_dir:
        output_dirpath = adjust_path(opts.output_dir)
    else:
        output_dirpath = critical('Please, specify output directory with -o')
    verify_dir(dirname(output_dirpath), description='output_dir',
               is_critical=True)

    with workdir(cnf):
        # Derive the sample name from the read file names when not given.
        sample_name = cnf.sample_name or _get_sample_name(
            left_reads_fpath, right_reads_fpath)
        results_dirpath = run_fastq(
            cnf, sample_name, left_reads_fpath, right_reads_fpath,
            output_dirpath, downsample_to=cnf.downsample_to)

    verify_dir(results_dirpath, is_critical=True)
    info()
    info('*' * 70)
    info('Fastqc results:')
    info('  ' + results_dirpath)
Esempio n. 15
0
def verify_bed(fpath, description='', is_critical=False, silent=False):
    """Check that ``fpath`` is an existing, well-formed BED file.

    The file is first checked with ``verify_file``, then its format with
    ``BedFile(...).checkformat()``. A format error is reported through
    ``critical`` when ``is_critical`` is set, otherwise through ``err``.

    Returns the normalized path on success, ``None`` otherwise.
    """
    file_ok = verify_file(
        fpath, description, is_critical=is_critical, silent=silent)
    if not file_ok:
        return None

    fpath = adjust_path(fpath)

    error = BedFile(fpath).checkformat()
    if not error:
        return fpath

    report = critical if is_critical else err
    report('Error: incorrect bed file format (' + fpath + '): ' +
           str(error) + '\n')
    return None
def main():
    """Convert a sambamba depth report into seq2cov output for one sample.

    Expects exactly three command-line arguments: the sambamba depth report,
    the sample name and the BED column number. Any other argument count
    exits with the usage message (the previous ``< 2`` guard let a
    two-argument call through to a failing three-way unpack).
    """
    args = sys.argv[1:]

    if len(args) != 3:
        sys.exit('Usage: ' + __file__ + ' sambamba_depth_report sample_name bed_col_num')

    bedcov_hist_fpath, sample_name, bed_col_num = args

    amplicons = summarize_bedcoverage_hist_stats(adjust_path(bedcov_hist_fpath), sample_name, int(bed_col_num))

    amplicons = sorted(amplicons, key=lambda a: (a.chrom, a.gene_name, a.start))

    for r in amplicons:
        r.calc_avg_depth()

    save_regions_to_seq2cov_output__nocnf(sample_name, amplicons)
Esempio n. 17
0
def main(args):
    """Lift over every ``.bed`` file under a directory tree to another build.

    ``args`` is ``[InputRootDirectory, OutputRootDirectory, [Build]]``; the
    build defaults to ``hg38`` and selects the chain file from ``chains``.
    The directory layout under the input root is mirrored under the output
    root. Records that could not be lifted are kept in ``<name>.unlifted``
    next to the output; an empty unlifted file is removed.
    """
    if len(args) < 2:
        critical('Usage: ' + __file__ +
                 ' InputRootDirectory OutputRootDirectory [Build=hg38]')
        sys.exit(1)

    inp_root = adjust_path(args[0])
    out_root = adjust_path(args[1])

    build = args[2] if len(args) >= 3 else 'hg38'
    chain_fpath = chains[build.lower()]

    for inp_dirpath, subdirs, files in os.walk(inp_root):
        for fname in files:
            if not fname.endswith('.bed'):
                continue

            inp_fpath = adjust_path(join(inp_dirpath, fname))
            # Single-argument print() behaves the same under Python 2 and 3.
            print(inp_fpath + ': ' + str(count_bed_cols(inp_fpath)) +
                  ' columns')

            out_dirpath = adjust_path(
                join(out_root, relpath(inp_dirpath, inp_root)))
            safe_mkdir(out_dirpath)
            out_fpath = adjust_path(join(out_dirpath, fname))
            unlifted_fpath = adjust_path(
                join(out_dirpath, fname + '.unlifted'))

            with open(inp_fpath) as f:
                fs = f.readline().split('\t')

            # If columns 7 and 8 of the first line are not both integers
            # (or are missing), only the first four columns are passed on.
            try:
                int(fs[6])
                int(fs[7])
            except (IndexError, ValueError):
                info('Cutting ' + inp_fpath)
                prepare_cmd = 'cut -f1,2,3,4 "{0}" > __cut; '.format(inp_fpath)
                liftover_input = '__cut'
            else:
                # No cut was made, so liftOver must read the input file
                # itself rather than a stale or missing "__cut".
                prepare_cmd = ''
                liftover_input = '"{0}"'.format(inp_fpath)

            # NOTE(review): the command is run through the shell; paths
            # containing double quotes would break it.
            cmdline = (
                prepare_cmd + liftover_fpath + ' ' + liftover_input + ' ' +
                '{chain} "{out}" "{unlifted}"'.format(
                    chain=chain_fpath, out=out_fpath,
                    unlifted=unlifted_fpath))
            info(cmdline)
            os.system(cmdline)
            verify_file(out_fpath)
            if isfile(unlifted_fpath):
                if getsize(unlifted_fpath) <= 0:
                    os.remove(unlifted_fpath)
                else:
                    err('Some records were unlifted and saved to ' +
                        unlifted_fpath)
Esempio n. 18
0
def main():
    """Entry point: sort a BED file by the reference chromosome order.

    Takes the input BED as the positional argument and requires ``-o`` for
    the output path; ``-g`` selects the genome. Without a positional
    argument the help is printed to stderr and the script exits with
    status 1.
    """
    usage = ('Usage: ' + basename(__file__) +
             ' -o Output_BED_file -g hg19 Input_BED_file')
    parser = OptionParser(usage=usage)
    parser.add_option('-o', '--output-bed', dest='output_fpath')
    parser.add_option('-g', '--genome', dest='genome')
    opts, args = parser.parse_args(sys.argv[1:])

    if not args:
        parser.print_help(file=sys.stderr)
        sys.exit(1)

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})
    check_genome_resources(cnf)

    if not cnf.output_fpath:
        critical(parser.usage)

    input_bed_fpath = verify_bed(args[0], is_critical=True)
    sort_bed(cnf, input_bed_fpath, adjust_path(cnf.output_fpath))
Esempio n. 19
0
def get_chr_len_fpath(cnf):
    """Return the path of ``chr_lengths.txt`` in the work dir, creating it.

    The file holds one ``chrom<TAB>length`` line per chromosome, obtained
    from ``get_chr_lengths_from_seq`` for ``cnf.genome.seq``. An existing
    file is reused when ``cnf.reuse_intermediate`` is set. When the genome
    has no ``seq`` configured, ``critical`` is called and ``None`` is
    returned.
    """
    chr_len_fpath = join(cnf.work_dir, 'chr_lengths.txt')

    if cnf.reuse_intermediate and file_exists(chr_len_fpath):
        info(chr_len_fpath + ' exists, reusing')
        return chr_len_fpath

    if not cnf.genome.seq:
        critical('There is no "seq" key in ' + cnf.sys_cnf + ' for "' +
                 cnf.genome.name + '" section')
        return None

    chr_lengths = get_chr_lengths_from_seq(adjust_path(cnf.genome.seq))

    with file_transaction(cnf.work_dir, chr_len_fpath) as tx:
        with open(tx, 'w') as handle:
            handle.writelines(
                name + '\t' + str(length) + '\n'
                for name, length in chr_lengths)

    return chr_len_fpath
def set_up_dirs(cnf, log_dir_name='log'):
    """Create the output, work and log directories and set up logging.

    The output directory is created only when ``cnf.output_dir`` is set.
    A ``cnf.log_dir`` equal to ``'-'`` disables the log directory (it is
    reset to ``None``); an empty one defaults to ``log_dir_name`` inside
    the work directory.
    """
    if cnf.output_dir:
        cnf.output_dir = adjust_path(cnf.output_dir)
        safe_mkdir(cnf.output_dir, 'output_dir')
        info('Saving into ' + cnf.output_dir)

    set_up_work_dir(cnf)

    log_dir_disabled = cnf.log_dir == '-'
    if not log_dir_disabled:
        if not cnf.log_dir:
            cnf.log_dir = join(cnf.work_dir, log_dir_name)
        safe_mkdir(cnf.log_dir)
        info('Created log dir ' + cnf.log_dir)
    else:
        cnf.log_dir = None

    set_up_log(cnf)
def _verify_sample_info(vcf_conf, vcf_header_samples):
    """Validate the per-sample sections of a VCF configuration.

    For every sample under ``vcf_conf['samples']`` the parent configuration
    is merged in with ``join_parent_conf``, the ``bam`` path (if any) is
    verified and then stored back normalized. Each configured sample name
    must also appear in ``vcf_header_samples`` (an iterable of sample names
    from the VCF header); otherwise ``critical`` is called.

    Returns the samples mapping, or an empty ``OrderedDict`` when the
    configuration has no samples. Exits with status 1 when a configured BAM
    file cannot be verified.
    """
    # Local import: this block cannot rely on a module-level ``import sys``.
    import sys

    # A missing key and an explicit ``samples: None`` are treated alike.
    sample_cnfs = vcf_conf.get('samples') or OrderedDict()

    for sample_conf in sample_cnfs.values():
        join_parent_conf(sample_conf, vcf_conf)

        bam = sample_conf.get('bam')
        if bam and not verify_file(bam, 'Bam file'):
            # Non-zero status: this is a failure, not a normal exit.
            sys.exit(1)
        sample_conf['bam'] = adjust_path(bam)

    # compare input sample names to vcf header
    for input_sample_name in sample_cnfs:
        if input_sample_name not in vcf_header_samples:
            # The header samples are a collection, so they are joined rather
            # than concatenated to the message (which raised TypeError).
            critical('ERROR: sample ' + input_sample_name +
                     ' is not in VCF header.\n'
                     'Available samples: ' + ', '.join(vcf_header_samples))
    return sample_cnfs
Esempio n. 22
0
def get_args():
    """Parse the command line: one input VCF path plus ``-o`` output path.

    ``critical`` is called when the positional argument or ``-o`` is
    missing.

    Returns ``(input_fpath, output_fpath)``: the result of ``verify_file``
    on the first positional argument and the normalized output path.
    """
    info(' '.join(sys.argv))
    info()

    parser = OptionParser()
    parser.add_option('-o', dest='output_file')
    parser.set_usage('Usage: ' + __file__ + ' dbsnp.vcf.gz -o output_fpath')

    opts, args = parser.parse_args()
    if not args:
        critical("Provide the first argument - path to dbsnp VCF")

    vcf2txt_res_fpath = verify_file(args[0])

    if not opts.output_file:
        critical('Please, specify the output fpath with -o')

    info()

    output_fpath = adjust_path(opts.output_file)
    return vcf2txt_res_fpath, output_fpath
def set_up_work_dir(cnf):
    """Decide on ``cnf.work_dir`` and make sure the directory exists.

    An explicit ``cnf.work_dir`` is normalized and used as is. Otherwise the
    work dir is ``work`` (or ``work_<sample>`` when ``cnf.sample`` is set)
    inside ``cnf.output_dir``; with no output dir either, a temporary
    directory is created.
    """
    if cnf.work_dir:
        cnf.work_dir = adjust_path(cnf.work_dir)
        info('Work dir: ' + cnf.work_dir)
    elif cnf.output_dir:
        sample_suffix = '_' + cnf.sample if cnf.sample else ''
        cnf.work_dir = join(cnf.output_dir, 'work' + sample_suffix)
        info('Work dir: ' + cnf.work_dir)
        # NOTE: an existing work dir is not removed here, even when
        # cnf.reuse_intermediate is off.
    else:
        cnf.work_dir = tempfile.mkdtemp()
        info('Creating temprorary directory for work dir: ' + cnf.work_dir)

    safe_mkdir(cnf.work_dir, 'working directory')
Esempio n. 24
0
def verify_bam(fpath, description='', is_critical=False, silent=False):
    """Check that ``fpath`` is an existing file that looks like a BAM.

    Beyond ``verify_file``, the file must have the ``.bam`` extension and
    its first three bytes must contain at least one non-text byte. Problems
    are reported through ``critical`` when ``is_critical`` is set, otherwise
    through ``err``.

    Returns the normalized path on success, ``None`` otherwise.
    """
    if not verify_file(
            fpath, description, is_critical=is_critical, silent=silent):
        return None

    fpath = adjust_path(fpath)

    logfn = critical if is_critical else err
    if not fpath.endswith('.bam'):
        logfn('The file ' + fpath +
              ' is supposed to be BAM but does not have the .bam '
              'extension. Please, make sure you pass proper file.')
        return None

    # Byte values regarded as text: a few control characters plus everything
    # from 0x20 up. bytes(bytearray(...)) yields a byte string on both
    # Python 2 and Python 3.
    textchars = bytes(bytearray(
        [7, 8, 9, 10, 12, 13, 27] + list(range(0x20, 0x100))))

    # Read the header in binary mode and close the handle right away
    # (it used to be opened in text mode and never closed).
    with open(fpath, 'rb') as bam_f:
        head = bam_f.read(3)

    # Whatever survives deleting the text bytes is binary content.
    if not head.translate(None, textchars):
        logfn('The BAM file ' + fpath + ' must be a binary file.')
        return None

    return fpath
Esempio n. 25
0
def sort_bed_by_alphabet(cnf,
                         input_bed_fpath,
                         output_bed_fpath=None,
                         chr_len_fpath=None):
    """Group BED records by chromosome, chromosomes in alphabetical order.

    Only records whose chromosome is listed by ``get_chr_lengths`` are kept;
    within a chromosome the input order is preserved. Blank lines are
    dropped and ``#`` comment lines are copied to the top of the output.

    Returns the path of the written BED file.
    """
    chr_lengths = get_chr_lengths(cnf, chr_len_fpath)
    known_chroms = set(name for (name, _) in chr_lengths)

    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = add_suffix(input_bed_fpath, 'sorted')

    # chromosome -> list of text chunks, each holding up to chunk_size lines
    chunks_by_chrom = defaultdict(list)

    info('Sorting regions...')
    chunk_size = 10
    lines_in_chunk = 0
    with open(input_bed_fpath) as bed_in, \
            file_transaction(cnf.work_dir, output_bed_fpath) as tx:
        with open(tx, 'w') as out:
            for line in bed_in:
                stripped = line.strip()
                if not stripped:
                    continue
                if stripped.startswith('#'):
                    out.write(line)
                    continue

                chrom = stripped.split('\t')[0]
                if chrom not in known_chroms:
                    continue

                chunks = chunks_by_chrom[chrom]
                # The line counter is shared by all chromosomes; a new chunk
                # is started when it is full or on a chromosome's first line.
                if lines_in_chunk == chunk_size or not chunks:
                    lines_in_chunk = 0
                    chunks.append('')
                chunks[-1] += line
                lines_in_chunk += 1

            for chrom_name in sorted(chunks_by_chrom.keys()):
                for chunk in chunks_by_chrom[chrom_name]:
                    out.write(chunk)

    return output_bed_fpath
def main():
    """Entry point: convert VarDict TXT results of bcbio projects to VCF.

    Command-line handling is delegated to ``process_post_bcbio_args``. A
    ``BCBioStructure`` is built for every project directory, and
    ``convert_vardict_txts_to_bcbio_vcfs`` is run for each sample whose
    phenotype is not ``normal``.
    """
    info(' '.join(sys.argv))
    info()

    parser = OptionParser(
        description='This script converts Vardict TXT file to VCF.',
        usage='Usage: ' + basename(__file__) +
        ' [-o Output_directory -c Var_caller_name] Project_directory')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('--log-dir', dest='log_dir', default='-')
    parser.add_option('-c', '--caller', dest='caller_name', default='vardict')
    parser.add_option('-o', dest='output_dir', help='Output directory.')

    (cnf, bcbio_project_dirpaths, bcbio_cnfs, final_dirpaths, tags,
     is_wgs_in_bcbio, is_rnaseq) = process_post_bcbio_args(parser)

    if not bcbio_project_dirpaths:
        parser.print_help(file=sys.stderr)
        sys.exit(1)

    bcbio_structures = [
        BCBioStructure(cnf, project_dirpath, project_cnf, final_dirpath)
        for project_dirpath, project_cnf, final_dirpath in zip(
            bcbio_project_dirpaths, bcbio_cnfs, final_dirpaths)]

    # Default the work dir to "work" under the output dir.
    cnf.work_dir = cnf.work_dir or adjust_path(join(cnf.output_dir, 'work'))
    safe_mkdir(cnf.work_dir)

    info('')
    info('*' * 70)
    for structure in bcbio_structures:
        for sample in structure.samples:
            if sample.phenotype == 'normal':
                continue
            convert_vardict_txts_to_bcbio_vcfs(cnf, structure, sample)
Esempio n. 27
0
def _read_args(args_list):
    options = [
        # (['-k', '--key-genes'], dict(
        #     dest='key_genes_fpath',
        #     help='list of key genes (they are at top priority when choosing one of multiple annotations)',
        #     default='/ngs/reference_data/genomes/Hsapiens/common/az_key_genes.300.txt')
        #  ),
        # (['-a', '--approved-genes'], dict(
        #     dest='approved_genes_fpath',
        #     help='list of HGNC approved genes (they are preferable when choosing one of multiple annotations)',
        #     default='/ngs/reference_data/genomes/Hsapiens/common/HGNC_gene_synonyms.txt')
        #  ),
        # (['-e', '--ensembl-bed'], dict(
        #     dest='ensembl_bed_fpath',
        #     help='reference BED file for annotation (Ensembl)')
        #  ),
        # (['-r', '--refseq-bed'], dict(
        #     dest='refseq_bed_fpath',
        #     help='reference BED file for annotation (RefSeq)')
        #  ),
        # (['-b', '--bedtools'], dict(
        #     dest='bedtools',
        #     help='path to bedtools',
        #     default='bedtools')
        #  ),
        (['-o', '--output-bed'], dict(
            dest='output_fpath')
         ),
        (['--debug'], dict(
            dest='debug',
            help='run in a debug more (verbose output, keeping of temporary files)',
            default=False,
            action='store_true')
         ),
        (['--output-hg'], dict(
            dest='output_hg',
            help='output chromosome names in hg-style (chrM, chr1, .., chr22, chrX, chrY)',
            default=False,
            action='store_true')
         ),
        (['--output-grch'], dict(
            dest='output_grch',
            help='output chromosome names in GRCh-style (1, .., 22, X, Y, MT)',
            default=False,
            action='store_true')
         ),
        (['-g', '--genome'], dict(
            dest='genome',
            default='hg19')
         ),
    ]

    parser = OptionParser(usage='usage: %prog [options] Input_BED_file -o Standardized_BED_file',
                          description='Scripts outputs a standardized version of input BED file. '
                                      'Standardized BED: 1) has 4 or 8 fields (for BEDs with primer info);'
                                      ' 2) has HGNC approved symbol in forth column if annotation is '
                                      'possible and not_a_gene_X otherwise;'
                                      ' 3) is sorted based on chromosome name -> start -> end;'
                                      ' 4) has no duplicated regions (regions with the same chromosome, start and end), '
                                      'the only exception is _CONTROL_ regions.')
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    (opts, args) = parser.parse_args(args_list)

    if len(args) != 1:
        parser.print_help(file=sys.stderr)
        sys.exit(1)

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})

    work_dirpath = tempfile.mkdtemp()
    info('Creating a temporary working directory ' + work_dirpath)
    if not exists(work_dirpath):
        os.mkdir(work_dirpath)

    input_bed_fpath = abspath(args[0])
    info('Input: ' + input_bed_fpath)

    output_bed_fpath = adjust_path(cnf.output_fpath)
    info('Writing to: ' + output_bed_fpath)

    # process configuration
    # for k, v in opts.__dict__.items():
    #     if k.endswith('fpath') and verify_file(v, is_critical=True):
    #         opts.__dict__[k] = verify_file(v, k)
    if cnf.output_grch and cnf.output_hg:
        info('you cannot specify --output-hg and --output-grch simultaneously!')
    # if not which(opts.bedtools):
    #     info('bedtools executable not found, please specify correct path (current is %s)! '
    #         'Did you forget to execute "module load bedtools"?' % opts.bedtools)

    # if opts.debug:
    #     info('Configuration: ')
    #     for k, v in opts.__dict__.items():
    #         info('\t' + k + ': ' + str(v))
    info()

    # opts.ensembl_bed_fpath = verify_file(opts.ensembl_bed_fpath or \
    #     ('/ngs/reference_data/genomes/Hsapiens/' + opts.genome + '/bed/Exons/Exons.with_genes.bed'))

    # opts.refseq_bed_fpath = verify_file(opts.refseq_bed_fpath or \
    #     ('/ngs/reference_data/genomes/Hsapiens/' + opts.genome + '/bed/Exons/RefSeq/RefSeq_CDS_miRNA.all_features.bed'))

    return input_bed_fpath, output_bed_fpath, work_dirpath, cnf
def _read_args(args_list):
    options = [
        # (['-k', '--key-genes'], dict(
        #     dest='key_genes_fpath',
        #     help='list of key genes (they are at top priority when choosing one of multiple annotations)',
        #     default='/ngs/reference_data/genomes/Hsapiens/common/az_key_genes.300.txt')
        #  ),
        # (['-a', '--approved-genes'], dict(
        #     dest='approved_genes_fpath',
        #     help='list of HGNC approved genes (they are preferable when choosing one of multiple annotations)',
        #     default='/ngs/reference_data/genomes/Hsapiens/common/HGNC_gene_synonyms.txt')
        #  ),
        # (['-e', '--ensembl-bed'], dict(
        #     dest='ensembl_bed_fpath',
        #     help='reference BED file for annotation (Ensembl)')
        #  ),
        # (['-r', '--refseq-bed'], dict(
        #     dest='refseq_bed_fpath',
        #     help='reference BED file for annotation (RefSeq)')
        #  ),
        # (['-b', '--bedtools'], dict(
        #     dest='bedtools',
        #     help='path to bedtools',
        #     default='bedtools')
        #  ),
        (['-o', '--output-bed'], dict(dest='output_fpath')),
        (['--debug'],
         dict(
             dest='debug',
             help=
             'run in a debug more (verbose output, keeping of temporary files)',
             default=False,
             action='store_true')),
        (['--output-hg'],
         dict(
             dest='output_hg',
             help=
             'output chromosome names in hg-style (chrM, chr1, .., chr22, chrX, chrY)',
             default=False,
             action='store_true')),
        (['--output-grch'],
         dict(
             dest='output_grch',
             help='output chromosome names in GRCh-style (1, .., 22, X, Y, MT)',
             default=False,
             action='store_true')),
        (['-g', '--genome'], dict(dest='genome', default='hg19')),
    ]

    parser = OptionParser(
        usage='usage: %prog [options] Input_BED_file -o Standardized_BED_file',
        description='Scripts outputs a standardized version of input BED file. '
        'Standardized BED: 1) has 4 or 8 fields (for BEDs with primer info);'
        ' 2) has HGNC approved symbol in forth column if annotation is '
        'possible and not_a_gene_X otherwise;'
        ' 3) is sorted based on chromosome name -> start -> end;'
        ' 4) has no duplicated regions (regions with the same chromosome, start and end), '
        'the only exception is _CONTROL_ regions.')
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    (opts, args) = parser.parse_args(args_list)

    if len(args) != 1:
        parser.print_help(file=sys.stderr)
        sys.exit(1)

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})

    work_dirpath = tempfile.mkdtemp()
    info('Creating a temporary working directory ' + work_dirpath)
    if not exists(work_dirpath):
        os.mkdir(work_dirpath)

    input_bed_fpath = abspath(args[0])
    info('Input: ' + input_bed_fpath)

    output_bed_fpath = adjust_path(cnf.output_fpath)
    info('Writing to: ' + output_bed_fpath)

    # process configuration
    # for k, v in opts.__dict__.iteritems():
    #     if k.endswith('fpath') and verify_file(v, is_critical=True):
    #         opts.__dict__[k] = verify_file(v, k)
    if cnf.output_grch and cnf.output_hg:
        info(
            'you cannot specify --output-hg and --output-grch simultaneously!')
    # if not which(opts.bedtools):
    #     info('bedtools executable not found, please specify correct path (current is %s)! '
    #         'Did you forget to execute "module load bedtools"?' % opts.bedtools)

    # if opts.debug:
    #     info('Configuration: ')
    #     for k, v in opts.__dict__.iteritems():
    #         info('\t' + k + ': ' + str(v))
    info()

    # opts.ensembl_bed_fpath = verify_file(opts.ensembl_bed_fpath or \
    #     ('/ngs/reference_data/genomes/Hsapiens/' + opts.genome + '/bed/Exons/Exons.with_genes.bed'))

    # opts.refseq_bed_fpath = verify_file(opts.refseq_bed_fpath or \
    #     ('/ngs/reference_data/genomes/Hsapiens/' + opts.genome + '/bed/Exons/RefSeq/RefSeq_CDS_miRNA.all_features.bed'))

    return input_bed_fpath, output_bed_fpath, work_dirpath, cnf
def _postprocess(input_fpath, annotated_fpaths, bed_params, output_bed_fpath,
                 cnf, chrom_order):
    '''
    Combines the input regions with their annotations and writes the result
    to output_bed_fpath.

    1. Sorts.
    2. Chooses appropriate number of columns (4 or 8 for BEDs with primers).
    3. Removes duplicates.

    Parameters (as used by the code below):
      input_fpath       tab-separated file: chrom, start, end, optional symbol,
                        optional further columns.
      annotated_fpaths  iterable of files in the same layout holding the
                        annotated versions of the regions; a symbol of '.'
                        is treated as "no annotation".
      bed_params        object providing GRCh_names, n_cols_needed, header
                        (lines written verbatim first) and controls (a list
                        of regions added to the input regions).
      output_bed_fpath  path of the file to write.
      cnf               configuration providing key_genes, hgnc, output_grch,
                        output_hg and debug.
      chrom_order       mapping queried with .get(chrom), passed to Region.

    Returns nothing; the result is written to the output file.
    '''
    info('postprocessing (sorting, cutting, removing duplicates)')

    # Key genes: one symbol per line.
    key_genes = []
    with open(adjust_path(cnf.key_genes), 'r') as f:
        for line in f:
            key_genes.append(line.strip())
    # Approved genes: first tab-separated column of the HGNC file (optional).
    approved_genes = []
    if cnf.hgnc:
        with open(adjust_path(cnf.hgnc), 'r') as f:
            f.readline()  # header
            for line in f:
                approved_genes.append(line.split('\t')[0])

    # Region is configured through class-level attributes shared by all
    # instances created below.
    Region.GRCh_names = bed_params.GRCh_names
    if cnf.output_grch:
        Region.GRCh_names = True
        if cnf.debug and not bed_params.GRCh_names:
            info('Changing chromosome names from hg-style to GRCh-style.')
    # Checked after output_grch, so output_hg wins when both are set.
    if cnf.output_hg:
        Region.GRCh_names = False
        if cnf.debug and bed_params.GRCh_names:
            info('Changing chromosome names from GRCh-style to hg-style.')
    Region.n_cols_needed = bed_params.n_cols_needed
    Region.key_genes = key_genes
    Region.approved_genes = approved_genes

    # NOTE(review): deduplication relies on Region's hashing/equality, which is
    # defined outside this block - presumably by coordinates; confirm.
    input_regions = set()  # we want only unique regions
    with open(adjust_path(input_fpath)) as f:
        for line in f:
            entries = line.strip().split('\t')
            chrom = entries[0]
            start = int(entries[1])
            end = int(entries[2])
            r = Region(chrom, chrom_order.get(chrom), start, end)
            # Without a 4th column, fall back to a "chrom:start-end" symbol.
            r.set_symbol(entries[3] if len(entries) > 3 else '{0}:{1}-{2}'.
                         format(chrom, start, end))
            r.rest = entries[4:] if len(entries) > 4 else None
            input_regions.add(r)

    # Annotated regions are kept in a list: the same region may appear several
    # times with different symbols.
    annotated_regions = []
    for annotated_fpath in annotated_fpaths:
        with open(adjust_path(annotated_fpath)) as f:
            for line in f:
                entries = line.strip().split('\t')
                chrom = entries[0]
                start = int(entries[1])
                end = int(entries[2])
                r = Region(chrom, chrom_order.get(chrom), start, end)
                r.set_symbol(entries[3] if len(entries) > 3 else '{0}:{1}-{2}'.
                             format(chrom, start, end))
                r.rest = entries[4:] if len(entries) > 4 else None
                annotated_regions.append(r)

    # starting to output result
    with open(adjust_path(output_bed_fpath), 'w') as f:
        for line in bed_params.header:
            f.write(line)

        # Pass 1: walk the sorted input regions (plus controls) in lockstep
        # with the sorted annotations; i is the next unconsumed annotation.
        annotated_regions.sort()
        i = 0
        prev_region = None
        not_a_gene_count = 0
        # Regions with a single candidate symbol that border a run of such
        # regions; used in pass 2 to resolve ambiguous neighbours.
        solid_regions = []
        prev_is_solid = False
        # Each item is either a Region or a list of candidate Regions.
        all_regions = []
        for cur_region in sorted(list(input_regions) + bed_params.controls):
            if not cur_region.is_control():
                # The next annotation must describe this very region.
                assert annotated_regions[i] == cur_region, str(
                    cur_region) + ' != ' + str(
                        annotated_regions[i]) + '(i=%d)' % i
                if annotated_regions[i].symbol != '.':
                    cur_region.set_symbol(annotated_regions[i].symbol)
                else:
                    # Unannotated region: consecutive unannotated regions on
                    # the same chromosome share one not_a_gene_N number.
                    if prev_region is None or \
                       prev_region.chrom != cur_region.chrom or not prev_region.symbol.startswith("not_a_gene"):
                        not_a_gene_count += 1
                    cur_region.set_symbol("not_a_gene_%d" % not_a_gene_count)
                i += 1
                ambiguous_regions = [cur_region]
                while i < len(annotated_regions) and annotated_regions[
                        i] == cur_region:  # processing duplicates
                    if annotated_regions[i].symbol != '.' and annotated_regions[
                            i].symbol != cur_region.symbol:
                        duplicate = copy.deepcopy(cur_region)
                        duplicate.set_symbol(annotated_regions[i].symbol)
                        # An approved symbol replaces a not approved one; a
                        # key symbol replaces any non-key one; symbols of
                        # equal type are kept as alternatives.
                        if duplicate.type == 'approved' and cur_region.type == 'not_approved':
                            cur_region = duplicate
                            ambiguous_regions = [cur_region]
                        elif annotated_regions[
                                i].type == 'key' and cur_region.type != 'key':
                            cur_region = duplicate
                            ambiguous_regions = [cur_region]
                            if cnf.debug:
                                info(
                                    'key gene priority over approved gene was used'
                                )
                        elif annotated_regions[i].type == cur_region.type:
                            ambiguous_regions.append(duplicate)
                    i += 1
                if len(ambiguous_regions) == 1:
                    # Solid region: record it only when it starts a new run.
                    if not prev_is_solid:
                        solid_regions.append(cur_region)
                    prev_is_solid = True
                    all_regions.append(cur_region)
                else:
                    # Ambiguous region: close the preceding solid run by
                    # recording the previously visited region.
                    if prev_is_solid:
                        solid_regions.append(prev_region)
                    prev_is_solid = False
                    all_regions.append(ambiguous_regions)
            else:
                # Control regions are passed through without annotation.
                all_regions.append(cur_region)
            prev_region = cur_region

        # outputting results
        # Pass 2: for ambiguous entries prefer the candidate whose symbol
        # matches the closest solid region before it, then the one after it
        # (same chromosome only); otherwise take the first candidate.
        cur_solid_id = -1
        for entry in all_regions:
            if isinstance(entry, list):  # list of ambiguous regions
                cur_region = entry[0]
                # Advance to the last solid region located before this entry.
                while cur_solid_id + 1 < len(
                        solid_regions) and cur_region > solid_regions[
                            cur_solid_id + 1]:
                    cur_solid_id += 1
                found = False
                if cur_solid_id >= 0 and cur_region > solid_regions[cur_solid_id] \
                        and cur_region.chrom == solid_regions[cur_solid_id].chrom:
                    prev_solid = solid_regions[cur_solid_id]
                    for cur_region in entry:
                        if cur_region.symbol == prev_solid.symbol:
                            found = True
                            if cnf.debug:
                                info(
                                    'gene name was chosen based on previous solid region'
                                )
                            break
                if not found and cur_solid_id + 1 < len(solid_regions) and cur_region < solid_regions[cur_solid_id + 1] \
                        and cur_region.chrom == solid_regions[cur_solid_id + 1].chrom:
                    next_solid = solid_regions[cur_solid_id + 1]
                    for cur_region in entry:
                        if cur_region.symbol == next_solid.symbol:
                            found = True
                            if cnf.debug:
                                info(
                                    'gene name was chosen based on next solid region'
                                )
                            break
                if not found:
                    cur_region = entry[0]
            else:
                cur_region = entry
            f.write(
                str(cur_region) + '\n'
            )  # automatically outputs correct number of columns and GRCh/hg names
Esempio n. 30
0
def main():
    """Write BED files with the regions of all genes found in an annotation DB.

    Command line: ``genome_name db_file output.bed [synonyms [not_approved]]``

    * ``genome_name``: ``hg19`` selects the hg19 reference and canonical
      transcripts, any other value selects hg38.
    * ``db_file``: Ensembl GTF (``.gtf``/``.gtf.gz``), RefSeq GFF3
      (``.gff3``/``.gff3.gz``) or, for any other extension, a UCSC-style table.
    * ``output.bed``: output path; canonical-only regions are written to the
      same path with the ``canon`` suffix added.
    * ``synonyms``: optional HGNC synonyms file used to approve gene names.
    * ``not_approved``: optional path where not approved gene names are saved.

    With fewer than three arguments the usage text is printed and the process
    exits with status 1.
    """
    if len(sys.argv) < 4:
        # The original usage text is hand-padded with spaces to 86 columns;
        # the padding is reproduced to keep the output unchanged.
        width = 86
        blank = ' ' * width
        more = '     ...'.ljust(width)
        usage_lines = (
            'The script writes all CDS, stop codon, and ncRNA exon regions for all known Ensembl genes, with associated gene symbols.',
            blank,
            'Usage:'.ljust(width),
            '    ' + __file__ +
            ' hg19 db.gtf output.bed [HGNC_gene_synonyms.txt=' + us_syn_path +
            '] [additional_feature_list]',
            blank,
            '     where HGNC_gene_synonyms.txt (from http://www.genenames.org/cgi-bin/download) is:',
            '     #Approved Symbol  Previous Symbols                    Synonyms                          Chromosome   Ensembl Gene ID   UCSC ID(supplied by UCSC)',
            '     OR7E26P           OR7E67P, OR7E69P, OR7E70P, OR7E68P  OR1-51, OR1-72, OR1-73, OR912-95  19q13.43\t    ENSG00000121410   uc002qsg.3',
            more,
            blank,
            '     or DB is Ensembl GTF ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz',
            '     1  pseudogene            gene        11869  14412  .  +  .  gene_id "ENSG00000223972"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene";',
            '     1  processed_transcript  transcript  11869  14409  .  +  .  gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana";',
            more,
            blank,
            '     or DB is RefSeq GTF ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/H_sapiens/GFF/ref_GRCh38.p2_top_level.gff3.gz',
            '     NC_000001.10    RefSeq          region       1       249250621       .       +       .       ID=id0;Name=1;Dbxref=taxon:9606;chromosome=1;gbkey=Src;genome=chromosome;mol_type=genomic DNA',
            '     NC_000001.10    BestRefSeq      gene         11874   14409           .       +       .       ID=gene0;Name=DDX11L1;Dbxref=GeneID:100287102,HGNC:37102;description=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;gbkey=Gene;gene=DDX11L1;part=1%2F1;pseudo=true',
            '     NC_000001.10    BestRefSeq      transcript   11874   14409           .       +       .       ID=rna0;Name=NR_046018.2;Parent=gene0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2',
            '     NC_000001.10    BestRefSeq      exon         11874   12227           .       +       .       ID=id1;Parent=rna0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2',
            more,
            blank,
            '     or either RefSeq_knownGene.txt or UCSC_knownGene.txt (from http://genome.ucsc.edu/cgi-bin/hgTables) is:',
            '     #hg19.knownGene.name  hg19.knownGene.chrom  hg19.knownGene.strand  hg19.knownGene.txStart  hg19.knownGene.txEnd  hg19.knownGene.exonCount  hg19.knownGene.exonStarts  hg19.knownGene.exonEnds  hg19.kgXref.geneSymbol',
            '     uc001aaa.3\t         chr1\t               +\t                  11873                   14409                 3                         11873,12612,13220,\t      12227,12721,14409,\t   DDX11L1',
            more,
            blank,
            '     Writes to Exons.bed'.ljust(width),
            blank,
            'See more info in http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Making+the+full+list+of+UCSC+exons+with+approved+HUGO+gene+symbols',
        )
        for usage_line in usage_lines:
            info(usage_line)
        sys.exit(1)

    genome_name = sys.argv[1]
    is_hg19 = genome_name == 'hg19'
    seq_fpath = hg19_seq_fpath if is_hg19 else hg38_seq_fpath
    canonical_transcripts_fpath = (canonical_hg19_transcripts_fpath
                                   if is_hg19 else
                                   canonical_hg38_transcripts_fpath)
    chr_lengths = get_chr_lengths_from_seq(seq_fpath)
    # Chromosome name -> position in the reference, used for sorting.
    chr_order = {chrom: idx for idx, (chrom, _) in enumerate(chr_lengths)}

    input_fpath = verify_file(sys.argv[2])
    output_fpath = adjust_path(sys.argv[3])

    synonyms_fpath = None
    if len(sys.argv) > 4:
        synonyms_fpath = verify_file(sys.argv[4])
        info('Synonyms file provided ' + synonyms_fpath + '')
    else:
        info('No synonyms file provided, skipping approving')

    not_approved_fpath = None
    if len(sys.argv) > 5:
        not_approved_fpath = adjust_path(sys.argv[5])

    # Transcript IDs are stored without the version suffix (text after '.').
    with open(verify_file(canonical_transcripts_fpath)) as f:
        canonical_transcripts_ids = set(
            line.strip().split('.')[0] for line in f)

    info('Reading the features...')
    with open_gzipsafe(input_fpath) as inp:
        # The first line is consumed before parsing, exactly as before
        # (presumably a header line - confirm against the parsers).
        inp.readline()
        # The format is determined by the INPUT file. The output is a BED
        # file, so checking its extension (as was done previously) always
        # selected the UCSC parser.
        if input_fpath.endswith(('.gtf', '.gtf.gz')):
            gene_by_name_and_chrom = _proc_ensembl_gtf(inp, output_fpath,
                                                       chr_order)
        elif input_fpath.endswith(('.gff3', '.gff3.gz')):
            gene_by_name_and_chrom = _proc_refseq_gff3(inp, output_fpath,
                                                       chr_order)
        else:
            gene_by_name_and_chrom = _proc_ucsc(inp, output_fpath, chr_order)

    if synonyms_fpath and synonyms_fpath != "''":
        gene_by_name_and_chrom, not_approved_gene_names = _approve(
            gene_by_name_and_chrom, synonyms_fpath)

        info('')
        info('Not approved by HGNC - ' + str(len(not_approved_gene_names)) +
             ' genes.')
        if not_approved_fpath:
            with open(not_approved_fpath, 'w') as f:
                f.write('#Searched as\tStatus\n')
                f.writelines(name + '\n' for name in not_approved_gene_names)
            info('Saved not approved to ' + not_approved_fpath)

    info('Found:')
    info('  ' + str(len(gene_by_name_and_chrom)) + ' genes')

    # Materialize the values: the collection is iterated several times and
    # handed to helpers, which a dict view (Python 3) may not support.
    genes = list(gene_by_name_and_chrom.values())

    coding_and_mirna_genes = [
        g for g in genes
        if any(t.biotype in ['protein_coding', 'miRNA'] for t in g.transcripts)
    ]

    coding_genes = [
        g for g in coding_and_mirna_genes
        if any(t.biotype == 'protein_coding' for t in g.transcripts)
    ]
    coding_transcripts = [
        t for g in coding_and_mirna_genes for t in g.transcripts
        if t.biotype == 'protein_coding'
    ]
    mirna_genes = [
        g for g in coding_and_mirna_genes
        if any(t.biotype == 'miRNA' for t in g.transcripts)
    ]
    mirna_transcripts = [
        t for g in coding_and_mirna_genes for t in g.transcripts
        if t.biotype == 'miRNA'
    ]
    codingmiRNA_genes = [
        g for g in coding_and_mirna_genes
        if any(t.biotype == 'miRNA' for t in g.transcripts)
        and any(t.biotype == 'protein_coding' for t in g.transcripts)
    ]
    info('  ' + str(len(coding_genes)) + ' coding genes')
    info('  ' + str(len(coding_transcripts)) + ' coding transcripts')
    info('  ' + str(len(mirna_genes)) + ' miRNA genes')
    info('  ' + str(len(mirna_transcripts)) + ' miRNA transcripts')
    info('  ' + str(len(codingmiRNA_genes)) +
         ' genes with both coding and miRNA transcripts')

    info()
    info('Choosing canonical...')
    canon_genes = choose_canonical(genes, canonical_transcripts_ids)

    info()
    info('Sorting and printing all regions...')
    print_genes(genes, output_fpath, canon_only=False)

    info()
    info('Sorting and printing canonical regions...')
    canon_output_fpath = add_suffix(output_fpath, 'canon')
    print_genes(canon_genes, canon_output_fpath, canon_only=True)

    info()
    info('Saved all regions to\n   ' + output_fpath + '\n   ' +
         canon_output_fpath)
Esempio n. 31
0
def _postprocess(input_fpath, annotated_fpaths, bed_params, output_bed_fpath, cnf, chrom_order):
    '''
    Combines the input regions with their annotations and writes the result
    to output_bed_fpath.

    1. Sorts.
    2. Chooses appropriate number of columns (4 or 8 for BEDs with primers).
    3. Removes duplicates.

    Parameters (as used by the code below):
      input_fpath       tab-separated file: chrom, start, end, optional symbol,
                        optional further columns.
      annotated_fpaths  iterable of files in the same layout holding the
                        annotated versions of the regions; a symbol of '.'
                        is treated as "no annotation".
      bed_params        object providing GRCh_names, n_cols_needed, header
                        (lines written verbatim first) and controls (a list
                        of regions added to the input regions).
      output_bed_fpath  path of the file to write.
      cnf               configuration providing key_genes, hgnc, output_grch,
                        output_hg and debug.
      chrom_order       mapping queried with .get(chrom), passed to Region.

    Returns nothing; the result is written to the output file.
    '''
    info('postprocessing (sorting, cutting, removing duplicates)')

    # Key genes: one symbol per line.
    key_genes = []
    with open(adjust_path(cnf.key_genes), 'r') as f:
        for line in f:
            key_genes.append(line.strip())
    # Approved genes: first tab-separated column of the HGNC file (optional).
    approved_genes = []
    if cnf.hgnc:
        with open(adjust_path(cnf.hgnc), 'r') as f:
            f.readline()  # header
            for line in f:
                approved_genes.append(line.split('\t')[0])

    # Region is configured through class-level attributes shared by all
    # instances created below.
    Region.GRCh_names = bed_params.GRCh_names
    if cnf.output_grch:
        Region.GRCh_names = True
        if cnf.debug and not bed_params.GRCh_names:
            info('Changing chromosome names from hg-style to GRCh-style.')
    # Checked after output_grch, so output_hg wins when both are set.
    if cnf.output_hg:
        Region.GRCh_names = False
        if cnf.debug and bed_params.GRCh_names:
            info('Changing chromosome names from GRCh-style to hg-style.')
    Region.n_cols_needed = bed_params.n_cols_needed
    Region.key_genes = key_genes
    Region.approved_genes = approved_genes

    # NOTE(review): deduplication relies on Region's hashing/equality, which is
    # defined outside this block - presumably by coordinates; confirm.
    input_regions = set()  # we want only unique regions
    with open(adjust_path(input_fpath)) as f:
        for line in f:
            entries = line.strip().split('\t')
            chrom = entries[0]
            start = int(entries[1])
            end = int(entries[2])
            r = Region(chrom, chrom_order.get(chrom), start, end)
            # Without a 4th column, fall back to a "chrom:start-end" symbol.
            r.set_symbol(entries[3] if len(entries) > 3 else '{0}:{1}-{2}'.format(chrom, start, end))
            r.rest = entries[4:] if len(entries) > 4 else None
            input_regions.add(r)

    # Annotated regions are kept in a list: the same region may appear several
    # times with different symbols.
    annotated_regions = []
    for annotated_fpath in annotated_fpaths:
        with open(adjust_path(annotated_fpath)) as f:
            for line in f:
                entries = line.strip().split('\t')
                chrom = entries[0]
                start = int(entries[1])
                end = int(entries[2])
                r = Region(chrom, chrom_order.get(chrom), start, end)
                r.set_symbol(entries[3] if len(entries) > 3 else '{0}:{1}-{2}'.format(chrom, start, end))
                r.rest = entries[4:] if len(entries) > 4 else None
                annotated_regions.append(r)

    # starting to output result
    with open(adjust_path(output_bed_fpath), 'w') as f:
        for line in bed_params.header:
            f.write(line)

        # Pass 1: walk the sorted input regions (plus controls) in lockstep
        # with the sorted annotations; i is the next unconsumed annotation.
        annotated_regions.sort()
        i = 0
        prev_region = None
        not_a_gene_count = 0
        # Regions with a single candidate symbol that border a run of such
        # regions; used in pass 2 to resolve ambiguous neighbours.
        solid_regions = []
        prev_is_solid = False
        # Each item is either a Region or a list of candidate Regions.
        all_regions = []
        for cur_region in sorted(list(input_regions) + bed_params.controls):
            if not cur_region.is_control():
                # The next annotation must describe this very region.
                assert annotated_regions[i] == cur_region, str(cur_region) + ' != ' + str(annotated_regions[i]) + '(i=%d)' % i
                if annotated_regions[i].symbol != '.':
                    cur_region.set_symbol(annotated_regions[i].symbol)
                else:
                    # Unannotated region: consecutive unannotated regions on
                    # the same chromosome share one not_a_gene_N number.
                    if prev_region is None or \
                       prev_region.chrom != cur_region.chrom or not prev_region.symbol.startswith("not_a_gene"):
                        not_a_gene_count += 1
                    cur_region.set_symbol("not_a_gene_%d" % not_a_gene_count)
                i += 1
                ambiguous_regions = [cur_region]
                while i < len(annotated_regions) and annotated_regions[i] == cur_region:  # processing duplicates
                    if annotated_regions[i].symbol != '.' and annotated_regions[i].symbol != cur_region.symbol:
                        duplicate = copy.deepcopy(cur_region)
                        duplicate.set_symbol(annotated_regions[i].symbol)
                        # An approved symbol replaces a not approved one; a
                        # key symbol replaces any non-key one; symbols of
                        # equal type are kept as alternatives.
                        if duplicate.type == 'approved' and cur_region.type == 'not_approved':
                            cur_region = duplicate
                            ambiguous_regions = [cur_region]
                        elif annotated_regions[i].type == 'key' and cur_region.type != 'key':
                            cur_region = duplicate
                            ambiguous_regions = [cur_region]
                            if cnf.debug:
                                info('key gene priority over approved gene was used')
                        elif annotated_regions[i].type == cur_region.type:
                            ambiguous_regions.append(duplicate)
                    i += 1
                if len(ambiguous_regions) == 1:
                    # Solid region: record it only when it starts a new run.
                    if not prev_is_solid:
                        solid_regions.append(cur_region)
                    prev_is_solid = True
                    all_regions.append(cur_region)
                else:
                    # Ambiguous region: close the preceding solid run by
                    # recording the previously visited region.
                    if prev_is_solid:
                        solid_regions.append(prev_region)
                    prev_is_solid = False
                    all_regions.append(ambiguous_regions)
            else:
                # Control regions are passed through without annotation.
                all_regions.append(cur_region)
            prev_region = cur_region

        # outputting results
        # Pass 2: for ambiguous entries prefer the candidate whose symbol
        # matches the closest solid region before it, then the one after it
        # (same chromosome only); otherwise take the first candidate.
        cur_solid_id = -1
        for entry in all_regions:
            if isinstance(entry, list):  # list of ambiguous regions
                cur_region = entry[0]
                # Advance to the last solid region located before this entry.
                while cur_solid_id + 1 < len(solid_regions) and cur_region > solid_regions[cur_solid_id + 1]:
                    cur_solid_id += 1
                found = False
                if cur_solid_id >= 0 and cur_region > solid_regions[cur_solid_id] \
                        and cur_region.chrom == solid_regions[cur_solid_id].chrom:
                    prev_solid = solid_regions[cur_solid_id]
                    for cur_region in entry:
                        if cur_region.symbol == prev_solid.symbol:
                            found = True
                            if cnf.debug:
                                info('gene name was chosen based on previous solid region')
                            break
                if not found and cur_solid_id + 1 < len(solid_regions) and cur_region < solid_regions[cur_solid_id + 1] \
                        and cur_region.chrom == solid_regions[cur_solid_id + 1].chrom:
                    next_solid = solid_regions[cur_solid_id + 1]
                    for cur_region in entry:
                        if cur_region.symbol == next_solid.symbol:
                            found = True
                            if cnf.debug:
                                info('gene name was chosen based on next solid region')
                            break
                if not found:
                    cur_region = entry[0]
            else:
                cur_region = entry
            f.write(str(cur_region) + '\n')  # automatically outputs correct number of columns and GRCh/hg names
Esempio n. 32
0
def proc_opts():
    """Parse command-line options and FASTQ paths, and build the run config.

    Positional arguments are treated as FASTQ file paths; each one is passed
    through ``verify_file`` and only the paths it returns as truthy are kept.
    The function then builds a ``Config`` from the parsed options plus the
    system and run configurations, resolves the output directory, chooses and
    creates the working and log directories, sets up logging, and checks the
    genome and system resources.

    Returns:
        A ``(cnf, output_dir, fastq_fpaths)`` tuple, where ``output_dir`` is
        ``cnf.output_dir`` after ``adjust_path`` and ``fastq_fpaths`` is the
        list of verified FASTQ paths.

    NOTE(review): ``critical`` is presumably fatal (the code after the usage
    check assumes at least one argument) -- confirm against its definition.
    """
    opt_parser = OptionParser()
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(opt_parser)
    opt_parser.add_option('--expose-only',
                          action='store_true',
                          default=False,
                          dest='expose_to_ngs_server_only',
                          help='Only add project to the webserver')
    opt_parser.add_option('--no-expose',
                          action='store_false',
                          default=True,
                          dest='expose',
                          help='Do not expose the reports')
    opt_parser.add_option('-o', dest='output_dir')
    opt_parser.add_option('--bed',
                          dest='bed',
                          help='BED file to run targetSeq and Seq2C analysis on.')
    opt_parser.add_option('--downsample-to', type='int', dest='downsample_to')

    opts, args = opt_parser.parse_args()
    logger.is_debug = opts.debug

    # At least one positional FASTQ path is required.
    if not args:
        critical('Usage: ' + __file__ + ' *.fq.gz -o output_dir')

    # Verify every path in command-line order; keep only the ones that pass.
    verified_paths = (verify_file(raw_path) for raw_path in args)
    fastq_fpaths = [checked for checked in verified_paths if checked]
    info(str(len(fastq_fpaths)) + ' fastq files')

    run_cnf = determine_run_cnf(opts)
    sys_cnf = determine_sys_cnf(opts)
    cnf = Config(opts.__dict__, sys_cnf, run_cnf)

    cnf.output_dir = adjust_path(cnf.output_dir)
    info('Writing to ' + str(cnf.output_dir))

    if not cnf.project_name:
        cnf.project_name = 'preproc'

    if not cnf.work_dir:
        # No explicit work dir: keep all runs under <output_dir>/work and
        # maintain a "latest" entry there.
        work_root = join(cnf.output_dir, 'work')
        safe_mkdir(work_root)
        latest_link = join(work_root, 'latest')

        if not cnf.reuse_intermediate:
            # Fresh run: a new timestamped directory, with "latest"
            # re-pointed to it via a relative symlink.
            stamp = datetime.datetime.now().strftime("%Y-%b-%d_%H-%M")
            cnf.work_dir = join(work_root, stamp)
            if islink(latest_link):
                os.remove(latest_link)
            if isdir(latest_link):
                shutil.rmtree(latest_link)
            if not exists(latest_link):
                os.symlink(basename(cnf.work_dir), latest_link)
        else:
            # Reuse whatever "latest" currently refers to.
            cnf.work_dir = latest_link
    else:
        # An explicitly supplied work dir switches debug mode on.
        cnf.debug = True

    cnf.work_dir = adjust_path(cnf.work_dir)
    safe_mkdir(cnf.work_dir)
    cnf.log_dir = join(cnf.work_dir, 'log')
    safe_mkdir(cnf.log_dir)
    set_up_log(cnf)

    # Best effort: make the work dir group-writable; a failure to launch
    # chmod is logged but does not abort the run.
    try:
        subprocess.call(['chmod', '-R', 'g+w', cnf.work_dir])
    except OSError:
        err(traceback.format_exc())

    if cnf.samplesheet:
        cnf.samplesheet = verify_file(cnf.samplesheet, is_critical=True)

    info(' '.join(sys.argv))
    info()
    info('Created a temporary working directory: ' + cnf.work_dir)

    if cnf.project_name:
        info('Project name: ' + cnf.project_name)

    if cnf.samplesheet:
        info('Using custom sample sheet ' + cnf.samplesheet)

    check_genome_resources(cnf)
    check_system_resources(cnf, optional=['fastq'])

    return cnf, cnf.output_dir, fastq_fpaths