def _proc_file(inp_f, out_f, ctx=None):
    # proc_line_fun is expected to be defined in the enclosing module scope
    max_bunch_size = 1000 * 1000
    written_lines = 0
    bunch = []

    for i, line in enumerate(inp_f):
        clean_line = line.replace('\n', '')
        if clean_line:
            if ctx:
                new_l = proc_line_fun(clean_line, i, ctx)
            else:
                new_l = proc_line_fun(clean_line, i)
            if new_l is not None:
                bunch.append(new_l + '\n')
                written_lines += 1
        else:
            bunch.append(line)
            written_lines += 1

        if len(bunch) >= max_bunch_size:
            out_f.writelines(bunch)
            debug('Written lines: ' + str(written_lines))
            bunch = []

    out_f.writelines(bunch)
    debug('Written lines: ' + str(written_lines))

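# A minimal sketch of the line-processing hook that _proc_file() looks up in
# the enclosing module scope. The body below is illustrative, not part of the
# original code: the hook gets the stripped line and its index (plus an
# optional ctx) and returns the rewritten line, or None to drop it.
def proc_line_fun(line, i, ctx=None):
    if line.startswith('#'):
        return None  # drop comment lines
    return line.upper()  # example transform
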
def convert_file(work_dir, input_fpath, convert_file_fn, suffix=None, output_fpath=None,
                 check_result=True, overwrite=False, reuse=True, ctx=None):
    assert output_fpath or suffix, str(output_fpath) + ' ' + str(suffix)
    output_fpath = output_fpath or intermediate_fname(work_dir, input_fpath, suf=suffix)
    if output_fpath.endswith('.gz'):
        debug('output_fpath is .gz, but writing to uncompressed.')
        output_fpath = splitext(output_fpath)[0]

    if not overwrite:
        if can_reuse(output_fpath, cmp_f=input_fpath):
            debug('Reusing ' + output_fpath)
            return output_fpath
        if can_reuse(output_fpath + '.gz', cmp_f=input_fpath):
            debug('Reusing ' + output_fpath + '.gz')
            return output_fpath

    if islink(output_fpath):
        os.unlink(output_fpath)

    debug('Writing to ' + output_fpath)
    with file_transaction(work_dir, output_fpath) as tx_fpath:
        with open_gzipsafe(input_fpath) as inp_f, open(tx_fpath, 'w') as out_f:
            if ctx:
                convert_file_fn(inp_f, out_f, ctx)
            else:
                convert_file_fn(inp_f, out_f)

    if suffix or output_fpath:
        debug('Saved to ' + output_fpath)

    verify_file(output_fpath, is_critical=check_result)
    return output_fpath

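# Hypothetical usage sketch for convert_file(): the callback receives open
# input and output file handles, so any stream transform fits. The work dir,
# input path, and the uppercasing callback are illustrative only.
def _upper_case(inp_f, out_f):
    for line in inp_f:
        out_f.write(line.upper())

# out_fpath = convert_file('/tmp/work', '/data/sample.vcf.gz', _upper_case, suffix='upper')
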
def __init__(self, work_dir, output_dir, fai_fpath, bed_fpath=None, padding=None,
             reannotate=False, genome=None, is_debug=False):
    self.bed = None
    self.original_bed_fpath = None
    self.bed_fpath = None          # with genomic features
    self.capture_bed_fpath = None  # w/o genomic features
    self.qualimap_bed_fpath = None
    self.padded_bed_fpath = None

    self.gene_keys_set = set()    # set of pairs (gene_name, chrom)
    self.gene_keys_list = list()  # list of pairs (gene_name, chrom)
    self.regions_num = None

    self.bases_num = None
    self.fraction = None

    if bed_fpath:
        debug('Using target BED file ' + bed_fpath)
        self.is_wgs = False
        verify_bed(bed_fpath, is_critical=True)
        self.original_bed_fpath = bed_fpath
        self._make_target_bed(bed_fpath, work_dir, output_dir, padding=padding,
                              is_debug=is_debug, fai_fpath=fai_fpath, genome=genome,
                              reannotate=reannotate)
    else:
        debug('No input BED. Assuming whole genome. For region-based reports, analysing RefSeq CDS.')
        self.is_wgs = True
        self._make_wgs_regions_file(work_dir, genome=genome)

def combined_regional_reports(work_dir, output_dir, samples):
    if not any(verify_file(s.targqc_region_tsv, silent=True) for s in samples):
        return None

    tsv_region_rep_fpath = join(output_dir, basename(samples[0].targqc_region_tsv))
    debug('Combining regional reports, writing to ' + tsv_region_rep_fpath)

    with file_transaction(work_dir, tsv_region_rep_fpath) as tx_tsv:
        with open(tx_tsv, 'w') as tsv_out:
            # sample_i = 0
            # for s in samples:
            #     if s.targqc_region_txt and verify_file(s.targqc_region_txt):
            #         with open(s.targqc_region_txt) as txt_in:
            #             for l in txt_in:
            #                 if l.startswith('#'):
            #                     if not l.startswith('##') and sample_i == 0:
            #                         txt_out.write('#Sample' + ' '*(max(len('#Sample'), len(s.name)) - len('#Sample')) + ' ' + l.replace('#Chr', 'Chr '))
            #                 else:
            #                     txt_out.write(s.name + ' '*(max(len('#Sample'), len(s.name)) - len(s.name)) + ' ' + l)
            #         sample_i += 1
            sample_i = 0
            for s in samples:
                if s.targqc_region_tsv and verify_file(s.targqc_region_tsv):
                    with open(s.targqc_region_tsv) as tsv_in:
                        for i, l in enumerate(tsv_in):
                            if i == 0:
                                if sample_i == 0:
                                    tsv_out.write('sample\t' + l)
                            else:
                                tsv_out.write(s.name + '\t' + l)
                    sample_i += 1

    return tsv_region_rep_fpath

def _make_wgs_regions_file(self, work_dir, genome=None):
    self.wgs_bed_fpath = join(work_dir, 'targqc_features_to_report.bed')
    if can_reuse(self.wgs_bed_fpath, ebl.ensembl_gtf_fpath(genome)):
        return self.wgs_bed_fpath

    chr_order = reference_data.get_chrom_order(genome or cfg.genome)
    r_by_tx_by_gene = OrderedDefaultDict(lambda: defaultdict(list))
    all_features = ebl.get_all_features(genome or cfg.genome, high_confidence=True)

    debug('Select best transcript to report')
    for r in all_features:
        if r[ebl.BedCols.FEATURE] != 'gene':
            gene = r[ebl.BedCols.GENE]
            tx = r[ebl.BedCols.ENSEMBL_ID]
            r_by_tx_by_gene[gene][tx].append(r.fields)

    with file_transaction(work_dir, self.wgs_bed_fpath) as tx:
        with open(tx, 'w') as out:
            for gname, r_by_tx in r_by_tx_by_gene.items():
                all_tx = (x for xx in r_by_tx.values() for x in xx
                          if x[ebl.BedCols.FEATURE] == 'transcript')
                tx_sorted_list = [x[ebl.BedCols.ENSEMBL_ID]
                                  for x in sorted(all_tx, key=tx_priority_sort_key)]
                if not tx_sorted_list:
                    continue
                tx_id = tx_sorted_list[0]
                for r in sorted(r_by_tx[tx_id], key=get_sort_key(chr_order)):
                    out.write('\t'.join(str(f) for f in r) + '\n')

    return self.wgs_bed_fpath

def main():
    options = [
        (['-g', '--genome'], dict(
            dest='genome',
            help='Genome build. Accepted values: ' + ', '.join(ebl.SUPPORTED_GENOMES),
        )),
    ]
    parser = OptionParser()
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()

    if not opts.genome:
        critical('Error: please, specify genome build name with -g (e.g. `-g hg19`)')
    genome = opts.genome

    debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ebl.SUPPORTED_GENOMES))

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x: x[ebl.BedCols.FEATURE] == 'CDS')
    features_bed = features_bed.filter(ebl.get_only_canonical_filter(genome))

    info('Saving CDS regions...')
    output_fpath = adjust_path(join(dirname(__file__), pardir, genome, 'bed', 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        features_bed.cut(range(6)).saveas(tx)
    info('Done, saved to ' + output_fpath)

def check_md5(work_dir, fpath, file_type, silent=False):
    md5_fpath = join(work_dir, file_type + '_md5.txt')
    new_md5 = md5(fpath)
    info('md5 of ' + fpath + ' is ' + str(new_md5))

    prev_md5 = None
    if isfile(md5_fpath):
        with open(md5_fpath) as f:
            prev_md5 = f.read()
    else:
        info('Previous md5 file ' + md5_fpath + ' does not exist')
    info('Checking previous md5 from ' + md5_fpath + ': ' + str(prev_md5))

    if prev_md5 == new_md5:
        if not silent:
            debug('Reusing previous ' + file_type.upper() + ' files.')
        return True
    else:
        if not silent:
            info('Pre-processing input ' + file_type.upper() + ' file')
        if prev_md5:
            if not silent:
                info('Prev ' + file_type.upper() + ' md5: ' + str(prev_md5))
                info('New ' + file_type.upper() + ' md5: ' + str(new_md5))

        with open(md5_fpath, 'w') as f:
            f.write(str(new_md5))

        return False

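# Hedged usage sketch: check_md5() returns True when the stored checksum
# matches the current file, letting callers skip re-processing. The paths and
# the downstream step are illustrative.
# if not check_md5('/tmp/work', '/data/sample_R1.fastq.gz', 'fastq'):
#     preprocess_fastq()  # hypothetical re-processing step
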
def get_all_features(genome, high_confidence=False, features=None, gene_names=None, only_canonical=False):
    _canon_filt = get_only_canonical_filter(genome) if only_canonical else None

    ori_genome = genome
    genome = genome.replace('GRCh37', 'hg19')
    genome = genome.replace('GRCh38', 'hg38')

    bed = _get_ensembl_file('ensembl.bed', genome)

    def _filter(x):
        if features:
            if x[BedCols.FEATURE] not in features:
                return False
        if gene_names:
            if x[BedCols.GENE] not in gene_names:
                return False
        if _canon_filt:
            if not _canon_filt(x):
                return False
        return True

    debug('Filtering BEDTool for: specific features, specific genes, canonical')
    bed = bed.filter(_filter)

    if ori_genome.startswith('GRCh'):
        def fix_chr(r):
            r.chrom = r.chrom.replace('chrM', 'MT').replace('chr', '')
            return r
        bed = bed.each(fix_chr)

    # selecting columns up to TX_OVERLAP_PERCENTAGE (to remove Hugo)
    def _select_cols(r):
        r = r[:len(BedCols.cols)-4]
        return r
    bed = bed.each(_select_cols)

    return bed

def sort_bed_gsort(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either fai_fpath or genome build name must be specified')

    with file_transaction(work_dir, output_bed_fpath) as tx:
        run('gsort {input_bed_fpath} {fai_fpath}'.format(**locals()), output_fpath=tx)

    return output_bed_fpath

def get_chrom_lengths(genome=None, fai_fpath=None):
    assert genome or fai_fpath, 'One of genome or fai_fpath should be not None: ' \
        'genome=' + str(genome) + ' fai_fpath=' + str(fai_fpath)

    if not fai_fpath:
        check_genome(genome)
        fai_fpath = get_fai(genome)
    else:
        fai_fpath = verify_file(fai_fpath, is_critical=True)
        if not fai_fpath.endswith('.fai') and not fai_fpath.endswith('.fa'):
            critical('Error: only .fai or .fa files are accepted.')

    chr_lengths = []

    if fai_fpath.endswith('.fa'):
        debug('Reading genome sequence (.fa) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            from Bio import SeqIO
            reference_records = SeqIO.parse(handle, 'fasta')
            for record in reference_records:
                chrom = record.id
                chr_lengths.append((chrom, len(record.seq)))
    else:
        debug('Reading genome index file (.fai) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            for line in handle:
                line = line.strip()
                if line:
                    chrom, length = line.split()[0], int(line.split()[1])
                    chr_lengths.append((chrom, length))

    return chr_lengths

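# Self-contained illustration of the .fai branch above: each line of a .fai
# index is whitespace-separated with the chromosome name first and its length
# second; the sample line here is made up.
example_fai_line = 'chr1\t248956422\t112\t70\t71'
example_chrom, example_len = example_fai_line.split()[0], int(example_fai_line.split()[1])
assert (example_chrom, example_len) == ('chr1', 248956422)
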
def make_region_reports(view, work_dir, samples, target, genome, depth_thresholds):
    bed_fpath = target.bed_fpath or target.wgs_bed_fpath

    if all(can_reuse(s.targqc_region_tsv, [s.bam, bed_fpath]) for s in samples):
        debug('All region reports exist, reusing')
        return [s.targqc_region_tsv for s in samples]

    info('Calculating coverage statistics for CDS and exon regions from RefSeq...')

    depth_thresholds_by_sample = dict()
    for s in samples:
        depth_thresholds_by_sample[s.name] = depth_thresholds

    debug()
    debug('Running sambamba...')
    sambamba_depth_output_fpaths = view.run(sambamba_depth,
        [[s.work_dir, bed_fpath, s.bam, depth_thresholds_by_sample[s.name], None, s.name]
         for s in samples])
    assert len(sambamba_depth_output_fpaths) == len(samples), \
        'Number of sambamba results = ' + str(len(sambamba_depth_output_fpaths)) + \
        ' which is less than the number of samples ' + str(len(samples))

    debug()
    debug('Parsing sambamba results and writing results...')
    view.run(_proc_sambamba_depth,
        [[sambamba_output_fpath, s.targqc_region_tsv, s.name, depth_thresholds_by_sample[s.name]]
         for sambamba_output_fpath, s in zip(sambamba_depth_output_fpaths, samples)])

    info('Done.')
    return [s.targqc_region_tsv for s in samples]

def safe_symlink_to(fpath, dst_dirpath):
    dst = join(dst_dirpath, basename(fpath))
    if not exists(dst):
        try:
            if os.lstat(dst):  # broken symlink
                os.remove(dst)
        except OSError:
            pass
        debug('Symlink ' + fpath + ' -> ' + dst)
        os.symlink(fpath, dst)
    return dst

def run_qualimap(work_dir, output_dir, output_fpaths, bam_fpath, genome, bed_fpath=None, threads=1):
    info('Analysing ' + bam_fpath)

    safe_mkdir(dirname(output_dir))
    safe_mkdir(output_dir)

    mem_cmdl = ''
    mem_m = get_qualimap_max_mem(bam_fpath)
    mem = str(int(mem_m)) + 'M'
    mem_cmdl = '--java-mem-size=' + mem

    cmdline = (find_executable() + ' bamqc --skip-duplicated -nt {threads} {mem_cmdl} -nr 5000 '
               '-bam {bam_fpath} -outdir {output_dir}')

    if genome.startswith('hg') or genome.startswith('GRCh'):
        cmdline += ' -gd HUMAN'
    if genome.startswith('mm'):
        cmdline += ' -gd MOUSE'

    if bed_fpath:
        cmdline += ' -gff {bed_fpath}'
        debug('Using amplicons/capture panel ' + bed_fpath)

    cmdline = cmdline.format(**locals())

    if not all(can_reuse(fp, [bam_fpath, bed_fpath] if bed_fpath else [bam_fpath])
               for fp in output_fpaths):
        for fp in output_fpaths:
            if isfile(fp):
                os.remove(fp)
        try:
            run(cmdline, env_vars=dict(DISPLAY=None))
        except subprocess.CalledProcessError as e:
            if 'The alignment file is unsorted.' in e.output:
                info()
                info('BAM file is unsorted; trying to sort and rerun QualiMap')
                sorted_bam_fpath = sort_bam(bam_fpath)
                cmdline = cmdline.replace(bam_fpath, sorted_bam_fpath)
                run(cmdline, env_vars=dict(DISPLAY=None))

    if not all(verify_file(fp, cmp_f=[bam_fpath, bed_fpath] if bed_fpath else [bam_fpath])
               for fp in output_fpaths):
        critical('Some of the QualiMap results were not generated')

    return output_dir

def get_merged_cds(genome):
    """
    Returns all CDS merged, used:
    - for TargQC general reports CDS coverage statistics for WGS
    - for Seq2C CNV calling when no capture BED available
    """
    bed = get_all_features(genome)
    debug('Filtering BEDTool for high confidence CDS and stop codons')
    return bed\
        .filter(lambda r: r.fields[BedCols.FEATURE] in ['CDS', 'stop_codon'])\
        .filter(high_confidence_filter)\
        .merge()

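# Hedged usage sketch: the merged CDS BedTool can stand in for a capture BED
# in WGS runs; the output path is illustrative.
# get_merged_cds('hg19').saveas('/tmp/work/cds_merged.bed')
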
def can_reuse(fpath, cmp_f, silent=False):
    do_reuse = os.environ.get('REUSE', '1')
    if do_reuse == '0':
        return False
    if not fpath or not isfile(fpath):
        return False
    elif verify_file(fpath, cmp_f=cmp_f, silent=True):
        if not silent:
            debug('Reusing ' + fpath)
        return True
    else:
        return False

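# Usage sketch: reuse is opt-out via the REUSE environment variable read at
# call time; paths here are illustrative.
# os.environ['REUSE'] = '0'  # force regeneration of all intermediates
# if can_reuse('/tmp/work/target.sorted.bed', cmp_f='/data/target.bed'):
#     return '/tmp/work/target.sorted.bed'  # up to date, skip recomputing
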
def get_parallel_view(n_samples, parallel_cfg):
    if parallel_cfg.scheduler and parallel_cfg.threads > 1:
        debug('Starting' + (' test' if not is_cluster() else '') + ' cluster (scheduler: ' +
              parallel_cfg.scheduler + ', queue: ' + parallel_cfg.queue + ') using ' +
              str(parallel_cfg.num_jobs(n_samples)) + ' nodes, ' +
              str(parallel_cfg.cores_per_job(n_samples)) + ' threads per sample')
        return ClusterView(n_samples, parallel_cfg)
    else:
        debug('Running locally using ' + str(parallel_cfg.num_jobs(n_samples)) + ' thread(s)')
        return ThreadedView(n_samples, parallel_cfg)

def clean_bed(bed_fpath, work_dir):
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')

    if not can_reuse(clean_fpath, bed_fpath):
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        bed = BedTool(bed_fpath)
        bed = bed.filter(lambda x: x.chrom and
                         not any(x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))
        bed = bed.remove_invalid()
        with file_transaction(work_dir, clean_fpath) as tx_out_file:
            bed.saveas(tx_out_file)
        verify_bed(clean_fpath, is_critical=True)
        debug('Saved clean BED file into ' + clean_fpath)

    return clean_fpath

def bam_to_bed(bam_fpath, to_gzip=True):
    debug('Converting the BAM to BED to save some memory.')  # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + ('.bed.gz' if to_gzip else '.bed')
    if can_reuse(bam_bed_fpath, bam_fpath):
        return bam_bed_fpath
    bedtools = which('bedtools')
    gzip = which('gzip')
    cmdline = '{bedtools} bamtobed -i {bam_fpath}'.format(**locals())
    cmdline += ' | {gzip}'.format(**locals()) if to_gzip else ''
    call_process.run(cmdline, output_fpath=bam_bed_fpath)
    return bam_bed_fpath

def set_up_dirs(proc_name, output_dir=None, work_dir=None, log_dir=None):
    """ Creates output_dir, work_dir, and sets up log
    """
    output_dir = safe_mkdir(adjust_path(output_dir or join(os.getcwd(), proc_name)), 'output_dir')
    debug('Saving results into ' + output_dir)

    work_dir = safe_mkdir(work_dir or join(output_dir, 'work'), 'working directory')
    info('Using work directory ' + work_dir)

    log_fpath = set_up_log(log_dir or safe_mkdir(join(work_dir, 'log')), proc_name + '.log')

    return output_dir, work_dir, log_fpath

def run_multisample_qualimap(output_dir, work_dir, samples, targqc_full_report):
    """ 1. Generates Qualimap2 plots and puts them into plots_dirpath
        2. Adds records to targqc_full_report.plots
    """
    plots_dirpath = join(output_dir, 'plots')
    individual_report_fpaths = [s.qualimap_html_fpath for s in samples]
    if isdir(plots_dirpath) and not any(
            not can_reuse(join(plots_dirpath, f), individual_report_fpaths)
            for f in listdir(plots_dirpath) if not f.startswith('.')):
        debug('Qualimap multisample plots exist - ' + plots_dirpath + ', reusing...')
    else:
        # Qualimap2 run for multi-sample plots
        if len([s.qualimap_html_fpath for s in samples if s.qualimap_html_fpath]) > 0:
            if find_executable() is not None:  # and get_qualimap_type(find_executable()) == 'full':
                qualimap_output_dir = join(work_dir, 'qualimap_multi_bamqc')

                _correct_qualimap_genome_results(samples)
                _correct_qualimap_insert_size_histogram(samples)

                safe_mkdir(qualimap_output_dir)
                rows = []
                for sample in samples:
                    if sample.qualimap_html_fpath:
                        rows += [[sample.name, sample.qualimap_html_fpath]]

                data_fpath = write_tsv_rows(([], rows), join(qualimap_output_dir, 'qualimap_results_by_sample.tsv'))
                qualimap_plots_dirpath = join(qualimap_output_dir, 'images_multisampleBamQcReport')
                cmdline = find_executable() + ' multi-bamqc --data {data_fpath} -outdir {qualimap_output_dir}'.format(**locals())
                run(cmdline, env_vars=dict(DISPLAY=None),
                    checks=[lambda _1, _2: verify_dir(qualimap_output_dir)],
                    reuse=cfg.reuse_intermediate)

                if not verify_dir(qualimap_plots_dirpath):
                    warn('Warning: Qualimap for multi-sample analysis failed to finish. TargQC will not contain plots.')
                    return None
                else:
                    if exists(plots_dirpath):
                        shutil.rmtree(plots_dirpath)
                    shutil.move(qualimap_plots_dirpath, plots_dirpath)
            else:
                warn('Warning: Qualimap for multi-sample analysis was not found. TargQC will not contain plots.')
                return None

    targqc_full_report.plots = []
    for plot_fpath in listdir(plots_dirpath):
        plot_fpath = join(plots_dirpath, plot_fpath)
        if verify_file(plot_fpath) and plot_fpath.endswith('.png'):
            targqc_full_report.plots.append(relpath(plot_fpath, output_dir))

def get_gtf_db(gtf, in_memory=False):
    """ create a gffutils DB """
    db_file = gtf + '.db'
    if gtf.endswith('.gz'):
        db_file = gtf[:-3] + '.db'
    if file_exists(db_file):
        return gffutils.FeatureDB(db_file)
    db_file = ':memory:' if in_memory else db_file
    if in_memory or not file_exists(db_file):
        debug('GTF database does not exist, creating...')
        infer_extent = guess_infer_extent(gtf)
        db = gffutils.create_db(gtf, dbfn=db_file, infer_gene_extent=infer_extent)
        return db
    else:
        return gffutils.FeatureDB(db_file)

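# Hedged usage sketch: the returned gffutils.FeatureDB supports typed and
# region-limited queries; the GTF path and region below are illustrative.
# db = get_gtf_db('/refs/Homo_sapiens.GRCh37.75.gtf.gz')
# for exon in db.features_of_type('exon', limit=('1', 1000000, 2000000)):
#     print(exon.id, exon.start, exon.end)
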
def _make_qualimap_bed(self, work_dir):
    if self.is_wgs:
        return None

    self.qualimap_bed_fpath = intermediate_fname(work_dir, self.capture_bed_fpath, 'qualimap_ready')
    if can_reuse(self.qualimap_bed_fpath, self.capture_bed_fpath):
        return self.qualimap_bed_fpath

    debug('Merging and saving BED into required bed6 format for Qualimap')
    bed = self.bed.sort().merge()

    with file_transaction(work_dir, self.qualimap_bed_fpath) as tx:
        with open(tx, 'w') as out:
            for i, region in enumerate(x for x in bed):
                region = [x for x in list(region) if x]
                fillers = [str(i), "1.0", "+"]
                full = region + fillers[:6 - len(region)]
                out.write("\t".join(full) + "\n")

    verify_file(self.qualimap_bed_fpath, is_critical=True)
    return self.qualimap_bed_fpath

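# Illustration of the BED6 padding above, using made-up values: a 3-column
# interval gets a name (its index), a score, and a strand appended so that
# Qualimap accepts the file as BED6.
example_region = ['chr1', '100', '200']
example_fillers = ['0', '1.0', '+']
assert example_region + example_fillers[:6 - len(example_region)] == \
    ['chr1', '100', '200', '0', '1.0', '+']
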
def set_up_log(log_dir, log_fname):
    log_fpath = join(log_dir, log_fname)

    if file_exists(log_fpath):
        timestamp = datetime.datetime.fromtimestamp(os.stat(log_fpath).st_mtime)
        mv_log_fpath = log_fpath + '.' + timestamp.strftime('%Y-%m-%d_%H-%M-%S_' + str(random() * 1000))
        try:
            if isfile(mv_log_fpath):
                os.remove(mv_log_fpath)
            if not isfile(mv_log_fpath):
                os.rename(log_fpath, mv_log_fpath)
        except OSError:
            pass

    logger.set_log_path(log_fpath)
    debug('Logging to ' + log_fpath)
    debug()
    return log_fpath

def make_general_reports(view, samples, target, genome, depth_threshs, bed_padding,
                         num_pairs_by_sample=None, reuse=False, is_debug=False,
                         reannotate=False, fai_fpath=None):
    if all(all(can_reuse(fp, [s.bam, target.qualimap_bed_fpath] if target.bed else s.bam)
               for fp in _qualimap_outputs(s))
           for s in samples):
        debug('All QualiMap files for all samples exist and are newer than BAMs and BEDs, reusing')
    else:
        info('Running QualiMap...')
        view.run(runner.run_qualimap,
            [[s.work_dir, s.qualimap_dirpath, _qualimap_outputs(s), s.bam, genome,
              target.qualimap_bed_fpath, view.cores_per_job]
             for s in samples])

        for s in samples:
            for fp in _qualimap_outputs(s):
                verify_file(fp, is_critical=True)

    summary_reports = []

    for sample in samples:
        info('-'*70)
        info(sample.name)
        debug('-'*70)
        debug('Parsing QualiMap results...')
        depth_stats, reads_stats, indels_stats, target_stats = parse_qualimap_results(sample)

        _prep_report_data(sample, depth_stats, reads_stats, indels_stats, target_stats,
                          target, num_pairs_by_sample, genome, depth_threshs, fai_fpath=fai_fpath)

        r = _build_report(depth_stats, reads_stats, indels_stats, sample, target,
                          depth_threshs, bed_padding, sample_num=len(samples),
                          is_debug=is_debug, reannotate=reannotate)
        summary_reports.append(r)

    return summary_reports

def _get(relative_path, genome=None):
    """
    :param relative_path: relative path of the file inside the repository
    :param genome: genome name. Can contain a chromosome name after a dash, like hg19-chr20;
                   in case of BED, the returned BedTool will have an added filter.
    :return: BedTool object if it's a BED file, or filepath
    """
    chrom = None
    if genome:
        if '-chr' in genome:
            genome, chrom = genome.split('-')
        check_genome(genome)
        relative_path = relative_path.format(genome=genome)

    path = abspath(join(dirname(__file__), relative_path))
    if not isfile(path) and isfile(path + '.gz'):
        path += '.gz'

    if path.endswith('.bed') or path.endswith('.bed.gz'):
        if path.endswith('.bed.gz'):
            bedtools = which('bedtools')
            if not bedtools:
                critical('bedtools not found in PATH: ' + str(os.environ['PATH']))
            debug('BED is compressed, creating BedTool')
            bed = BedTool(path)
        else:
            debug('BED is uncompressed, creating BedTool')
            bed = BedTool(path)

        if chrom:
            debug('Filtering BEDTool for chrom ' + chrom)
            bed = bed.filter(lambda r: r.chrom == chrom)

        return bed
    else:
        return path

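# Hedged illustration of the '-chr' convention handled above: a genome value
# like 'hg19-chr20' splits into the build and a chromosome filter. The call
# below is a hypothetical example of requesting one chromosome's CDS BED.
# cds_chr20 = _get('{genome}/bed/CDS-canonical.bed', genome='hg19-chr20')
example_genome, example_chr = 'hg19-chr20'.split('-')
assert (example_genome, example_chr) == ('hg19', 'chr20')
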
def sort_bed(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None,
             chr_order=None, genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    regions = []

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical('One of chr_order, fai_fpath, or genome build name must be specified')
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    with open(input_bed_fpath) as f:
        with file_transaction(work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for l in f:
                    if not l.strip():
                        continue
                    if l.strip().startswith('#'):
                        out.write(l)
                        continue

                    fs = l.strip().split('\t')
                    chrom = fs[0]
                    start = int(fs[1])
                    end = int(fs[2])
                    other_fields = fs[3:]
                    order = chr_order.get(chrom, -1)
                    regions.append(Region(chrom, start, end, other_fields, order))

                for region in sorted(regions, key=lambda r: r.get_key()):
                    fs = [region.chrom, str(region.start), str(region.end)]
                    fs.extend(region.other_fields)
                    out.write('\t'.join(fs) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' + output_bed_fpath)
    return output_bed_fpath

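# Usage sketch: passing genome pulls chromosome order from that build's .fai;
# chr_order or fai_fpath can be given directly instead. Paths are illustrative.
# sorted_bed_fpath = sort_bed('/data/target.bed', work_dir='/tmp/work', genome='hg19')
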
def _make_target_bed(self, bed_fpath, work_dir, output_dir, is_debug,
                     padding=None, fai_fpath=None, genome=None, reannotate=False):
    clean_target_bed_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
    if not can_reuse(clean_target_bed_fpath, bed_fpath):
        debug()
        debug('Cleaning target BED file...')
        bed = BedTool(bed_fpath)
        if bed.field_count() > 4:
            bed = bed.cut(range(4))
        bed = bed\
            .filter(lambda x: x.chrom and not any(x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))\
            .remove_invalid()
        with file_transaction(work_dir, clean_target_bed_fpath) as tx:
            bed.saveas(tx)
        debug('Saved to ' + clean_target_bed_fpath)
        verify_file(clean_target_bed_fpath, is_critical=True)

    sort_target_bed_fpath = intermediate_fname(work_dir, clean_target_bed_fpath, 'sorted')
    if not can_reuse(sort_target_bed_fpath, clean_target_bed_fpath):
        debug()
        debug('Sorting target BED file...')
        sort_target_bed_fpath = sort_bed(clean_target_bed_fpath,
                                         output_bed_fpath=sort_target_bed_fpath,
                                         fai_fpath=fai_fpath)
        debug('Saved to ' + sort_target_bed_fpath)
        verify_file(sort_target_bed_fpath, is_critical=True)

    if genome in ebl.SUPPORTED_GENOMES:
        ann_target_bed_fpath = intermediate_fname(work_dir, sort_target_bed_fpath, 'ann_plus_features')
        if not can_reuse(ann_target_bed_fpath, sort_target_bed_fpath):
            debug()
            if BedTool(sort_target_bed_fpath).field_count() == 3 or reannotate:
                debug('Annotating target BED file and collecting overlapping genome features')
                overlap_with_features(sort_target_bed_fpath, ann_target_bed_fpath, work_dir=work_dir,
                                      genome=genome, extended=True, reannotate=reannotate,
                                      only_canonical=True)
            else:
                debug('Overlapping with genomic features:')
                overlap_with_features(sort_target_bed_fpath, ann_target_bed_fpath, work_dir=work_dir,
                                      genome=genome, extended=True, only_canonical=True)
            debug('Saved to ' + ann_target_bed_fpath)
            verify_file(ann_target_bed_fpath, is_critical=True)
    else:
        ann_target_bed_fpath = sort_target_bed_fpath

    final_clean_target_bed_fpath = intermediate_fname(work_dir, ann_target_bed_fpath, 'clean')
    if not can_reuse(final_clean_target_bed_fpath, ann_target_bed_fpath):
        bed = BedTool(ann_target_bed_fpath).remove_invalid()
        with file_transaction(work_dir, final_clean_target_bed_fpath) as tx:
            bed.saveas(tx)
        verify_file(final_clean_target_bed_fpath, is_critical=True)

    self.bed_fpath = final_clean_target_bed_fpath
    self.bed = BedTool(self.bed_fpath)

    self.capture_bed_fpath = add_suffix(join(output_dir, basename(bed_fpath)), 'clean_sorted_ann')
    if not can_reuse(self.capture_bed_fpath, self.bed_fpath):
        with file_transaction(work_dir, self.capture_bed_fpath) as tx:
            self.get_capture_bed().saveas(tx)

    gene_key_set, gene_key_list = get_genes_from_bed(bed_fpath)
    self.gene_keys_set = gene_key_set
    self.gene_keys_list = gene_key_list
    self.regions_num = self.get_capture_bed().count()

    self._make_qualimap_bed(work_dir)
    if padding:
        self._make_padded_bed(work_dir, fai_fpath, padding)

def _annotate(bed, ref_bed, chr_order, fai_fpath, work_dir, ori_col_num,
              high_confidence=False, reannotate=False, is_debug=False, **kwargs):
    # if genome:
    #     genome_fpath = cut(fai_fpath, 2, output_fpath=intermediate_fname(work_dir, fai_fpath, 'cut2'))
    #     intersection = bed.intersect(ref_bed, sorted=True, wao=True, g='<(cut -f1,2 ' + fai_fpath + ')')
    #     intersection = bed.intersect(ref_bed, sorted=True, wao=True, genome=genome.split('-')[0])
    # else:

    intersection_bed = None
    intersection_fpath = None

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    if is_debug:
        intersection_fpath = join(work_dir, 'intersection.bed')
        if isfile(intersection_fpath):
            info('Loading from ' + intersection_fpath)
            intersection_bed = BedTool(intersection_fpath)
    if not intersection_bed:
        if count_bed_cols(fai_fpath) == 2:
            debug('Fai fields size is 2 ' + fai_fpath)
            intersection_bed = bed.intersect(ref_bed, wao=True, g=fai_fpath)
        else:
            debug('Fai fields is ' + str(count_bed_cols(fai_fpath)) + ', not 2')
            intersection_bed = bed.intersect(ref_bed, wao=True)
    if is_debug and not isfile(intersection_fpath):
        intersection_bed.saveas(intersection_fpath)
        debug('Saved intersection to ' + intersection_fpath)

    total_annotated = 0
    total_uniq_annotated = 0
    total_off_target = 0

    met = set()

    overlaps_by_tx_by_gene_by_loc = OrderedDefaultDict(lambda: OrderedDefaultDict(lambda: defaultdict(list)))
    # off_targets = list()

    expected_fields_num = ori_col_num + len(ebl.BedCols.cols[:-4]) + 1
    for i, intersection_fields in enumerate(intersection_bed):
        inters_fields_list = list(intersection_fields)
        if len(inters_fields_list) < expected_fields_num:
            critical(f'Cannot parse the reference BED file - unexpected number of fields '
                     f'({len(inters_fields_list)}) in {inters_fields_list} '
                     f'(expected at least {expected_fields_num})')

        a_chr, a_start, a_end = intersection_fields[:3]
        a_extra_columns = intersection_fields[3:ori_col_num]

        overlap_fields = [None for _ in ebl.BedCols.cols]
        overlap_fields[:len(intersection_fields[ori_col_num:-1])] = intersection_fields[ori_col_num:-1]
        keep_gene_column = not reannotate
        a_gene = None
        if keep_gene_column:
            a_gene = a_extra_columns[0]
        e_chr = overlap_fields[0]
        overlap_size = int(intersection_fields[-1])
        assert e_chr == '.' or a_chr == e_chr, \
            f'Error on line {i}: chromosomes don\'t match ({a_chr} vs {e_chr}). Line: {intersection_fields}'

        # fs = [None for _ in ebl.BedCols.cols]
        # fs[:3] = [a_chr, a_start, a_end]
        reg = (a_chr, int(a_start), int(a_end), tuple(a_extra_columns))

        if e_chr == '.':
            total_off_target += 1
            # off_targets.append(fs)
            overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
        else:
            # fs[3:-1] = db_feature_fields[3:-1]
            total_annotated += 1
            if (a_chr, a_start, a_end) not in met:
                total_uniq_annotated += 1
                met.add((a_chr, a_start, a_end))

            e_gene = overlap_fields[ebl.BedCols.GENE]
            if keep_gene_column and e_gene != a_gene:
                overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
            else:
                transcript_id = overlap_fields[ebl.BedCols.ENSEMBL_ID]
                overlaps_by_tx_by_gene_by_loc[reg][e_gene][transcript_id].append((overlap_fields, overlap_size))

    info('  Total annotated regions: ' + str(total_annotated))
    info('  Total unique annotated regions: ' + str(total_uniq_annotated))
    info('  Total off target regions: ' + str(total_off_target))
    info('Resolving ambiguities...')
    annotated = _resolve_ambiguities(overlaps_by_tx_by_gene_by_loc, chr_order, **kwargs)

    return annotated

def annotate(input_bed_fpath, output_fpath, work_dir, genome=None,
             reannotate=True, high_confidence=False, only_canonical=False,
             coding_only=False, short=False, extended=False, is_debug=False, **kwargs):
    debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ebl.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath, work_dir=work_dir, chr_order=chr_order, genome=genome)

    ori_bed = BedTool(input_bed_fpath)
    ori_col_num = ori_bed.field_count()
    reannotate = reannotate or ori_col_num == 3
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))

    # if reannotate:
    #     bed = BedTool(input_bed_fpath).cut([0, 1, 2])
    #     keep_gene_column = False
    # else:
    #     if col_num > 4:
    #         bed = BedTool(input_bed_fpath).cut([0, 1, 2, 3])
    #     keep_gene_column = True

    # features_bed = features_bed.saveas()
    # cols = features_bed.field_count()
    # if cols < 12:
    #     features_bed = features_bed.each(lambda f: f + ['.']*(12-cols))
    if high_confidence:
        features_bed = features_bed.filter(ebl.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ebl.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ebl.protein_coding_filter)
    # unique_tx_by_gene = find_best_tx_by_gene(features_bed)

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x:
        x[ebl.BedCols.FEATURE] in ['exon', 'CDS', 'stop_codon', 'transcript'])
        # x[ebl.BedCols.ENSEMBL_ID] == unique_tx_by_gene[x[ebl.BedCols.GENE]])

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')
    annotated = _annotate(ori_bed, features_bed, chr_order, fai_fpath, work_dir, ori_col_num,
                          high_confidence=False, reannotate=reannotate, is_debug=is_debug, **kwargs)

    full_header = [ebl.BedCols.names[i] for i in ebl.BedCols.cols]
    add_ori_extra_fields = ori_col_num > 3
    if not reannotate and ori_col_num == 4:
        add_ori_extra_fields = False  # no need to report the original gene field if we are not re-annotating

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            header = full_header[:6]
            if short:
                header = full_header[:4]
            if extended:
                header = full_header[:-1]
            if add_ori_extra_fields:
                header.append(full_header[-1])

            if extended:
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.TX_OVERLAP_PERCENTAGE] +
                          ': part of region overlapping with transcripts\n')
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with exons\n')
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with protein coding regions\n')
            out.write('\t'.join(header) + '\n')
            for full_fields in annotated:
                fields = full_fields[:6]
                if short:
                    fields = full_fields[:4]
                if extended:
                    fields = full_fields[:-1]
                if add_ori_extra_fields:
                    fields.append(full_fields[-1])
                out.write('\t'.join(map(_format_field, fields)) + '\n')
                total += 1

    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath

def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' +
              str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) + ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        debug('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s male')
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex

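# Worked example of the ratio rule above, with made-up depths and an assumed
# FEMALE_Y_COVERAGE_FACTOR of 10: at 100x average depth, 5x over the chrY key
# regions gives factor 100 / 5 = 20 > 10, so the call is 'F'; 60x over chrY
# gives factor 100 / 60 ~= 1.7, so the call is 'M'.
_example_factor = 10.0  # hypothetical FEMALE_Y_COVERAGE_FACTOR
assert (100.0 / 5.0) > _example_factor         # -> sex = 'F'
assert not ((100.0 / 60.0) > _example_factor)  # -> sex = 'M'
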
def proc_fastq(samples, parall_view, work_dir, bwa_prefix, downsample_to, num_pairs_by_sample=None, dedup=True):
    num_pairs_by_sample = num_pairs_by_sample or dict()
    if downsample_to:
        # Read pairs counts
        debug()
        if all(s.name in num_pairs_by_sample for s in samples):
            debug('Using read pairs counts extracted from FastQC reports')
        elif all(can_reuse(make_pair_counts_fpath(join(work_dir, s.name)), s.l_fpath) for s in samples):
            debug('Reusing pairs counts, reading from files')
            num_pairs_by_sample = {s.name: int(open(make_pair_counts_fpath(join(work_dir, s.name))).read().strip())
                                   for s in samples}
        else:
            info('Counting read pairs')
            num_pairs = parall_view.run(count_read_pairs,
                [[s.name, safe_mkdir(join(work_dir, s.name)), s.l_fpath] for s in samples])
            num_pairs_by_sample = {s.name: pairs_count for s, pairs_count in zip(samples, num_pairs)}

        # Downsampling
        debug()
        if all(can_reuse(make_downsampled_fpath(join(work_dir, s.name), s.l_fpath), s.l_fpath) and
               can_reuse(make_downsampled_fpath(join(work_dir, s.name), s.r_fpath), s.r_fpath)
               for s in samples):
            debug('Reusing downsampled FastQ')
            for s in samples:
                s.l_fpath = make_downsampled_fpath(join(work_dir, s.name), s.l_fpath)
                s.r_fpath = make_downsampled_fpath(join(work_dir, s.name), s.r_fpath)
        else:
            if isinstance(downsample_to, float):
                info('Downsampling FastQ to ' + str(float(downsample_to)) + ' fraction of reads')
            else:
                info('Downsampling FastQ to ' + str(int(downsample_to)) + ' read pairs')
            fastq_pairs = parall_view.run(downsample,
                [[join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, downsample_to,
                  num_pairs_by_sample.get(s.name)]
                 for s in samples])
            for s, (l_r, r_r) in zip(samples, fastq_pairs):
                s.l_fpath = l_r
                s.r_fpath = r_r
    else:
        info('Skipping downsampling')

    debug()
    if all(can_reuse(make_bam_fpath(join(work_dir, s.name)), [s.l_fpath, s.r_fpath]) for s in samples):
        debug('All downsampled BAMs exist, reusing')
        for s in samples:
            s.bam = make_bam_fpath(join(work_dir, s.name))
    else:
        bwa = which('bwa')
        if not bwa or not isfile(bwa):  # which() can return None, so check before isfile()
            critical('BWA not found under ' + str(bwa))
        smb = sambamba.get_executable()
        if not (bwa and smb):
            if not bwa:
                err('Error: bwa is required for the alignment pipeline')
            if not smb:
                err('Error: sambamba is required for the alignment pipeline')
            critical('Tools required for alignment not found')
        info('Aligning reads to the reference')
        bam_fpaths = parall_view.run(align,
            [[join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, bwa, smb, bwa_prefix,
              dedup, parall_view.cores_per_job]
             for s in samples])

        bam_fpaths = [verify_bam(b) for b in bam_fpaths]
        if len(bam_fpaths) < len(samples):
            critical('Some samples were not aligned successfully.')
        for bam, s in zip(bam_fpaths, samples):
            s.bam = bam

    return num_pairs_by_sample

def __init__(self, n_samples, parallel_cfg):
    BaseView.__init__(self, n_samples, parallel_cfg)
    from cluster_helper.cluster import ClusterView as CV
    self._view = CV(**parallel_cfg.get_cluster_params(n_samples))
    debug('Starting cluster with ' + str(self.num_jobs) + ' open nodes, ' +
          str(self.cores_per_job) + ' cores per node')

def run(self, fn, param_lists):
    debug('Starting multithreaded function ' + str(fn))
    assert self.n_samples == len(param_lists)
    return self._view(delayed(fn)(*params) for params in param_lists)

def _proc_sambamba_depth(sambamba_depth_output_fpath, output_fpath, sample_name, depth_thresholds):
    read_count_col = None
    mean_cov_col = None
    median_cov_col = None
    min_depth_col = None
    std_dev_col = None
    wn_20_pcnt_col = None

    if can_reuse(output_fpath, sambamba_depth_output_fpath):
        return output_fpath

    debug('Reading coverage statistics and writing regions to ' + output_fpath)

    def write_line(f, fields):
        f.write('\t'.join(fields) + '\n')

    with file_transaction(None, output_fpath) as tx:
        with open(sambamba_depth_output_fpath) as sambamba_depth_file, open(tx, 'w') as out:
            total_regions_count = 0
            for line in sambamba_depth_file:
                fs = line.strip('\n').split('\t')
                if line.startswith('#'):
                    fs = line.split('\t')
                    read_count_col = fs.index('readCount') + 1
                    mean_cov_col = fs.index('meanCoverage') + 1
                    # median_cov_col = fs.index('medianCoverage') if 'medianCoverage' in fs else None
                    # min_depth_col = fs.index('minDepth') if 'minDepth' in fs else None
                    # std_dev_col = fs.index('stdDev') if 'stdDev' in fs else None
                    # wn_20_pcnt_col = fs.index('percentWithin20PercentOfMedian') if 'percentWithin20PercentOfMedian' in fs else None
                    write_line(out, [
                        'chrom',
                        'start',
                        'end',
                        'size',
                        'gene',
                        'exon',
                        'strand',
                        'feature',
                        'biotype',
                        'transcript',
                        'trx_overlap',
                        'exome_overlap',
                        'cds_overlap',
                        # 'min_depth',
                        'avg_depth',
                        # 'median_depth',
                        # 'std_dev',
                        # 'within_20pct_of_median',
                    ] + ['at{}x'.format(ths) for ths in depth_thresholds])
                    continue

                chrom = fs[0]
                start, end = int(fs[1]), int(fs[2])
                region_size = end - start
                gene_name = fs[ebl.BedCols.GENE] if read_count_col != ebl.BedCols.GENE else '.'
                exon = fs[ebl.BedCols.EXON]
                strand = fs[ebl.BedCols.STRAND]
                feature = fs[ebl.BedCols.FEATURE]
                biotype = fs[ebl.BedCols.BIOTYPE]
                transcript = fs[ebl.BedCols.ENSEMBL_ID]
                transcript_overlap = fs[ebl.BedCols.TX_OVERLAP_PERCENTAGE]
                exome_overlap = fs[ebl.BedCols.EXON_OVERLAPS_PERCENTAGE]
                cds_overlap = fs[ebl.BedCols.CDS_OVERLAPS_PERCENTAGE]
                avg_depth = float(fs[mean_cov_col])
                # min_depth = int(fs[min_depth_col]) if min_depth_col is not None else '.'
                # std_dev = float(fs[std_dev_col]) if std_dev_col is not None else '.'
                # median_depth = int(fs[median_cov_col]) if median_cov_col is not None else '.'
                # rate_within_normal = float(fs[wn_20_pcnt_col]) if wn_20_pcnt_col is not None else '.'
                last_cov_col = max(mean_cov_col or 0, median_cov_col or 0, std_dev_col or 0, wn_20_pcnt_col or 0)
                rates_within_threshs = fs[last_cov_col+1:-1]

                write_line(out, [str(v) if v not in ['', None, '.'] else '.' for v in [
                    chrom,
                    start,
                    end,
                    region_size,
                    gene_name,
                    exon,
                    strand,
                    feature,
                    biotype,
                    transcript,
                    ((transcript_overlap + '%') if transcript_overlap not in ['', None, '.'] else '.'),
                    ((exome_overlap + '%') if exome_overlap not in ['', None, '.'] else '.'),
                    ((cds_overlap + '%') if cds_overlap not in ['', None, '.'] else '.'),
                    # min_depth,
                    avg_depth,
                    # median_depth,
                    # std_dev,
                    # rate_within_normal,
                ] + rates_within_threshs])

                total_regions_count += 1
                if total_regions_count > 0 and total_regions_count % 10000 == 0:
                    debug('  Processed {0:,} regions'.format(total_regions_count))
    debug('Total regions: ' + str(total_regions_count))
    return output_fpath

def main():
    description = 'Usage: ' + __file__ + ' hg19 [db.gtf]'

    options = [
        (['--debug'], dict(dest='debug', action='store_true', default=False)),
    ]
    parser = OptionParser(description=description)
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()

    if len(args) == 0:
        parser.exit(1, 'Please provide genome name as the first argument')
    logger.is_debug = opts.debug

    genome_name = args[0]

    if len(args) > 1:
        gtf_fpath = args[1]
    else:
        gtf_fpath = ebl.ensembl_gtf_fpath(genome_name)
    if not isfile(gtf_fpath):
        if not gtf_fpath.endswith('.gz'):
            gtf_fpath += '.gz'
    gtf_fpath = verify_file(gtf_fpath)
    debug('Reading the GTF database')
    db = gtf.get_gtf_db(gtf_fpath)

    debug('Reading biomart data')
    features_by_ens_id = read_biomart(genome_name)

    chroms = [c for c, l in ref.get_chrom_lengths(genome_name)]

    output_fpath = join(dirname(__file__), genome_name, 'ensembl.bed')
    unsorted_output_fpath = add_suffix(output_fpath, 'unsorted')
    debug('Processing features, writing to ' + unsorted_output_fpath)

    def _get(_rec, _key):
        val = _rec.attributes.get(_key)
        if val is None:
            return None
        assert len(val) == 1, (_key, str(val))
        return val[0]

    num_tx_not_in_biomart = 0
    num_tx_diff_gene_in_biomart = 0

    with open(unsorted_output_fpath, 'w') as out:
        out.write('\t'.join(ebl.BedCols.names[i] for i in ebl.BedCols.cols[:-4]) + '\n')

        for rec in db.all_features(order_by=('seqid', 'start', 'end')):
            if rec.featuretype == 'gene':
                continue
            if rec.chrom not in chroms:
                continue
            if rec.end - rec.start < 0:
                continue

            tx_id = _get(rec, 'transcript_id')
            gname = _get(rec, 'gene_name')
            tx_biotype = _get(rec, 'transcript_biotype')
            if not tx_biotype:
                tx_biotype = _get(rec, 'gene_biotype')
            tsl = _get(rec, 'transcript_support_level')
            hugo_gene = None

            biomart_rec = features_by_ens_id.get(tx_id)
            if not biomart_rec:
                if rec.featuretype == 'transcript':
                    num_tx_not_in_biomart += 1
            else:
                bm_gname = biomart_rec['Associated Gene Name']
                bm_tx_biotype = biomart_rec['Transcript type']
                bm_tsl = biomart_rec.get('Transcript Support Level (TSL)')
                hugo_gene = biomart_rec['HGNC symbol']
                if bm_gname != gname:
                    if rec.featuretype == 'transcript':
                        num_tx_diff_gene_in_biomart += 1
                    continue
                tx_biotype = bm_tx_biotype
                tsl = bm_tsl.split()[0].replace('tsl', '') if bm_tsl else None

            fs = [None] * len(ebl.BedCols.cols[:-3])
            if not rec.chrom.startswith('chr'):
                rec.chrom = 'chr' + rec.chrom.replace('MT', 'M')
            fs[:6] = [rec.chrom,
                      str(rec.start - 1),
                      str(rec.end),
                      gname,
                      rec.attributes.get('exon_number', ['.'])[0],
                      rec.strand]
            fs[ebl.BedCols.FEATURE] = rec.featuretype or '.'
            fs[ebl.BedCols.BIOTYPE] = tx_biotype or '.'
            fs[ebl.BedCols.ENSEMBL_ID] = tx_id or '.'
            # fs[ebl.BedCols.REFSEQ_ID] = refseq_id or '.'
            # fs[ebl.BedCols.IS_CANONICAL] = 'canonical' if refseq_id in canonical_transcripts_ids else ''
            fs[ebl.BedCols.TSL] = tsl or '.'
            fs[ebl.BedCols.HUGO] = hugo_gene or '.'
            # fs[ebl.BedCols.names[ensembl.BedCols.GC]] = gc
            out.write('\t'.join(fs) + '\n')

    if num_tx_not_in_biomart:
        warn(str(num_tx_not_in_biomart) + ' transcripts not found in biomart')
    if num_tx_diff_gene_in_biomart:
        warn(str(num_tx_diff_gene_in_biomart) + ' transcripts have a different gene name in biomart')

    debug('Sorting results')
    sort_bed(unsorted_output_fpath, output_fpath, fai_fpath=ref.get_fai(genome_name), genome=genome_name)
    os.remove(unsorted_output_fpath)
    bgzip_and_tabix(output_fpath)