Code example #1
def _resolve_ambiguities(annotated_by_loc_by_gene, chrom_order):
    annotated = []
    for (chrom, start,
         end), overlaps_by_gene in annotated_by_loc_by_gene.iteritems():
        for g_name, overlaps in overlaps_by_gene.iteritems():
            consensus = Region(chrom,
                               start,
                               end,
                               ref_chrom_order=chrom_order.get(chrom),
                               gene_symbol=g_name,
                               exon='',
                               strand='',
                               feature='',
                               biotype='')
            for r, overlap_size in overlaps:
                if consensus.strand:
                    # RefSeq may list exons from different strands under the same
                    # gene name (e.g. CTAGE4 for hg19), and such a pair of exons can
                    # overlap a single region, so we keep the strand of the first
                    # exon and only warn when a later region disagrees.
                    if consensus.strand != r.strand:
                        warn(
                            'Warning: different strands between consensus and next region (gene: '
                            + g_name + ')')
                else:
                    consensus.strand = r.strand
                consensus.exon = merge_fields(consensus.exon, r.exon)
                consensus.feature = merge_fields(consensus.feature, r.feature)
                consensus.biotype = merge_fields(consensus.biotype, r.biotype)
                consensus.total_merged += 1

            annotated.append(consensus)

    return annotated
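
The merge_fields helper is not shown here; a minimal sketch of what it presumably does — combining two comma-separated annotation fields while keeping unique non-empty values — could look like this (hypothetical implementation):

def merge_fields(consensus_field, new_field):
    # Hypothetical: combine comma-separated values, dropping empties and duplicates
    merged = []
    for v in consensus_field.split(',') + new_field.split(','):
        if v and v not in merged:
            merged.append(v)
    return ','.join(merged)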
Code example #2
def check_genome_resources(cnf):
    if cnf.genome is None:
        critical('Please specify the genome build (one of those available in ' +
                 cnf.sys_cnf +
                 ') using the --genome option (e.g., --genome hg38).')

    if not cnf.genomes:
        critical('"genomes" section is not specified in system config ' +
                 cnf.sys_cnf)

    info('Genome: ' + str(cnf.genome.name))

    for key in cnf.genome.keys():
        if key != 'name' and isinstance(cnf.genome[key], basestring):
            cnf.genome[key] = adjust_system_path(cnf.genome[key])

            if not verify_obj_by_path(cnf.genome[key], key, silent=True):
                # Fall back to a gzipped version of the resource if one exists
                gz_fpath = cnf.genome[key] + '.gz'
                if not cnf.genome[key].endswith('.gz') and verify_file(
                        gz_fpath, silent=True):
                    cnf.genome[key] = gz_fpath

    if not cnf.genome.features or not cnf.genome.bed_annotation_features or not cnf.genome.cds:
        warn('Warning: "features", "bed_annotation_features" and "cds" must be '
             'specified in the system config (' + cnf.sys_cnf + ').')

    if not cnf.transcripts_fpath:
        cnf.transcripts_fpath = get_canonical_transcripts(cnf.genome.name,
                                                          ensembl=True)
Code example #3
def verify_vcf(vcf_fpath, silent=False, is_critical=False):
    if not verify_file(vcf_fpath, silent=silent, is_critical=is_critical):
        return None
    debug('File ' + vcf_fpath + ' exists and is not empty')
    vcf = open_gzipsafe(vcf_fpath)
    debug('File ' + vcf_fpath + ' opened')
    l = next(vcf, None)
    if l is None:
        (critical if is_critical else err)('Error: cannot read the VCF file ' + vcf_fpath)
        return None
    if not l.startswith('##fileformat=VCF'):
        (critical if is_critical else err)('Error: VCF must start with ##fileformat=VCF ' + vcf_fpath)
        return None

    try:
        reader = vcf_parser.Reader(vcf)
    except Exception:
        err('Error: cannot open the VCF file ' + vcf_fpath)
        if is_critical: raise
    else:
        debug('File ' + vcf_fpath + ' opened as VCF')
        try:
            rec = next(reader)
        except (IndexError, ValueError):
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug('IndexError/ValueError parsing VCF file ' + vcf_fpath)
            if is_critical: raise
        except StopIteration:
            debug('No records in the VCF file ' + vcf_fpath)
            if not silent:
                warn('VCF file ' + vcf_fpath + ' has no records.')
            return vcf_fpath
        except Exception:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug('Other error parsing VCF file ' + vcf_fpath)
            if is_critical: raise
        else:
            debug('A record was read from the VCF file ' + vcf_fpath)
            return vcf_fpath
    finally:
        vcf.close()
Code example #4
def finalize_one(cnf, qc_report_fpath, qc_plots_fpaths):
    if qc_report_fpath:
        info('Saved QC report to ' + qc_report_fpath)
    if qc_plots_fpaths:
        info('Saved QC plots are in: ' + ', '.join(qc_plots_fpaths))
    elif not verify_module('matplotlib'):
        warn('Warning: QC plots were not generated because matplotlib is not installed.')
Code example #5
def find_fastq_pairs_by_sample_names(fastq_fpaths, sample_names):
    fastq_by_sn = OrderedDict()

    for sn in sample_names:
        sn_fastq_fpaths = sorted(
            [f for f in fastq_fpaths if basename(f).startswith(sn + '_R')])
        if len(sn_fastq_fpaths) == 0:
            err('Error: no fastq found for ' + sn)
            fastq_by_sn[sn] = None
        elif len(sn_fastq_fpaths) > 2:
            critical('Error: more than 2 fastq files starting with ' + sn +
                     '_R: ' + ', '.join(sn_fastq_fpaths))
        elif len(sn_fastq_fpaths) == 1:
            warn('Warning: only a single fastq file was found for ' + sn +
                 '. Treating as single-end reads.')
            fastq_by_sn[sn] = [
                verify_file(sn_fastq_fpaths[0],
                            description='sn_fastq_fpaths[0] for ' + str(sn)),
                None
            ]
        else:
            fastq_by_sn[sn] = [
                verify_file(fpath,
                            description='fpath from sn_fastq_fpaths for ' +
                            str(sn)) for fpath in sn_fastq_fpaths
            ]

    return fastq_by_sn
Code example #6
def parse_variants(fpath):
    sample_column_name = 'Sample'
    gene_column_name = 'Gene'

    genes_per_sample = dict()
    with open(fpath) as f:
        # rstrip the newline so the last column name does not carry a trailing '\n'
        header = f.readline().rstrip('\n').split('\t')
        if sample_column_name not in header:
            warn('"' + sample_column_name + '" is not found in ' + fpath +
                 ' header, skipping this file!')
            return genes_per_sample
        else:
            sample_column_id = header.index(sample_column_name)
        if gene_column_name not in header:
            warn('"' + gene_column_name + '" is not found in ' + fpath +
                 ' header, skipping this file!')
            return genes_per_sample
        else:
            gene_column_id = header.index(gene_column_name)
        for line in f:
            line = line.rstrip('\n').split('\t')
            sample = line[sample_column_id]
            gene = line[gene_column_id]
            if sample not in genes_per_sample:
                genes_per_sample[sample] = set()
            genes_per_sample[sample].add(gene)
    info('Found info for %d samples:' % len(genes_per_sample))
    for k, v in genes_per_sample.items():
        info('\t%s (%d unique genes)' % (k, len(v)))
    return genes_per_sample
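
A usage sketch with a hypothetical tab-separated input (only the 'Sample' and 'Gene' columns matter to the parser):

# variants.tsv (made-up contents):
#   Sample  Gene    Effect
#   S1      TP53    missense
#   S1      EGFR    frameshift
#   S2      TP53    missense
genes_per_sample = parse_variants('variants.tsv')
# -> {'S1': set(['TP53', 'EGFR']), 'S2': set(['TP53'])}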
Code example #7
def merge_vcfs(cnf, vcf_fpath_by_sname, combined_vcf_fpath):
    if cnf.reuse_intermediate and isfile(
            combined_vcf_fpath + '.gz') and verify_vcf(combined_vcf_fpath +
                                                       '.gz'):
        info(combined_vcf_fpath + '.gz exists, reusing')
        return combined_vcf_fpath + '.gz'

    bcftools = get_system_path(cnf, 'bcftools')
    if not bcftools:
        info('bcftools is not found, skipping merging VCFs')
        return None

    cmdl = '{bcftools} merge --force-samples '.format(**locals())
    for sample, vcf_fpath in vcf_fpath_by_sname.iteritems():
        if vcf_fpath:
            cmdl += ' ' + vcf_fpath + ' '
    cmdl += ' -o ' + combined_vcf_fpath

    res = call(cnf,
               cmdl,
               output_fpath=combined_vcf_fpath,
               stdout_to_outputfile=False,
               exit_on_error=False)
    if res:
        info('Joined VCFs, saved into ' + combined_vcf_fpath)
        if isfile(combined_vcf_fpath + '.tx.idx'):
            try:
                os.remove(combined_vcf_fpath + '.tx.idx')
            except OSError:
                # Failing to remove the stale index is not critical
                err(traceback.format_exc())
        return bgzip_and_tabix(cnf, combined_vcf_fpath)
    else:
        warn('Could not join VCFs')
        return None
Code example #8
    def __init__(self, dirpath, az_prjname_by_subprj, samplesheet=None):
        info('Parsing the NextSeq500 project structure')
        self.kind = 'nextseq500'
        DatasetStructure.__init__(self,
                                  dirpath,
                                  az_prjname_by_subprj,
                                  samplesheet=samplesheet)
        info('az_prjname_by_subprj: ' + str(az_prjname_by_subprj))

        verify_dir(self.unaligned_dirpath, is_critical=True)

        for pname, project in self.project_by_name.items():
            az_proj_name = az_prjname_by_subprj.get(pname) if not isinstance(
                az_prjname_by_subprj, basestring) else az_prjname_by_subprj
            if az_proj_name is None:
                if len(self.project_by_name) > 1:
                    warn(
                        'Warning: cannot match subproject ' + pname +
                        ' to a project name and JIRA case. '
                        'Please follow the SOP for multiple-project runs: http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Pre+Processing+QC+Reporting'
                    )
                    continue
                az_proj_name = az_prjname_by_subprj.values()[0]

            project.set_dirpath(self.unaligned_dirpath, az_proj_name)
            for sample in project.sample_by_name.values():
                sample.source_fastq_dirpath = project.dirpath
                sample.set_up_out_dirs(project.fastq_dirpath,
                                       project.fastqc_dirpath,
                                       project.downsample_targqc_dirpath)

        self.basecall_stat_html_reports = self.__get_basecall_stats_reports()

        self.get_fastq_regexp_fn = get_nextseq500_regexp
Code example #9
def markdup_bam(cnf, in_bam_fpath, bammarkduplicates=None):
    """Perform non-stream based deduplication of BAM input files using biobambam.
    """
    if not bammarkduplicates:
        bammarkduplicates = get_system_path(cnf, 'bammarkduplicates')
        if not bammarkduplicates:
            warn('No biobambam bammarkduplicates, can\'t mark duplicates.')
            return None

    out_bam_fpath = add_suffix(in_bam_fpath, 'markdup')
    tmp_fpath = join(cnf.work_dir,
                     splitext_plus(basename(in_bam_fpath))[0] + '_markdup')
    safe_mkdir(dirname(tmp_fpath))
    cmdline = (
        '{bammarkduplicates} tmpfile={tmp_fpath} I={in_bam_fpath} O={out_bam_fpath}'
    ).format(**locals())
    res = call(cnf,
               cmdline,
               output_fpath=out_bam_fpath,
               stdout_to_outputfile=False,
               exit_on_error=False)
    if res:
        return out_bam_fpath
    else:
        return None
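
A usage sketch (the input path is made up, and add_suffix presumably inserts 'markdup' before the extension):

dedup_bam_fpath = markdup_bam(cnf, '/data/sample.sorted.bam')
# -> '/data/sample.sorted.markdup.bam' on success,
#    None if biobambam is missing or the call failed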
Code example #10
@contextmanager  # requires: from contextlib import contextmanager
def tmpdir():
    dirpath = make_tmpdir()
    try:
        yield dirpath
    finally:
        try:
            shutil.rmtree(dirpath)
        except OSError:
            warn('Warning: cannot clean up temporary dir ' + dirpath)
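
Assuming the @contextmanager decorator noted above (the yield-in-try/finally pattern implies it), a caller would use it as:

with tmpdir() as tmp_dirpath:
    work_fpath = join(tmp_dirpath, 'intermediate.bed')
    # ... write to and read from work_fpath ...
# tmp_dirpath is removed on exit, even if the block raised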
Code example #11
    def __find_unaligned_dir(self):
        unaligned_dirpath = join(self.dirpath, 'Unalign')
        if not verify_dir(unaligned_dirpath,
                          description='"Unalign" directory',
                          silent=True):
            warn('No unalign directory')
            unaligned_dirpath = None
        return unaligned_dirpath
Code example #12
@contextmanager  # requires: from contextlib import contextmanager
def workdir(cnf):
    if cnf.work_dir:
        verify_dir(cnf.work_dir, is_critical=True)
        yield cnf.work_dir
    else:
        cnf.work_dir = make_tmpdir()
        try:
            yield cnf.work_dir
        finally:
            # Clean up the auto-created dir even if the caller raised
            try:
                shutil.rmtree(cnf.work_dir)
            except OSError:
                warn('Warning: cannot clean up temporary dir ' + cnf.work_dir)
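
Usage mirrors tmpdir above: an explicitly configured cnf.work_dir is kept, while an auto-created temporary one is cleaned up on exit (run_step is a hypothetical downstream call):

with workdir(cnf) as work_dirpath:
    run_step(cnf, work_dirpath)  # hypothetical caller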
Code example #13
    def __init__(self, dirpath, az_prjname_by_subprj=None, samplesheet=None):
        info('Parsing the HiSeq project structure')
        self.kind = 'hiseq'
        DatasetStructure.__init__(self,
                                  dirpath,
                                  az_prjname_by_subprj,
                                  samplesheet=samplesheet)

        verify_dir(self.unaligned_dirpath, is_critical=True)

        self.basecall_stat_html_reports = self.__get_basecall_stats_reports()

        for pname, project in self.project_by_name.items():
            proj_dirpath = join(
                self.unaligned_dirpath, 'Project_' + pname.replace(
                    ' ', '-'))  #.replace('-', '_').replace('.', '_'))

            az_proj_name = az_prjname_by_subprj.get(pname) if not isinstance(
                az_prjname_by_subprj, basestring) else az_prjname_by_subprj
            if az_proj_name is None:
                if len(self.project_by_name) > 1:
                    warn(
                        'Warning: cannot match subproject ' + pname +
                        ' to a project name and JIRA case. '
                        'Please follow the SOP for multiple-project runs: http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Pre+Processing+QC+Reporting'
                    )
                    continue
                az_proj_name = az_prjname_by_subprj.values()[0]

            project.set_dirpath(proj_dirpath, az_proj_name)
            for sname, sample in project.sample_by_name.items():
                sample.source_fastq_dirpath = join(
                    project.dirpath, 'Sample_' + sname.replace(
                        ' ', '-'))  #.replace('-', '_').replace('.', '_'))
                sample.set_up_out_dirs(project.fastq_dirpath,
                                       project.fastqc_dirpath,
                                       project.downsample_targqc_dirpath)

            basecalls_symlink = join(project.dirpath, 'BaseCallsReports')
            if not exists(basecalls_symlink):
                info('Creating BaseCalls symlink ' + self.basecalls_dirpath +
                     ' -> ' + basecalls_symlink)
                try:
                    os.symlink(self.basecalls_dirpath, basecalls_symlink)
                except OSError:
                    err('Cannot create symlink')
                    traceback.print_exc()
                else:
                    info('Created')
            if exists(basecalls_symlink):
                self.basecalls_dirpath = basecalls_symlink

        self.get_fastq_regexp_fn = get_hiseq_regexp
Code example #14
def get_regions_coverage(cnf, samples):
    cov_thresholds = [1, 5, 10, 15, 20, 25, 30, 50, 100]
    depths_by_pos = defaultdict(lambda: [0] * len(samples))
    info()
    info('Coverage to bedgraph for ' + cnf.chrom)
    coverage_fpaths = []
    for index, sample in enumerate(samples):
        coverage_fpath = join(cnf.work_dir,
                              sample.name + '_' + cnf.chrom + '.bedgraph')
        coverage_fpath = get_bedgraph_coverage(cnf,
                                               sample.bam,
                                               chr_len_fpath=cnf.chr_len_fpath,
                                               bed_fpath=cnf.bed,
                                               output_fpath=coverage_fpath,
                                               exit_on_error=False)
        if coverage_fpath and verify_file(coverage_fpath):
            coverage_fpaths.append(coverage_fpath)
            for line in open(coverage_fpath):
                if line.startswith('#'):
                    continue
                chrom, start, end, depth = line.split('\t')
                start, end, depth = map(int, (start, end, depth))
                for pos in xrange(start, end):
                    depths_by_pos[pos][index] = depth

    info()
    if not coverage_fpaths:
        warn(cnf.chrom + ' is not covered in any of the samples')
        return None

    info()
    info('Writing coverage for ' + cnf.chrom)
    write_coverage(cnf, cnf.output_dir, cnf.chrom, depths_by_pos,
                   cov_thresholds)
    for index, sample in enumerate(samples):
        info('Writing coverage for ' + sample.name + ', ' + cnf.chrom)
        sample_output_dirpath = join(cnf.output_dir, sample.name)
        output_fpath = join(sample_output_dirpath, cnf.chrom + '.txt.gz')
        if cnf.reuse_intermediate and verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            continue
        write_coverage(cnf,
                       sample_output_dirpath,
                       cnf.chrom,
                       depths_by_pos,
                       cov_thresholds,
                       sample_index=index)
        if not verify_file(output_fpath, silent=True):
            warn(sample.name + ' has no coverage at chromosome ' + cnf.chrom)
    return depths_by_pos
Code example #15
def create_jbrowse_symlink(genome, project_name, sample, file_fpath):
    jbrowse_data_path, _, _ = set_folders(genome)
    jbrowse_dirpath = join(jbrowse_data_path, 'tracks')
    jbrowse_project_dirpath = join(jbrowse_dirpath, project_name)
    base, ext = splitext_plus(file_fpath)
    if ext in ['.tbi', '.bai']:
        base, ext2 = splitext_plus(base)
        ext = ext2 + ext
    sym_link = join(jbrowse_project_dirpath, sample + ext)
    if not verify_dir(jbrowse_project_dirpath):
        safe_mkdir(jbrowse_project_dirpath)
    if isfile(file_fpath) and not isfile(sym_link):
        try:
            os.symlink(file_fpath, sym_link)
        except OSError:
            warn(traceback.format_exc())
    if isfile(sym_link):
        change_permissions(sym_link)
    return sym_link
Code example #16
def combine_vcfs(cnf,
                 vcf_fpath_by_sname,
                 combined_vcf_fpath,
                 additional_parameters=''):
    gatk = get_java_tool_cmdline(cnf, 'gatk')
    if not gatk:
        info('GATK is not found, skipping merging VCFs')
        return None

    cmdl = '{gatk} -T CombineVariants -R {cnf.genome.seq} {additional_parameters}'.format(
        **locals())
    for s_name, vcf_fpath in vcf_fpath_by_sname.items():
        if vcf_fpath:
            cmdl += ' --variant:' + s_name + ' ' + vcf_fpath
    if ' --variant:' not in cmdl:
        err('No VCFs to combine')
        return None

    if cnf.reuse_intermediate and isfile(
            combined_vcf_fpath + '.gz') and verify_vcf(combined_vcf_fpath +
                                                       '.gz'):
        info(combined_vcf_fpath + '.gz exists, reusing')
        return combined_vcf_fpath + '.gz'

    cmdl += ' -o ' + combined_vcf_fpath
    res = call(cnf,
               cmdl,
               output_fpath=combined_vcf_fpath,
               stdout_to_outputfile=False,
               exit_on_error=False)
    if res:
        info('Joined VCFs, saved into ' + combined_vcf_fpath)
        if isfile(combined_vcf_fpath + '.tx.idx'):
            try:
                os.remove(combined_vcf_fpath + '.tx.idx')
            except OSError:
                err(traceback.format_exc())
                info()
        return bgzip_and_tabix(cnf, combined_vcf_fpath)
    else:
        warn('Could not join VCFs')
        return None
Code example #17
    def __init__(self, dirpath, az_prjname_by_subprj, samplesheet=None):
        info('Parsing the MiSeq project structure')
        self.kind = 'miseq'
        DatasetStructure.__init__(self,
                                  dirpath,
                                  az_prjname_by_subprj,
                                  samplesheet=samplesheet)

        base_dirpath = self.unaligned_dirpath
        if not verify_dir(base_dirpath, silent=True):
            base_dirpath = self.basecalls_dirpath
        verify_dir(base_dirpath, description='Source fastq dir')

        for pname, project in self.project_by_name.items():
            proj_dirpath = join(base_dirpath, pname)
            if not verify_dir(proj_dirpath, silent=True):
                proj_dirpath = base_dirpath

            az_proj_name = az_prjname_by_subprj.get(pname) if not isinstance(
                az_prjname_by_subprj, basestring) else az_prjname_by_subprj
            if az_proj_name is None:
                if len(self.project_by_name) > 1:
                    warn(
                        'Warning: cannot match subproject ' + pname +
                        ' to a project name and JIRA case. '
                        'Please follow the SOP for multiple-project runs: http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Pre+Processing+QC+Reporting'
                    )
                    continue
                az_proj_name = az_prjname_by_subprj.values()[0]

            project.set_dirpath(proj_dirpath, az_proj_name)
            for sample in project.sample_by_name.values():
                sample.source_fastq_dirpath = project.dirpath
                sample.set_up_out_dirs(project.fastq_dirpath,
                                       project.fastqc_dirpath,
                                       project.downsample_targqc_dirpath)

        self.basecall_stat_html_reports = []

        self.get_fastq_regexp_fn = get_hiseq4000_miseq_regexp
Code example #18
def remove_dups_picard(cnf, bam_fpath):
    picard = get_system_path(cnf, 'java', 'picard')
    if not picard:
        critical('No picard in the system')

    info('Running picard dedup for "' + basename(bam_fpath) + '"')

    dup_metrics_txt = join(cnf.work_dir, 'picard_dup_metrics.txt')
    output_fpath = intermediate_fname(cnf, bam_fpath, 'pcd_dedup')

    cmdline = '{picard} MarkDuplicates' \
              ' I={bam_fpath}' \
              ' O={output_fpath}' \
              ' METRICS_FILE={dup_metrics_txt}' \
              ' REMOVE_DUPLICATES=True' \
              ' VALIDATION_STRINGENCY=LENIENT'
    res = call(cnf,
               cmdline.format(**locals()),
               output_fpath=output_fpath,
               stdout_to_outputfile=False,
               exit_on_error=False)

    if res != output_fpath:  # error occurred, try to correct BAM and restart
        warn('Picard deduplication failed for "' + basename(bam_fpath) +
             '". Fixing BAM and restarting Picard...')
        bam_fpath = _fix_bam_for_picard(cnf, bam_fpath)
        res = call(cnf,
                   cmdline.format(**locals()),
                   output_fpath=output_fpath,
                   stdout_to_outputfile=False,
                   exit_on_error=False)

    if res == output_fpath:
        dup_rate = _parse_picard_dup_report(dup_metrics_txt)
        assert dup_rate is None or dup_rate <= 1.0, str(dup_rate)
        info('Duplication rate (picard): ' + str(dup_rate))
        return output_fpath
    else:
        return None
Code example #19
def get_sample_column_index(vcf_fpath, samplename, suppress_warn=False):
    vcf_header_samples = read_sample_names_from_vcf(vcf_fpath)

    if len(vcf_header_samples) == 0:
        return None

    if len(vcf_header_samples) == 1:
        if vcf_header_samples[0].lower() == samplename.lower():
            return 0
        else:
            return None

    name = next((name for name in vcf_header_samples if name.lower() == samplename.lower()), None)
    if name is None:
        if not suppress_warn:
            warn('No sample ' + samplename + ' in header with samples ' + ', '.join(vcf_header_samples) + ' for ' + vcf_fpath)
        # Distinguish the case where every sample name in the header is 'none'
        if all(n.lower() == 'none' for n in vcf_header_samples):
            err('All sample names are None.')
        return None
    return vcf_header_samples.index(name)
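
A usage sketch (file and sample names are made up); the match is case-insensitive, and None means the sample could not be resolved:

# Suppose input.vcf has header samples ['Tumor', 'Normal']:
idx = get_sample_column_index('input.vcf', 'tumor')    # -> 0 (case-insensitive match)
idx = get_sample_column_index('input.vcf', 'missing')  # -> warns, returns None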
Code example #20
def detect_sys_cnf_by_location():
    if is_uk():
        res = defaults['sys_cnfs']['uk']
    elif is_sweden():
        res = defaults['sys_cnfs']['sweden']
    elif is_china():
        res = defaults['sys_cnfs']['china']
    elif is_us():
        res = defaults['sys_cnfs']['us']
    elif is_cloud():
        res = defaults['sys_cnfs']['cloud']
    elif is_local():
        res = defaults['sys_cnfs']['local']
    elif is_ace():
        res = defaults['sys_cnfs']['ace']
    elif is_chihua():
        res = defaults['sys_cnfs']['chihua']
    else:
        warn('Warning: could not detect location by hostname: ' +
             socket.gethostname() + '. Using local')
        res = defaults['sys_cnfs']['local']
    return res
Code example #21
def del_jobs(cnf, jobs_running):
    # Jobs that are neither finished nor marked "don't wait" still occupy the queue
    job_ids_to_del = [j.job_id for j in jobs_running if not j.is_done and not j.not_wait]
    if job_ids_to_del:
        qdel = get_system_path(cnf, 'qdel', is_critical=False)
        command = ' '.join(job_ids_to_del)
        if qdel:
            res = call(cnf, qdel + ' ' + command, exit_on_error=False, silent=not cnf.debug)
            if res == 0:
                info('All running jobs for this project have been deleted from the queue.')
            else:
                warn('Can\'t run qdel. Please kill the remaining jobs manually using the following command:')
                warn('  qdel ' + command)
        else:
            warn('Can\'t find qdel. Please kill the remaining jobs manually using the following command:')
            warn('  qdel ' + command)
        info()
Code example #22
def parse_response(res, mut):
    ok = True
    for f in ['allele_origin', 'clinical_significance', 'genomic_coordinates']:
        if f not in res:
            warn('No ' + f + ' in SolveBio for mutation ' + str(mut))
            ok = False
    if not ok:
        return None

    rec = SolvebioRecord()

    rec.clinsig = res['clinical_significance']
    if rec.clinsig.lower() == 'other':
        rec.clinsig = 'Uncertain'

    coords = res['genomic_coordinates']
    rec.url = 'https://astrazeneca.solvebio.com/variant/GRCH37-{chrom}-{start}-{stop}-{alt}'.format(
        chrom=coords['chromosome'],
        start=coords['start'],
        stop=coords['stop'],
        alt=res['allele'])

    return rec
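
A sketch of the response dict this expects and the URL it builds (field names come from the code above; the values are invented):

res = {
    'allele_origin': 'Somatic',
    'clinical_significance': 'Pathogenic',
    'genomic_coordinates': {'chromosome': '7', 'start': 140453136, 'stop': 140453136},
    'allele': 'T',
}
rec = parse_response(res, mut='BRAF V600E')
# rec.clinsig == 'Pathogenic'
# rec.url == 'https://astrazeneca.solvebio.com/variant/GRCH37-7-140453136-140453136-T'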
Code example #23
def markdup_sam(cnf, in_sam_fpath, samblaster=None):
    """Perform non-stream based deduplication of SAM input files using samblaster.
    """
    if not samblaster:
        samblaster = get_system_path(cnf, 'samblaster')
        if not samblaster:
            warn('No samblaster, can\'t mark duplicates.')
            return None

    out_sam_fpath = add_suffix(in_sam_fpath, 'markdup')
    tmp_fpath = join(cnf.work_dir,
                     splitext_plus(basename(in_sam_fpath))[0] + '_markdup')
    safe_mkdir(dirname(tmp_fpath))
    cmdline = '{samblaster} -i {in_sam_fpath} -o {out_sam_fpath}'.format(
        **locals())
    res = call(cnf,
               cmdline,
               output_fpath=out_sam_fpath,
               stdout_to_outputfile=False,
               exit_on_error=False)
    if res:
        return out_sam_fpath
    else:
        return None
Code example #24
def flag_stat(cnf, bam):
    output_fpath = join(cnf.work_dir, basename(bam) + '_flag_stats')
    cmdline = 'flagstat {bam}'.format(**locals())
    call_sambamba(cnf,
                  cmdline,
                  output_fpath=output_fpath,
                  bam_fpath=bam,
                  command_name='flagstat')
    stats = dict()
    with open(output_fpath) as f:
        lines = f.readlines()
        for stat, fun in [
            ('total', number_of_reads),
            ('duplicates', number_of_dup_reads),  # '-f 1024'
            ('mapped', number_of_mapped_reads),  # '-F 4'
            ('properly paired', number_of_properly_paired_reads)  # '-f 2'
        ]:
            try:
                val = next(l.split()[0] for l in lines if stat in l)
            except StopIteration:
                warn('Cannot extract ' + stat + ' from flagstat output ' +
                     output_fpath + '. Trying samtools view -c...')
                val = None
            else:
                try:
                    val = int(val)
                except ValueError:
                    warn('Cannot parse value ' + str(val) + ' from ' + stat +
                         ' from flagstat output ' + output_fpath +
                         '. Trying samtools view -c...')
                    val = None
            if val is not None:
                stats[stat] = val
            else:
                stats[stat] = fun(cnf, bam)
    return stats
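
For context, flagstat output lines look roughly like this (exact wording varies between samtools/sambamba versions); the parser matches each stat name as a substring and takes the first whitespace-separated token as the count:

    2000000 + 0 in total (QC-passed reads + QC-failed reads)
    100000 + 0 duplicates
    1950000 + 0 mapped (97.50%:N/A)
    1800000 + 0 properly paired (90.00%:N/A)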
Code example #25
def create_oncoprints_link(cnf, bcbio_structure, project_name=None):
    if is_us():
        loc = exposing.us
    # elif is_uk(): loc = exposing.uk
    else:
        # Oncoprints links are only generated for the US location
        return None

    if not bcbio_structure.variant_callers:
        info('No variant calling performed, not generating Oncoprints')
        return None
    clinical_report_caller = \
        bcbio_structure.variant_callers.get('vardict') or \
        bcbio_structure.variant_callers.get('vardict-java')
    if not clinical_report_caller:
        err('Warning: vardict is not in the variant callers list, not generating Oncoprints')
        return None

    step_greetings('Creating Oncoprints link')
    zhongwu_data_query_dirpath = '/home/kdld047/public_html/cgi-bin/TS'
    if not isdir(zhongwu_data_query_dirpath):
        warn('Data Query directory ' + zhongwu_data_query_dirpath + ' does not exist.')
        return None

    vardict_txt_fname = variant_filtering.mut_fname_template.format(caller_name=clinical_report_caller.name)
    vardict_txt_fpath = join(bcbio_structure.var_dirpath, vardict_txt_fname)
    cnf.mutations_fpath = add_suffix(vardict_txt_fpath, variant_filtering.mut_pass_suffix)

    cnf.seq2c_tsv_fpath = bcbio_structure.seq2c_fpath

    samples = sorted(bcbio_structure.samples)
    cnf.project_name = project_name or bcbio_structure.project_name or basename(cnf.output_dir)
    study_name = re.sub('[\.\-:&]', '_', cnf.project_name)

    check_genome_resources(cnf)

    data_query_dirpath = join(loc.dirpath, 'DataQueryTool')

    data_fpath = join(zhongwu_data_query_dirpath, study_name + '.data.txt')
    info_fpath = join(zhongwu_data_query_dirpath, study_name + '.info.txt')
    altered_genes = print_data_txt(cnf, cnf.mutations_fpath, cnf.seq2c_tsv_fpath, samples, data_fpath)
    if not altered_genes:
        err('No altered genes in ' + cnf.mutations_fpath + ' or ' + cnf.seq2c_tsv_fpath + ', not generating Oncoprints.')
        return None

    print_info_txt(cnf, samples, info_fpath)

    data_ext_fpath = data_fpath.replace('/home/', '/users/')
    info_ext_fpath = info_fpath.replace('/home/', '/users/')

    # optional:
    data_symlink = join(data_query_dirpath, study_name + '.data.txt')
    info_symlink = join(data_query_dirpath, study_name + '.info.txt')
    (symlink_to_ngs if is_us() else local_symlink)(data_ext_fpath, data_symlink)
    (symlink_to_ngs if is_us() else local_symlink)(info_ext_fpath, info_symlink)

    properties_fpath = join(zhongwu_data_query_dirpath, 'DataQuery.properties')
    add_data_query_properties(cnf, study_name, properties_fpath, data_ext_fpath, info_ext_fpath)

    genes = '%0D%0A'.join(altered_genes)
    data_query_url = join(loc.website_url_base, 'DataQueryTool', 'DataQuery.pl?'
        'analysis=oncoprint&'
        'study={study_name}&'
        'gene={genes}&'
        'order=on&'
        'freq=50&'
        'nocheckgenes=true&'
        'submit=Submit'
        .format(**locals()))

    info()
    info('Information about the study was added to the Data Query Tool; URL: ' + data_query_url)
    return data_query_url
Code example #26
def write_vcfs(cnf, var_samples, output_dirpath, caller_name,
               vcf2txt_res_fpath, mut_res_fpath, threads_num):
    info('')
    info('-' * 70)
    info('Writing VCFs')

    variants_by_sample = defaultdict(dict)
    mutations_by_sample = defaultdict(set)

    info('Collecting passed variants...')
    with open(mut_res_fpath) as fh:
        for l in fh:
            ts = l.split('\t')
            s_name, chrom, pos, alt = ts[0], ts[1], ts[2], ts[5]
            mutations_by_sample[s_name].add((chrom, pos, alt))

    info('Collecting all vcf2txt variants...')
    with open(vcf2txt_res_fpath) as vcf2txt_f:
        pass_col = None
        for l in vcf2txt_f:
            if l.startswith('Sample'):
                pass_col = l.split('\t').index('PASS')
            else:
                ts = l.split('\t')
                s_name, chrom, pos, alt = ts[0], ts[1], ts[2], ts[5]
                filt = ts[pass_col]
                variants_by_sample[s_name][(chrom, pos, alt)] = filt

    info()

    info('Writing filtered VCFs in ' + str(threads_num) + ' threads')
    try:
        Parallel(n_jobs=threads_num) \
            (delayed(postprocess_vcf) \
                (None,
                 cnf.work_dir,
                 var_sample,
                 caller_name,
                 variants_by_sample[var_sample.name],
                 mutations_by_sample[var_sample.name],
                 vcf2txt_res_fpath)
                 for var_sample in var_samples)
        info('Done postprocessing all filtered VCFs.')

    except OSError:
        err(traceback.format_exc())
        warn('Parallel writing failed; running sequentially (1 thread) instead')
        try:
            Parallel(n_jobs=1) \
                (delayed(postprocess_vcf) \
                    (None,
                     cnf.work_dir,
                     var_sample,
                     caller_name,
                     variants_by_sample[var_sample.name],
                     mutations_by_sample[var_sample.name],
                     vcf2txt_res_fpath)
                     for var_sample in var_samples)
            info('Done postprocessing all filtered VCFs.')

        except OSError:
            err(traceback.format_exc())
            err('Cannot postprocess VCF - skipping')
            err()

    info('Filtered VCFs are written.')
Code example #27
def extract_gene_names_and_filter_exons(cnf, target_bed, features_bed,
                                        features_no_genes_bed):
    gene_key_set = set()
    gene_key_list = []

    info()
    info('Getting gene list')

    # if genes_fpath:
    #     with open(genes_fpath) as f:
    #         gene_key_list = [g.strip() for g in f.read().split('\n') if g]
    #         gene_key_set = set(gene_key_list)
    #     info('Using genes from ' + genes_fpath + ', filtering exons and amplicons with this genes.')
    #     if target_bed:
    #         target_bed = filter_bed_with_gene_set(cnf, target_bed, gene_key_set)
    #     if exons_bed:
    #         exons_bed = filter_bed_with_gene_set(cnf, exons_bed, gene_key_set)
    #         exons_no_genes_bed = filter_bed_with_gene_set(cnf, exons_no_genes_bed, gene_key_set)
    # else:

    if target_bed:
        info()
        gene_key_set, gene_key_list = get_gene_keys(target_bed)
        info('Using genes from the amplicons list ' + target_bed)
        if features_bed and cnf.prep_bed is not False:
            info('Filtering exons with these ' +
                 str(len(gene_key_list)) + ' genes.')
            features_filt_bed = filter_bed_with_gene_set(
                cnf,
                features_bed,
                gene_key_set,
                suffix='target_genes_1st_round')
            if not verify_file(features_filt_bed):
                info()
                warn(
                    'No gene symbols from the capture BED file were found in the features BED file. Re-annotating target...'
                )
                target_bed = annotate_target(cnf, target_bed)
                #info('Merging regions within genes...')
                #target_bed = group_and_merge_regions_by_gene(cnf, target_bed, keep_genes=False)
                info('Sorting amplicons_bed by (chrom, gene_name, start)')
                target_bed = sort_bed(cnf, target_bed)
                info('Getting gene names again...')
                gene_key_set, gene_key_list = get_gene_keys(target_bed)
                info()
                info(
                    'Using genes from the new amplicons list, filtering features with these genes again.'
                )
                features_filt_bed = filter_bed_with_gene_set(
                    cnf,
                    features_bed,
                    gene_key_set,
                    suffix='target_genes_2nd_round')
                if not verify_file(features_filt_bed):
                    critical(
                        'No gene symbols from the capture BED file were found in the features BED.'
                    )
            features_bed = features_filt_bed
            info('Filtering the full features file including gene records.')
            features_no_genes_bed = filter_bed_with_gene_set(
                cnf,
                features_no_genes_bed,
                gene_key_set,
                suffix='target_genes')
    elif features_no_genes_bed:
        info()
        info(
            'No target (WGS), getting the gene names from the full features list...'
        )
        gene_key_set, gene_key_list = get_gene_keys(features_no_genes_bed)
    info()

    return gene_key_set, gene_key_list, target_bed, features_bed, features_no_genes_bed
Code example #28
def prepare_beds(cnf, features_bed=None, target_bed=None, seq2c_bed=None):
    if features_bed is None and target_bed is None:
        warn(
            'No input target BED, and no features BED specified in the system config. Not making detailed per-gene reports.'
        )
        # return None, None, None, None

    if target_bed:
        target_bed = verify_bed(target_bed, is_critical=True)

    if seq2c_bed:
        seq2c_bed = verify_bed(seq2c_bed, is_critical=True)

    if features_bed:
        features_bed = verify_bed(features_bed, is_critical=True)

    # if features_bed and target_bed and abspath(features_bed) == abspath(target_bed):
    #     warn('Same file used for exons and amplicons: ' + features_bed)

    # Features
    features_no_genes_bed = None
    if features_bed:
        # info()
        # info('Merging regions within genes...')
        # exons_bed = group_and_merge_regions_by_gene(cnf, exons_bed, keep_genes=True)
        #
        # info()
        # info('Sorting exons by (chrom, gene name, start)')
        # exons_bed = sort_bed(cnf, exons_bed)

        info()
        info(
            'Filtering the features BED file to keep only non-gene, non-transcript records...'
        )
        features_no_genes_bed = intermediate_fname(cnf, features_bed,
                                                   'no_genes')
        call(cnf,
             'grep -vw Gene ' + features_bed + ' | grep -vw Transcript',
             output_fpath=features_no_genes_bed)

    ori_target_bed_path = target_bed
    if target_bed:
        info()
        info('Remove comments in target...')
        target_bed = remove_comments(cnf, target_bed)

        info()
        info('Cut -f1,2,3,4 target...')
        target_bed = cut(cnf, target_bed, 4)

        info()
        info('Sorting target...')
        target_bed = sort_bed(cnf, target_bed)

        cols = count_bed_cols(target_bed)
        if cnf.reannotate or cols < 4:
            info()
            if not features_bed:
                critical(
                    str(cols) +
                    ' columns (less than 4), and no features to annotate regions '
                    '(please make sure you have set the "features" key in the corresponding genome section '
                    '(' + cnf.genome.name + ') in ' + cnf.sys_cnf + ')')
            info(
                'cnf.reannotate is ' + str(cnf.reannotate) +
                ', and cols in the target BED is ' + str(cols) +
                '. Annotating target with the gene names from the "features" file '
                + features_bed + '...')
            target_bed = annotate_target(cnf, target_bed)

    def remove_no_anno(l, i):
        # Drop BED lines whose 4th column (gene annotation) is '.'
        return None if l.split('\t')[3].strip() == '.' else l

    if (not seq2c_bed and target_bed) or (seq2c_bed and seq2c_bed == ori_target_bed_path):
        info('Seq2C bed: removing regions with no gene annotation')
        seq2c_bed = iterate_file(cnf, target_bed, remove_no_anno, suffix='filt')

    elif seq2c_bed:
        info()
        info('Remove comments in seq2c bed...')
        seq2c_bed = remove_comments(cnf, seq2c_bed)

        info()
        info('Sorting seq2c bed...')
        seq2c_bed = sort_bed(cnf, seq2c_bed)

        cols = count_bed_cols(seq2c_bed)
        if cols < 4:
            info()
            info('Number of columns in the seq2c BED is ' + str(cols) +
                 '. Annotating amplicons with gene names...')
            seq2c_bed = annotate_target(cnf, seq2c_bed)
        elif 8 > cols > 4:
            seq2c_bed = cut(cnf, seq2c_bed, 4)
        elif cols > 8:
            seq2c_bed = cut(cnf, seq2c_bed, 8)
        info('Filtering non-annotated entries in seq2c bed')
        seq2c_bed = iterate_file(cnf, seq2c_bed, remove_no_anno, suffix='filt')

    else:
        seq2c_bed = verify_bed(cnf.genome.cds)

    if target_bed:
        info()
        # info('Merging amplicons...')
        # target_bed = group_and_merge_regions_by_gene(cnf, target_bed, keep_genes=False)

        info('Sorting target by (chrom, gene name, start)')
        target_bed = sort_bed(cnf, target_bed)

    return features_bed, features_no_genes_bed, target_bed, seq2c_bed
Code example #29
def _run_multisample_qualimap(cnf, output_dir, samples, targqc_full_report):
    """ 1. Generates Qualimap2 plots and put into plots_dirpath
        2. Adds records to targqc_full_report.plots
    """
    plots_dirpath = join(output_dir, 'plots')
    if cnf.reuse_intermediate and verify_dir(plots_dirpath) and [
            f for f in listdir(plots_dirpath) if not f.startswith('.')
    ]:
        info('Qualimap multisample plots exist - ' + plots_dirpath +
             ', reusing...')
    else:
        # Qualimap2 run for multi-sample plots
        if len(
            [s.qualimap_html_fpath
             for s in samples if s.qualimap_html_fpath]) > 0:
            qualimap = get_system_path(cnf,
                                       interpreter_or_name=None,
                                       name='qualimap')

            if qualimap is not None and get_qualimap_type(qualimap) == 'full':
                qualimap_output_dir = join(cnf.work_dir,
                                           'qualimap_multi_bamqc')

                _correct_qualimap_genome_results(cnf, samples)
                _correct_qualimap_insert_size_histogram(cnf, samples)

                safe_mkdir(qualimap_output_dir)
                rows = []
                for sample in samples:
                    if sample.qualimap_html_fpath:
                        rows += [[sample.name, sample.qualimap_html_fpath]]

                data_fpath = write_tsv_rows(
                    rows,
                    join(qualimap_output_dir,
                         'qualimap_results_by_sample.tsv'))
                qualimap_plots_dirpath = join(qualimap_output_dir,
                                              'images_multisampleBamQcReport')
                cmdline = '{qualimap} multi-bamqc --data {data_fpath} -outdir {qualimap_output_dir}'.format(
                    **locals())
                res = call(cnf,
                           cmdline,
                           exit_on_error=False,
                           return_err_code=True,
                           env_vars=dict(DISPLAY=None),
                           output_fpath=qualimap_plots_dirpath,
                           output_is_dir=True)
                if res is None or not verify_dir(qualimap_plots_dirpath):
                    warn(
                        'Warning: Qualimap for multi-sample analysis failed to finish. TargQC will not contain plots.'
                    )
                    return None
                else:
                    if exists(plots_dirpath):
                        shutil.rmtree(plots_dirpath)
                    shutil.move(qualimap_plots_dirpath, plots_dirpath)
            else:
                warn(
                    'Warning: Qualimap for multi-sample analysis was not found. TargQC will not contain plots.'
                )
                return None

    targqc_full_report.plots = []
    for plot_fpath in listdir(plots_dirpath):
        plot_fpath = join(plots_dirpath, plot_fpath)
        if verify_file(plot_fpath) and plot_fpath.endswith('.png'):
            targqc_full_report.plots.append(relpath(plot_fpath, output_dir))
Code example #30
def retrieve_jira_info(url):
    try:
        from jira import JIRA
    except ImportError as e:
        warn('Cannot import JIRA: ' + str(e))
        return None