Example 1
def local_symlink(src, dst):
    if os.path.exists(dst):
        try:
            os.unlink(dst)
        except Exception as e:
            err('Cannot remove link ' + dst + ': ' + str(e))
            return None
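
A minimal self-contained sketch of the same replace-then-link pattern (names here are illustrative; the original relies on the codebase's err() logger, and the snippet above is cut off before the link is created). Note that os.path.lexists() also catches dangling symlinks, which os.path.exists() would miss:

import os

def replace_symlink(src, dst):
    # Remove whatever sits at dst first, including a broken symlink.
    if os.path.lexists(dst):
        try:
            os.unlink(dst)
        except OSError as e:
            print('Cannot remove link ' + dst + ': ' + str(e))
            return None
    os.symlink(src, dst)
    return dst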
Example 2
def main():
    cnf, samples, bed_fpath, output_dir = proc_args(sys.argv)
    info('Processing ' + str(len(samples)) + ' samples')

    if cnf.prep_bed is not False:
        if not bed_fpath:
            info('No input BED is specified; using CDS from ' + str(cnf.genome.cds))
            bed_fpath = verify_bed(cnf.genome.cds, 'CDS bed file for ' + cnf.genome.name)

        seq2c_bed_fname = basename(bed_fpath)

        bed_cols = count_bed_cols(bed_fpath)
        if bed_cols < 4:
            check_genome_resources(cnf)
            _, _, _, bed_fpath = prepare_beds(cnf, None, None, bed_fpath)

        try:
            copyfile(bed_fpath, join(output_dir, seq2c_bed_fname))
        except OSError:
            err(format_exc())
            info()
        else:
            info('Seq2C bed file is saved in ' + join(output_dir, seq2c_bed_fname))

    bed_fpath = verify_bed(bed_fpath, is_critical=True, description='Input BED file')
    info('Using target ' + bed_fpath)

    run_seq2c(cnf, output_dir, samples, bed_fpath, cnf.is_wgs)
Example 3
def leave_main_sample(cnf, vcf_fpath, samplename):
    index = get_sample_column_index(vcf_fpath, samplename)
    if index is None:
        return vcf_fpath

    # def _f1(rec):
    #     rec.samples = [sample_name]
    #     return rec
    #
    info('Keeping SAMPLE only for the first sample (' + samplename + ')')
    # vcf_fpath = iterate_vcf(cnf, vcf_fpath, _f1, suffix=sample_name)
    # out_fpath = extract_sample(cnf, vcf_fpath, sample_name)
    # info()

    def _f(line, i):
        if line and (line.startswith('#CHROM') or line[0] != '#'):
            ts = line.split('\t')
            return '\t'.join(ts[:9] + [ts[9 + index]])
        return line
    vcf_fpath = iterate_file(cnf, vcf_fpath, _f, suffix='1sm')

    if not verify_file(vcf_fpath):
        err("Error: leave_main_sample didn't generate an output file.")
        return None

    return vcf_fpath
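
The column arithmetic in _f follows the VCF layout: the first nine tab-separated fields (#CHROM through FORMAT) are fixed, and per-sample columns start at index 9. A tiny standalone illustration of the same slicing (values invented):

# One VCF data line with three sample columns.
line = '\t'.join(['chr1', '123', '.', 'A', 'T', '50', 'PASS', '.', 'GT',
                  '0/0', '0/1', '1/1'])
index = 1  # keep only the second sample
ts = line.split('\t')
print('\t'.join(ts[:9] + [ts[9 + index]]))  # nine fixed fields + '0/1'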
Example 4
def _parse_picard_dup_report(dup_report_fpath):
    with open(dup_report_fpath) as f:
        for l in f:
            if l.startswith('## METRICS CLASS'):
                l_NEXT = None
                ind = None
                try:
                    l_LIBRARY = next(f)
                    if l_LIBRARY.startswith('LIBRARY'):
                        ind = l_LIBRARY.strip().split().index(
                            'PERCENT_DUPLICATION')
                        l_NEXT = next(f)
                        while l_NEXT.startswith(' ') or l_NEXT.startswith(
                                '\t'):
                            l_NEXT = next(f)
                except StopIteration:
                    pass
                else:
                    if l_NEXT and ind is not None:
                        fields = l_NEXT.split()
                        if fields[0] == 'Unknown':
                            ind += 1
                        if len(fields) > ind:
                            dup_rate = 1.0 * float(fields[ind])
                            return dup_rate
    err('Error: cannot read duplication rate from ' + dup_report_fpath)
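
For reference, the parser expects a Picard MarkDuplicates metrics section: a '## METRICS CLASS' marker, a LIBRARY header row, then one data row per library. A trimmed standalone sketch of the same lookup (the column set is invented; real files carry many more fields):

text = ('## METRICS CLASS\tpicard.sam.DuplicationMetrics\n'
        'LIBRARY\tUNPAIRED_READS_EXAMINED\tPERCENT_DUPLICATION\n'
        'lib1\t100\t0.0123\n')
lines = iter(text.splitlines())
for l in lines:
    if l.startswith('## METRICS CLASS'):
        header = next(lines).strip().split('\t')
        ind = header.index('PERCENT_DUPLICATION')
        fields = next(lines).split('\t')
        print(float(fields[ind]))  # 0.0123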
Example 5
def do_handle_oserror(cmdl,
                      out_fpath=None,
                      stderr_dump=None,
                      max_number_of_tries=20,
                      silent=False):
    res_ = None
    counter = 0
    slept = 0
    timeout = 30
    limit = 60 * 10
    while True:
        try:
            res_ = do(cmdl, out_fpath, stderr_dump=stderr_dump)
            break
        except OSError as e:
            counter += 1
            if counter >= max_number_of_tries:
                break
            if not silent:
                err('OSError: ' + str(e))
                err()
            if 'Cannot allocate memory' not in str(e):
                break
            else:
                if slept >= limit:
                    return None
                else:
                    if not silent:
                        err('Waiting ' + str(timeout) + ' seconds...')
                    time.sleep(timeout)
                    slept += timeout
                    if not silent:
                        err('Retrying...')
                        err()
    return res_
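
The retry policy above boils down to: retry only while the OS reports 'Cannot allocate memory', sleep a fixed 30 seconds between attempts, and give up after 20 tries or 10 minutes of cumulative sleeping. A generic self-contained version of that pattern (it raises non-retriable errors instead of swallowing them; names are illustrative):

import time

def retry_on_memory_error(fn, max_tries=20, timeout=30, limit=60 * 10):
    slept = 0
    for _ in range(max_tries):
        try:
            return fn()
        except OSError as e:
            if 'Cannot allocate memory' not in str(e) or slept >= limit:
                raise  # non-retriable, or the time budget is spent
            time.sleep(timeout)
            slept += timeout
    return None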
Example 6
def extract_graphs(samples):  # Sample(name, fastq_fpath)
    parsed_data = OrderedDict((h, list()) for h in _header)

    for s in samples:
        if verify_file(s.fastqc_html_fpath,
                       's.fastqc_html_fpath for ' + s.name):
            with open(s.fastqc_html_fpath) as source_file_obj:
                html = source_file_obj.read()
                parts = [
                    p.split('</div>')[0]
                    for p in html.split('<div class="module">')[1:]
                ]
                # <h2><img/></h2><table></table></div>  OR  <h2><img/></h2><p><img/></p></div>
                for i, part in enumerate(parts):
                    # info('Parsing ' + _header[i])
                    # info(str(part))
                    table, graph = '', ''
                    ok_img = '<img ' + part.split('"><img')[1].split(
                        '>')[0] + '>'
                    if '<table>' in part:
                        table = '<table>' + part.split('<table>')[1]
                    if '<p><img ' in part:
                        graph = '<img ' + part.split('<p><img')[1].split(
                            '>')[0] + '>'
                    parsed_data[_header[i]].append(
                        [s.name, ok_img, graph, table])

                # module_divs = soup.find_all("div", class_="module")
                # _sort_graph_by_type(parsed_data, module_divs, s.name)
                # soup.decompose()
        else:
            err('Could not find fastqc html fpath for sample ' + s.name +
                ': ' + str(s.fastqc_html_fpath))

    return parsed_data
Example 7
def __parse_id(url):
    t = url.split('NGSG-')
    if len(t) == 1:
        err('Incorrect JIRA URL ' + url)
        return None
    case_id = t[1].split('?')[0]
    return case_id
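
Usage is straightforward: the case id is whatever sits between 'NGSG-' and the first '?'. With the function above in scope (URL invented):

print(__parse_id('http://jira.example.com/browse/NGSG-1234?filter=-1'))  # '1234'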
Example 8
def index_vcf(cnf, sample_name, filt_vcf_fpath, caller_name=None):
    if cnf is None:
        global glob_cnf
        cnf = glob_cnf

    info()
    info(sample_name + ((', ' + caller_name) if caller_name else '') +
         ': indexing')

    # for fpath in [pass_vcf_fpath, filt_vcf_fpath]:
    #     if not cnf.reuse_intermediate and not verify_file(fpath, silent=True):
    #         err(fpath + ' does not exist - cannot IGV index')
    #     else:
    #         if cnf.reuse_intermediate and verify_file(fpath + '.idx', silent=True):
    #             info('Reusing existing ' + fpath + '.idx')
    #         else:
    #             igvtools_index(cnf, fpath)

    if not cnf.reuse_intermediate and not verify_file(filt_vcf_fpath,
                                                      silent=True):
        err(filt_vcf_fpath + ' does not exist - cannot gzip and tabix')
    else:
        if cnf.reuse_intermediate and verify_file(filt_vcf_fpath + '.gz', silent=True) \
                and verify_file(filt_vcf_fpath + '.gz.tbi', silent=True):
            info(filt_vcf_fpath + '.gz and .gz.tbi exist; reusing')
        else:
            bgzip_and_tabix(cnf, filt_vcf_fpath)
Example 9
def main():
    args = sys.argv[1:]

    if len(args) < 2:
        sys.exit('Usage: ' + __file__ + ' bam sambamba cmdline')

    bam = args[0]
    sambamba = args[1]
    args = args[2:]
    args = [a.replace('__QUOTE__', '"').replace('""', '"') for a in args]
    err(str(args))

    index_bam(bam, sambamba)

    err()
    args = [sambamba] + args
    cmdl = ' '.join(
        (('"' + a + '"') if ' ' in a and not a[0] == '"' else a) for a in args)
    err(cmdl)
    ret_code = subprocess.call(cmdl, shell=True)
    if ret_code != 0:
        err()
        err('Ret code = ' + str(ret_code) + ', retrying...')
        indexed_bam = bam + '.bai'
        if isfile(indexed_bam):
            os.remove(indexed_bam)
        index_bam(bam, sambamba)
        subprocess.call(cmdl, shell=True)
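
The manual quoting above (wrap an argument in double quotes if it contains a space and is not already quoted) can also be delegated to the standard library; a sketch of that alternative, not what the script itself does (argument list invented):

try:
    from shlex import quote  # Python 3.3+
except ImportError:
    from pipes import quote  # Python 2

args = ['depth', 'region', '-L', 'chr1:1-1000', 'my file.bam']
print(' '.join(quote(a) for a in args))
# depth region -L chr1:1-1000 'my file.bam'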
Example 10
def launch_bedcoverage_hist(work_dir,
                            bed,
                            bam,
                            chr_lengths_fpath,
                            bedcov_output_fpath=None,
                            bedtools='bedtools'):
    if not bedcov_output_fpath:
        bedcov_output_fpath = join(
            work_dir,
            splitext_plus(basename(bed))[0] + '__' +
            splitext_plus(basename(bam))[0] + '_bedcov_output.txt')

    if bam.endswith('bam'):
        bam = bam_to_bed_nocnf(bam, bedtools)
    verify_file(bam,
                is_critical=True,
                description='BAM to BED conversion result')

    v = bedtools_version(bedtools)
    if v and v >= 24:
        cmdline = '{bedtools} coverage -sorted -g {chr_lengths_fpath} -a {bed} -b {bam} -hist'.format(
            **locals())
    else:
        cmdline = '{bedtools} coverage -a {bam} -b {bed} -hist'.format(
            **locals())
    cmdline += ' > ' + bedcov_output_fpath
    info(cmdline)
    os.system(cmdline)
    res = verify_file(bedcov_output_fpath)
    if res:
        info('Done, saved to ' + bedcov_output_fpath)
    else:
        err('Error, result is non-existent or empty')
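
The os.system call relies on a shell redirect; the same invocation can avoid the shell entirely by handing subprocess the output file. A sketch of the equivalent call, assuming bedtools is on PATH (paths are placeholders):

import subprocess

cmdl = ['bedtools', 'coverage', '-a', 'regions.bed', '-b', 'reads.bed', '-hist']
with open('bedcov_output.txt', 'w') as out:
    ret = subprocess.call(cmdl, stdout=out)
if ret != 0:
    print('bedtools coverage exited with code ' + str(ret))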
Example 11
def find_fastq_pairs_by_sample_names(fastq_fpaths, sample_names):
    fastq_by_sn = OrderedDict()

    for sn in sample_names:
        sn_fastq_fpaths = sorted(
            [f for f in fastq_fpaths if basename(f).startswith(sn + '_R')])
        if len(sn_fastq_fpaths) == 0:
            err('Error: no fastq found for ' + sn)
            fastq_by_sn[sn] = None
        elif len(sn_fastq_fpaths) > 2:
            critical('Error: more than 2 fastq files starting with ' + sn +
                     '_R: ' + ', '.join(sn_fastq_fpaths))
        elif len(sn_fastq_fpaths) == 1:
            warn('Warning: only a single fastq file was found for ' + sn +
                 '. Treating as single reads.')
            fastq_by_sn[sn] = [
                verify_file(sn_fastq_fpaths[0],
                            description='sn_fastq_fpaths[0] for ' + str(sn)),
                None
            ]
        else:
            fastq_by_sn[sn] = [
                verify_file(fpath,
                            description='fpath from sn_fastq_fpaths for ' +
                            str(sn)) for fpath in sn_fastq_fpaths
            ]

    return fastq_by_sn
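
The pairing relies purely on the <sample>_R1/_R2 naming convention. The core grouping, stripped of the verify_file/warn/critical handling, fits in a few standalone lines (file names invented):

from collections import OrderedDict
from os.path import basename

fastq_fpaths = ['/data/S1_R1.fastq.gz', '/data/S1_R2.fastq.gz',
                '/data/S2_R1.fastq.gz']
by_sn = OrderedDict()
for sn in ['S1', 'S2']:
    by_sn[sn] = sorted(f for f in fastq_fpaths
                       if basename(f).startswith(sn + '_R'))
print(by_sn)  # S1 gets a pair; S2 ends up single-end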
Example 12
def run_vcf2txt_vardict2mut_for_samples(cnf,
                                        var_samples,
                                        output_dirpath,
                                        vcf2txt_out_fpath,
                                        caller_name=None,
                                        threads_num=1):

    threads_num = min(len(var_samples), cnf.threads)
    info('Number of threads for filtering: ' + str(threads_num))

    safe_mkdir(output_dirpath)

    vcf_fpath_by_sample = {s.name: s.anno_vcf_fpath for s in var_samples}
    res = run_vcf2txt(cnf, vcf_fpath_by_sample, vcf2txt_out_fpath)
    if not res:
        err('vcf2txt run returned non-0')
        return None

    # vardict2mut_py = get_script_cmdline(cnf, 'python', join('scripts', 'post', 'vardict2mut.py'))
    # if not vardict2mut_py:
    #     critical('vardict2mut_py not found')

    info('Running vardict2mut')
    res = run_vardict2mut(
        cnf, vcf2txt_out_fpath,
        add_suffix(vcf2txt_out_fpath, variant_filtering.mut_pass_suffix))
    if not res:
        critical('vardict2mut.py run returned non-0')
    mut_fpath = res
    mut_fpath = convert_gpfs_path_to_url(mut_fpath)
    info()

    info('Done filtering with vcf2txt/vardict2mut, saved to ' + str(mut_fpath))
    return mut_fpath
Example 13
def _tracks(cnf, track_fpath, input_fpath):
    if not verify_file(track_fpath):
        return None

    field_name = splitext_plus(basename(track_fpath))[0]

    step_greetings('Intersecting with ' + field_name)

    output_fpath = intermediate_fname(cnf, input_fpath, field_name)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    toolpath = get_system_path(cnf, 'vcfannotate')
    if not toolpath:
        err('WARNING: Skipping annotation with tracks: vcfannotate '
            'executable not found; you probably need to specify the path in system_config, '
            'or load bcbio: . /group/ngs/bin/bcbio-prod.sh')
        return None

    # self.all_fields.append(field_name)

    cmdline = '{toolpath} -b {track_fpath} -k {field_name} {input_fpath}'.format(
        **locals())

    assert input_fpath
    output_fpath = call_subprocess(cnf,
                                   cmdline,
                                   input_fpath,
                                   output_fpath,
                                   stdout_to_outputfile=True,
                                   overwrite=True)
    if not verify_vcf(output_fpath):
        err('Error: annotating with tracks produced an invalid VCF ' +
            str(output_fpath) + ' for ' + track_fpath)
        return output_fpath

    # Set TRUE or FALSE for tracks
    def proc_line(line, i):
        if field_name in line:
            if not line.startswith('#'):
                fields = line.split('\t')
                info_line = fields[7]
                info_pairs = [attr.split('=') for attr in info_line.split(';')]
                info_pairs = [[pair[0], ('TRUE' if pair[1] else 'FALSE')] if
                              pair[0] == field_name and len(pair) > 1 else pair
                              for pair in info_pairs]
                info_line = ';'.join(
                    '='.join(pair) if len(pair) == 2 else pair[0]
                    for pair in info_pairs)
                fields = fields[:7] + [info_line] + fields[8:]
                return '\t'.join(fields)
        return line

    assert output_fpath
    output_fpath = iterate_file(cnf, output_fpath, proc_line, suffix='trk')
    return verify_vcf(output_fpath, is_critical=True)
Example 14
def index_bam(bam_fpath, sambamba):
    indexed_bam = bam_fpath + '.bai'
    if isfile(indexed_bam):
        return
    # if not isfile(indexed_bam) or getctime(indexed_bam) < getctime(bam_fpath):
    err('Indexing BAM, writing ' + indexed_bam + '...')
    cmdline = '{sambamba} index {bam_fpath}'.format(**locals())
    subprocess.call(cmdline, shell=True)
Example 15
def count_bed_cols(bed_fpath):
    with open(bed_fpath) as f:
        for l in f:
            if l and l.strip() and not l.startswith('#'):
                return len(l.split('\t'))
    # return len(next(dropwhile(lambda x: x.strip().startswith('#'), open(bed_fpath))).split('\t'))
    err('Empty bed file: ' + bed_fpath)
    return None
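
A quick check of the behaviour with count_bed_cols() above in scope; comment lines are skipped and only the first data line is inspected:

import os
import tempfile

tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.bed', delete=False)
tmp.write('# track description\nchr1\t100\t200\tregion1\n')
tmp.close()
print(count_bed_cols(tmp.name))  # 4
os.unlink(tmp.name)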
Example 16
def get_system_path(cnf,
                    interpreter_or_name,
                    name=None,
                    extra_warning='',
                    suppress_warn=False,
                    is_critical=False):
    """ "name" can be:
        - key in system_info.yaml
        - relative path in the project (e.g. external/...)
        - anything in system path
    """
    interpreter = interpreter_or_name
    if name is None:
        name = interpreter_or_name
        interpreter = None

    if interpreter:
        if interpreter == 'java':
            return get_java_tool_cmdline(cnf,
                                         name,
                                         extra_warning,
                                         suppress_warn,
                                         is_critical=is_critical)

        return get_script_cmdline(cnf,
                                  interpreter,
                                  name,
                                  extra_warning=extra_warning,
                                  suppress_warn=suppress_warn,
                                  is_critical=is_critical)

    # IN SYSTEM CONFIG?
    if cnf and (cnf.resources is not None and name.lower() in cnf.resources
                and 'path' in cnf.resources[name.lower()]):

        tool_path = cnf.resources[name.lower()]['path']
        tool_path = adjust_system_path(tool_path)
        return verify_obj_by_path(tool_path, name, is_critical=is_critical)

    # IN PROJECT ROOT DIR? IN EXTERNAL?
    for dirpath in [code_base_path]:
        tool_path = join(dirpath, name)
        if exists(tool_path):
            return verify_obj_by_path(tool_path, name, is_critical=is_critical)

    # IN PATH?
    tool_path = which(name)
    if tool_path and exists(tool_path):
        return verify_obj_by_path(tool_path, name, is_critical=is_critical)

    msg = (name + ' was not found. You may either specify the path in the system '
           'config, or add it to your PATH environment variable. ' +
           extra_warning)
    if not suppress_warn:
        err(msg)
    if is_critical:
        critical(msg)
    return None
Example 17
def calculate_coverage_use_grid(cnf, samples, output_dirpath):
    assert len(samples) > 0

    sambamba = get_system_path(cnf,
                               join(get_ext_tools_dirname(), 'sambamba'),
                               is_critical=True)

    chr_len_fpath = get_chr_len_fpath(cnf)
    jobs_to_wait = []

    for sample in samples:
        sample_output_dirpath = join(output_dirpath, sample.name)
        safe_mkdir(sample_output_dirpath)

    for chrom in chromosomes:
        info('Processing chromosome ' + chrom)
        avg_cov_output_fpath = join(output_dirpath, chrom + '.txt.gz')
        sample_output_fpaths = [
            join(output_dirpath, sample.name, chrom + '.txt.gz')
            for sample in samples
        ]

        sample_names = ','.join(sample.name for sample in samples)
        chrom_bams = []

        for sample in samples:
            if not verify_file(sample.bam):
                err('BAM for ' + sample.name + ' does not exist!')
                continue
            output_bam_fpath = join(
                cnf.work_dir,
                basename(sample.name) + '_' + str(chrom) + '.bam')
            cmdline = '{sambamba} slice {sample.bam} {chrom}'.format(
                **locals())
            call(cnf, cmdline, output_fpath=output_bam_fpath)
            if verify_file(output_bam_fpath):
                chrom_bams.append(output_bam_fpath)

        bam_fpaths = ','.join(chrom_bams)

        if cnf.reuse_intermediate and verify_file(avg_cov_output_fpath, silent=True) and \
                all(verify_file(output_fpath, silent=True) for output_fpath in sample_output_fpaths):
            info(avg_cov_output_fpath + ' exists, reusing')
        else:
            j = _submit_region_cov(cnf, cnf.work_dir, chrom, bam_fpaths,
                                   sample_names, output_dirpath, chr_len_fpath)
            if j and not j.is_done:
                jobs_to_wait.append(j)
            info()

        if len(jobs_to_wait) >= cnf.threads:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
            jobs_to_wait = []
        elif not jobs_to_wait:
            info('No jobs to submit.')
    if jobs_to_wait:
        wait_for_jobs(cnf, jobs_to_wait)
Example 18
def finialize_annotate_file(cnf, vcf_fpath, sample, callername=None):
    # vcf_fpath = leave_first_sample(cnf, vcf_fpath)

    # if not cnf.no_check:
    #     vcf_fpath = _filter_malformed_fields(cnf, vcf_fpath)

    if not cnf.no_check and callername and 'vardict' not in callername:
        info()
        info('Adding SAMPLE=' + sample.name + ' annotation...')
        vcf_fpath = add_annotation(cnf,
                                   vcf_fpath,
                                   'SAMPLE',
                                   sample.name,
                                   number='1',
                                   type_='String',
                                   description='Sample name')

    final_vcf_fpath = join(
        cnf.output_dir,
        sample.name + (('-' + callername) if callername else '') + '.anno.vcf')
    if cnf.output_file:
        final_vcf_fpath = cnf.output_file
    if not vcf_fpath.endswith('.gz') and final_vcf_fpath.endswith('.gz'):
        final_vcf_fpath = splitext(final_vcf_fpath)[0]
    if vcf_fpath.endswith('.gz') and not final_vcf_fpath.endswith('.gz'):
        final_vcf_fpath = final_vcf_fpath + '.gz'

    info('Moving final VCF ' + vcf_fpath + ' to ' + final_vcf_fpath)
    if isfile(final_vcf_fpath):
        os.remove(final_vcf_fpath)
    shutil.copy(vcf_fpath, final_vcf_fpath)

    if cnf.qc:
        report = qc.make_report(cnf, final_vcf_fpath, sample)
        qc_dirpath = join(cnf.output_dir, 'qc')
        safe_mkdir(qc_dirpath)
        report = qc.save_report(cnf, report, sample, callername, qc_dirpath,
                                source.varqc_name)
        info('Saved QC to ' + qc_dirpath + ' (' + report.html_fpath + ')')
        info('-' * 70)
        info()

    if final_vcf_fpath.endswith('.gz'):
        if not is_gz(final_vcf_fpath):
            err(final_vcf_fpath + ' is in incorrect gzip format')
            anno_vcf_fpath_ungz = splitext(final_vcf_fpath)[0]
            anno_vcf_fpath_gz = final_vcf_fpath
            os.rename(anno_vcf_fpath_gz, anno_vcf_fpath_ungz)
            final_vcf_fpath = anno_vcf_fpath_ungz  # return the path that actually exists
        else:
            info(final_vcf_fpath + ' is a good gzipped file.')
            return [final_vcf_fpath]
    else:
        info('Compressing and indexing with bgzip+tabix ' + final_vcf_fpath)
        final_vcf_fpath = bgzip_and_tabix(cnf, final_vcf_fpath)
        info('Saved VCF again to ' + final_vcf_fpath)

    return [final_vcf_fpath]
Example 19
def load_yaml_config(fpath):
    verify_file(fpath, is_critical=True)
    try:
        dic = load_yaml(open(fpath), Loader=Loader)
    except:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return dic
Example 20
def merge_patients(patients):
    gender = None
    genders = set(p.gender for p in patients if p.gender)
    if genders:
        if len(genders) > 1:
            err('Different genders detected for the same sample: ' +
                str(genders))
        gender = next(iter(genders))  # next() needs an iterator, not a set
    return Patient(gender)
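
A quick check of the merge behaviour with a namedtuple standing in for the real Patient class (which comes from the surrounding codebase), assuming merge_patients() and err() are in scope:

from collections import namedtuple

Patient = namedtuple('Patient', 'gender')  # stand-in for the real class
print(merge_patients([Patient('F'), Patient(None), Patient('F')]).gender)  # 'F'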
Example 21
def main():
    cnf = proc_args(sys.argv)
    bigwig_fpath = process_bam(cnf, cnf.bam)
    if isfile(bigwig_fpath) and cnf.project_name and cnf.sample:
        create_jbrowse_symlink(cnf.genome.name, cnf.project_name, cnf.sample,
                               bigwig_fpath)
        info('BAM was successfully converted.')
    elif not isfile(bigwig_fpath):
        err('BAM was not converted to BigWig.')
Example 22
def get_db_path(cnf, dbconf, dbname):
    db_path = cnf['genome'].get(dbname)
    if not db_path:
        db_path = dbconf.get('path')
        if not db_path:
            err('Please, provide a path to "' + dbname +
                '" in the "genomes" section in the system config. The config is: '
                + str(cnf['genome']))
            return None
    return verify_file(db_path, is_critical=True)
Example 23
    def calc_bases_within_threshs(self, depth_thresholds):
        if self.bases_within_threshs is not None:
            return self.bases_within_threshs

        if self.bases_by_depth is None:
            err('Error: self.bases_by_depth is None for ' + str(self))

        self.bases_within_threshs, self.rates_within_threshs = calc_bases_within_threshs(
            self.bases_by_depth, self.get_size(), depth_thresholds)

        return self.bases_within_threshs
Example 24
    def __init__(self, dirpath, az_prjname_by_subprj=None, samplesheet=None):
        info('Parsing the HiSeq project structure')
        self.kind = 'hiseq'
        DatasetStructure.__init__(self,
                                  dirpath,
                                  az_prjname_by_subprj,
                                  samplesheet=samplesheet)

        verify_dir(self.unaligned_dirpath, is_critical=True)

        self.basecall_stat_html_reports = self.__get_basecall_stats_reports()

        for pname, project in self.project_by_name.items():
            proj_dirpath = join(
                self.unaligned_dirpath, 'Project_' + pname.replace(
                    ' ', '-'))  #.replace('-', '_').replace('.', '_'))

            az_proj_name = az_prjname_by_subprj.get(pname) if not isinstance(
                az_prjname_by_subprj, basestring) else az_prjname_by_subprj
            if az_proj_name is None:
                if len(self.project_by_name) > 1:
                    warn(
                        'Warn: cannot match subproject ' + pname +
                        ' to project names and JIRA cases. '
                        'Please follow the SOP for multiple-project runs: http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Pre+Processing+QC+Reporting'
                    )
                    continue
                az_proj_name = az_prjname_by_subprj.values()[0]

            project.set_dirpath(proj_dirpath, az_proj_name)
            for sname, sample in project.sample_by_name.items():
                sample.source_fastq_dirpath = join(
                    project.dirpath, 'Sample_' + sname.replace(
                        ' ', '-'))  #.replace('-', '_').replace('.', '_'))
                sample.set_up_out_dirs(project.fastq_dirpath,
                                       project.fastqc_dirpath,
                                       project.downsample_targqc_dirpath)

            basecalls_symlink = join(project.dirpath, 'BaseCallsReports')
            if not exists(basecalls_symlink):
                info('Creating BaseCalls symlink ' + self.basecalls_dirpath +
                     ' -> ' + basecalls_symlink)
                try:
                    os.symlink(self.basecalls_dirpath, basecalls_symlink)
                except OSError:
                    err('Cannot create symlink')
                    traceback.print_exc()
                else:
                    info('Created')
            if exists(basecalls_symlink):
                self.basecalls_dirpath = basecalls_symlink

        self.get_fastq_regexp_fn = get_hiseq_regexp
Example 25
def parse_gene_counts(counts_fpath, key_gene_names, report_name,
                      keep_gene_names):
    gene_counts = defaultdict(list)
    info('Preparing ' + report_name + ' stats for expression heatmaps')
    info('Checking ' + counts_fpath)
    if not verify_file(counts_fpath):
        err('Cannot find ' + report_name + ' fpath')
        return defaultdict(list), []  # keep the (gene_counts, samples) shape of the success return

    info('Reading ' + report_name + ' from ' + counts_fpath)
    samples_cols = dict()
    samples = []
    gene_col = None

    with open(counts_fpath) as f:
        for i, l in enumerate(f):
            if i == 0:
                header = l.strip().split('\t')
                gene_col = header.index('HUGO')
                samples = header[1:gene_col]
                samples_cols = {
                    sample: col + 1
                    for col, sample in enumerate(samples)
                }
                continue
            fs = l.replace('\n', '').split('\t')
            gene_name = fs[gene_col]
            if key_gene_names and gene_name not in key_gene_names:
                continue
            gene_expression_dict = {
                sample: int(float(fs[col]))
                if float(fs[col]).is_integer() else float(fs[col])
                for sample, col in samples_cols.iteritems()
            }
            if all(v < HEATMAPS_MIN_COUNT
                   for v in gene_expression_dict.values()):
                continue
            is_hidden_row = False
            name = gene_name
            if ':' in fs[0]:  ## exon number
                is_hidden_row = True
                exon_number = fs[0].split(':')[1]
                name += ':' + exon_number
            if keep_gene_names:
                is_hidden_row = True
                name = fs[0]  # use id
            gene = Counts(name,
                          gene_name=gene_name,
                          counts=gene_expression_dict,
                          is_hidden_row=is_hidden_row)
            gene_counts[gene_name].append(gene)

    return gene_counts, samples
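
The header bookkeeping assumes the first column is an id and the HUGO gene-name column closes off the sample columns. A tiny standalone illustration of the column arithmetic (header invented):

header = 'id\tsampleA\tsampleB\tHUGO'.split('\t')
gene_col = header.index('HUGO')                       # 3
samples = header[1:gene_col]                          # ['sampleA', 'sampleB']
samples_cols = dict((s, c + 1) for c, s in enumerate(samples))
print(samples_cols)                                   # {'sampleA': 1, 'sampleB': 2}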
Example 26
def main(args):
    if len(args) < 2:
        critical('Usage: ' + __file__ +
                 ' InputRootDirectory OutputRootDirectory [Build=hg38]')
        sys.exit(1)

    inp_root = adjust_path(args[0])
    out_root = adjust_path(args[1])

    build = 'hg38'
    if len(args) >= 3:
        build = args[2]

    chain_fpath = chains[build.lower()]

    for inp_dirpath, subdirs, files in os.walk(inp_root):
        for fname in files:
            if fname == 'sample1-cn_mops.bed':
                pass
            if fname.endswith('.bed'):
                inp_fpath = adjust_path(join(inp_dirpath, fname))
                print inp_fpath + ': ' + str(
                    count_bed_cols(inp_fpath)) + ' columns'

                out_dirpath = adjust_path(
                    join(out_root, relpath(inp_dirpath, inp_root)))
                safe_mkdir(out_dirpath)
                out_fpath = adjust_path(join(out_dirpath, fname))
                unlifted_fpath = adjust_path(
                    join(out_dirpath, fname + '.unlifted'))

                cmdline = ''

                with open(inp_fpath) as f:
                    fs = f.readline().split('\t')
                liftover_input = inp_fpath
                try:
                    int(fs[6])
                    int(fs[7])
                except (IndexError, ValueError):
                    info('Cutting ' + inp_fpath)
                    cmdline += 'cut -f1,2,3,4 "{inp_fpath}" > __cut; '
                    liftover_input = '__cut'  # liftOver reads the cut file only when cut was run

                cmdline += liftover_fpath + ' "{liftover_input}" {chain_fpath} "{out_fpath}" "{unlifted_fpath}"'
                cmdline = cmdline.format(**locals())
                info(cmdline)
                os.system(cmdline)
                verify_file(out_fpath)
                if isfile(unlifted_fpath):
                    if getsize(unlifted_fpath) <= 0:
                        os.remove(unlifted_fpath)
                    else:
                        err('Some records were unlifted and saved to ' +
                            unlifted_fpath)
Example 27
def run_seq2c(cnf, output_dirpath, samples, seq2c_bed, is_wgs):
    step_greetings('Running Seq2C')

    bams_by_sample = dict()
    for s in samples:
        if not s.bam:
            err('No BAM file for ' + s.name)
            continue
        bams_by_sample[s.name] = s.bam
        # cnf.work_dir = join(ori_work_dir, source.targqc_name + '_' + s.name)
        # safe_mkdir(cnf.work_dir)
        # s.dedup_bam = intermediate_fname(cnf, s.bam, source.dedup_bam)
        # dedupped_bam_by_sample[s.name] = s.dedup_bam
        # if verify_bam(s.dedup_bam, silent=True):
        #     info(s.dedup_bam + ' exists')
        # else:
        #     info('Deduplicating bam file ' + s.dedup_bam)
        #     dedup_jobs.append(remove_dups(cnf, s.bam, s.dedup_bam, use_grid=True))

    # cnf.work_dir = ori_work_dir
    # wait_for_jobs(cnf, dedup_jobs)
    #
    # ok = True
    # for s in samples:
    #     if not dedupped_bam_by_sample.get(s.name) or not verify_bam(dedupped_bam_by_sample[s.name]):
    #         err('No BAM file for ' + s.name)
    #         ok = False
    # if not ok:
    #     err('No BAM files found for any sample, cannot run Seq2C.')
    #     return None

    info('Getting reads and cov stats')
    mapped_read_fpath = join(output_dirpath, 'mapped_reads_by_sample.tsv')
    mapped_read_fpath, samples = __get_mapped_reads(cnf, samples, bams_by_sample, mapped_read_fpath)
    info()
    if not mapped_read_fpath:
        return None

    combined_gene_depths_fpath = join(output_dirpath, 'cov.tsv')
    combined_gene_depths_fpath = __seq2c_coverage(cnf, samples, bams_by_sample, seq2c_bed, is_wgs, combined_gene_depths_fpath)
    info()
    if not combined_gene_depths_fpath:
        return None

    seq2c_report_fpath = join(output_dirpath, source.seq2c_name + '.tsv')
    seq2c_report_fpath = __final_seq2c_scripts(cnf, mapped_read_fpath, combined_gene_depths_fpath, seq2c_report_fpath)
    if not seq2c_report_fpath:
        return None

    info('Done. The result is ' + seq2c_report_fpath)
    return seq2c_report_fpath
Example 28
def main():
    root_dirpath = proc_opts()
    info('*' * 60)
    info()

    all_issues = []

    info('Iterating over ' + root_dirpath)
    info('-' * 60)
    info()
    for fname in os.listdir(root_dirpath):
        if fname.startswith('.'):
            continue
        info(fname)
        project_dirpath = join(root_dirpath, fname)
        if isdir(project_dirpath) \
                and isfile(join(project_dirpath, 'SampleSheet.csv')) \
                and isdir(join(project_dirpath, 'Unalign')):
            info('Unalign and SampleSheet.csv found')

            ds = DatasetStructure.create(project_dirpath, '')

            issues = []
            if not ds.project_by_name:
                err('No projects found')
            else:
                info('Projects: ' + ', '.join([p.name + ' (' + ', '.join(p.sample_by_name) + ')' for p in ds.project_by_name.values()]))
                for project in ds.project_by_name.values():
                    if not project.sample_by_name:
                        err('No samples for project ' + project.name + ' found')
                    else:
                        for i, s1 in enumerate(project.sample_by_name.values()):
                            for s2 in project.sample_by_name.values()[i + 1:]:
                                if s2.name.startswith(s1.name):
                                    issues.append('   clashing sample names: ' + s1.name + ' and ' + s2.name + ' from ' + project.name)

            if issues:
                all_issues.append(fname + ' created: %s, last modified: %s' %
                                  (time.ctime(os.path.getctime(project_dirpath)),
                                   time.ctime(os.path.getmtime(project_dirpath))))
                all_issues.extend(issues)
                all_issues.append('')

            info()
            info('-' * 60)

    info()
    info('Failed projects: ')
    for msg in all_issues:
        info(msg)
Example 29
def verify_vcf(vcf_fpath, silent=False, is_critical=False):
    if not verify_file(vcf_fpath, silent=silent, is_critical=is_critical):
        return None
    debug('File ' + vcf_fpath + ' exists and is not empty')
    vcf = open_gzipsafe(vcf_fpath)
    debug('File ' + vcf_fpath + ' opened')
    l = next(vcf, None)
    if l is None:
        (critical if is_critical else err)('Error: cannot read the VCF file ' + vcf_fpath)
        return None
    if not l.startswith('##fileformat=VCF'):
        (critical if is_critical else err)('Error: VCF must start with ##fileformat=VCF ' + vcf_fpath)
        return None

    try:
        reader = vcf_parser.Reader(vcf)
    except:
        err('Error: cannot open the VCF file ' + vcf_fpath)
        if is_critical: raise
    else:
        debug('File ' + vcf_fpath + ' opened as VCF')
        try:
            rec = next(reader)
        except IndexError:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug('IndexError parsing VCF file ' + vcf_fpath)
            if is_critical: raise
        except ValueError:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug('ValueError parsing VCF file ' + vcf_fpath)
            if is_critical: raise
        except StopIteration:
            debug('No records in the VCF file ' + vcf_fpath)
            if not silent:
                warn('VCF file ' + vcf_fpath + ' has no records.')
            return vcf_fpath
        except:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug('Other error parsing VCF file ' + vcf_fpath)
            if is_critical: raise
        else:
            debug('A record was read from the VCF file ' + vcf_fpath)
            return vcf_fpath
        # f = open_gzipsafe(output_fpath)
        # l = f.readline()
        # if 'Cannot allocate memory' in l:
        #     f.close()
        #     f = open_gzipsafe(output_fpath)
        #     contents = f.read()
        #     if not silent:
        #         if is_critical:
        #             critical('SnpSift failed with memory issue:\n' + contents)
        #         else:
        #             err('SnpSift failed with memory issue:\n' + contents)
        #             return None
        #     f.close()
        #     return None
        # return output_fpath
    finally:
        vcf.close()
Example 30
def run_sambamba_use_grid(cnf, infos_by_key, mut_bed_fpath):
    sambamba_output_by_experiment = dict()
    not_submitted_experiments = infos_by_key.values()
    while not_submitted_experiments:
        jobs_to_wait = []
        submitted_experiments = []
        reused_experiments = []

        for (group, uniq_key), e in infos_by_key.iteritems():
            if e not in not_submitted_experiments:
                continue
            sambamba_output_fpath = join(cnf.work_dir,
                                         uniq_key + '__mutations.bed')
            sambamba_output_by_experiment[e] = sambamba_output_fpath

            if cnf.reuse_intermediate and verify_file(sambamba_output_fpath,
                                                      silent=True):
                info(sambamba_output_fpath + ' exists, reusing')
                reused_experiments.append(e)
                continue
            else:
                if not e.sample.bam:
                    err('Sample ' + e.sample.name + ' in ' + str(group) +
                        ', ' + str(uniq_key) + ' has no BAM')
                    continue
                j = sambamba_depth(cnf,
                                   mut_bed_fpath,
                                   e.sample.bam,
                                   output_fpath=sambamba_output_fpath,
                                   only_depth=True,
                                   silent=True,
                                   use_grid=True)
                submitted_experiments.append(e)

                if not j.is_done:
                    jobs_to_wait.append(j)
                if len(jobs_to_wait) >= cnf.threads:
                    break
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')
        not_submitted_experiments = [
            e for e in not_submitted_experiments
            if e not in submitted_experiments and e not in reused_experiments
        ]

    return sambamba_output_by_experiment
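
The outer while-loop implements simple batched submission: submit jobs until the batch reaches cnf.threads, wait for that batch, then repeat with whatever was neither submitted nor reused. A stripped-down sketch of the control flow, without the reuse branch (names are illustrative):

def run_in_batches(items, submit, wait_for, batch_size):
    pending = list(items)
    while pending:
        jobs, submitted = [], []
        for item in pending:
            jobs.append(submit(item))   # submit() returns a job handle
            submitted.append(item)
            if len(jobs) >= batch_size:
                break
        if jobs:
            wait_for(jobs)              # block until the batch finishes
        pending = [i for i in pending if i not in submitted]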