Esempio n. 1
0
def read_samples(sample2bam_fpath):
    """Read sample names and BAM paths from a sample listing file.

    Lines starting with '#' and empty lines are skipped. A line made of
    exactly two TAB-separated fields is read as "sample name, BAM path";
    any other line is taken as a bare BAM path, and the sample name is then
    derived from the BAM file name (extension dropped, a trailing '-ready'
    removed).

    Every BAM path is checked with verify_bam(). Paths that fail are
    collected and reported together through a single critical() call after
    the whole file has been read, so the user sees all bad entries at once.

    Returns:
        (sample_names, bam_fpaths): two parallel lists.
    """
    bam_fpaths = []
    sample_names = []
    bad_bam_fpaths = []

    info('Reading sample info from ' + sample2bam_fpath)
    with open(sample2bam_fpath) as f:
        for l in f:
            if l.startswith('#'):
                continue
            l = l.replace('\n', '')
            if not l:
                continue

            fields = l.split('\t')
            if len(fields) == 2:
                sample_name, bam_fpath = fields
            else:
                sample_name, bam_fpath = None, l

            # Verify once, non-critically: a critical check here would abort
            # on the first bad file and the combined report below could never
            # be produced.
            # NOTE(review): assumes verify_bam() returns the usable path on
            # success, as the original is_critical=True call relied on.
            verified_fpath = verify_bam(bam_fpath)
            if not verified_fpath:
                bad_bam_fpaths.append(bam_fpath)
                continue
            bam_fpath = verified_fpath
            bam_fpaths.append(bam_fpath)

            if sample_name is None:
                sample_name = basename(splitext(bam_fpath)[0])
                if sample_name.endswith('-ready'):
                    sample_name = sample_name.split('-ready')[0]
            sample_names.append(sample_name)
            info(sample_name + ': ' + bam_fpath)

    if bad_bam_fpaths:
        critical('BAM files cannot be found, empty or not BAMs:' + ', '.join(bad_bam_fpaths))

    return sample_names, bam_fpaths
def check_genome_resources(cnf):
    """Validate the genome section of ``cnf`` and normalise its file paths.

    * Calls critical() when no genome build is selected or when the system
      config has no "genomes" section.
    * Every string-valued genome entry except 'name' is passed through
      adjust_system_path(). When the resulting path does not verify and does
      not already end with '.gz', the '.gz' sibling is used instead if that
      one verifies.
    * Warns when any of ``features``, ``bed_annotation_features`` or ``cds``
      is missing.
    * Fills ``cnf.transcripts_fpath`` from get_canonical_transcripts() when
      it is not set.
    """
    if cnf.genome is None:
        critical('Please, specify genome build (one of available in ' +
                 cnf.sys_cnf +
                 ') using the --genome option (e.g., --genome hg38).')

    if not cnf.genomes:
        critical('"genomes" section is not specified in system config ' +
                 cnf.sys_cnf)

    info('Genome: ' + str(cnf.genome.name))

    for key in cnf.genome.keys():
        if key == 'name' or not isinstance(cnf.genome[key], basestring):
            continue

        cnf.genome[key] = adjust_system_path(cnf.genome[key])
        if verify_obj_by_path(cnf.genome[key], key, silent=True):
            continue
        if cnf.genome[key].endswith('.gz'):
            continue

        # The configured path is unusable: fall back to its gzipped sibling.
        # A single verify_file() call is enough; the original checked the
        # same path twice.
        gz_fpath = cnf.genome[key] + '.gz'
        if verify_file(gz_fpath, silent=True):
            cnf.genome[key] = gz_fpath

    if not cnf.genome.features or not cnf.genome.bed_annotation_features or not cnf.genome.cds:
        warn(
            'Warning: features and bed_annotation_features and cds in the system config ('
            + cnf.sys_cnf + ') must be specified.')

    if not cnf.transcripts_fpath:
        cnf.transcripts_fpath = get_canonical_transcripts(
            cnf.genome.name, ensembl=True)
Esempio n. 3
0
def print_genes(genes, output_fpath, canon_only):
    """Write gene, transcript and exon regions of ``genes`` to ``output_fpath``.

    Transcripts are taken from every gene (only canonical ones when
    ``canon_only`` is set) and processed in the order of their get_key().
    For each transcript:
      * its gene is emitted once, provided all considered transcripts of the
        gene are 'protein_coding' and the gene has exactly one canonical
        transcript or exactly one transcript overall;
      * the transcript and its exons are emitted when it has exons.
    Each region is written using its ``__str__()`` representation.
    """
    selected = [tr for g in genes for tr in g.transcripts
                if not canon_only or tr.is_canonical]

    regions = []
    genes_with_feature = set()
    for tr in sorted(selected, key=lambda _tr: _tr.get_key()):
        gene = tr.gene
        # Gene features are limited to protein-coding genes: miRNA genes can
        # have several domains at different locations under one gene name.
        all_coding = all(other.biotype == 'protein_coding'
                         for other in gene.transcripts
                         if (other.is_canonical or not canon_only))
        if all_coding and gene not in genes_with_feature and \
                (len(gene.canonical_transcripts) == 1 or len(gene.transcripts) == 1):
            regions.append(gene)
            genes_with_feature.add(gene)

        if tr.exons:
            regions.append(tr)
            regions.extend(tr.exons)

    info('Writing ' + str(len(regions)) + ' regions')
    with open(adjust_path(output_fpath), 'w') as all_out:
        for region in regions:
            all_out.write(region.__str__())
def main():
    """Entry point: parse the bcbio summary options and summarize TargQC.

    Logs the command line, lets bcbio_summary_script_proc_params() build the
    config and the bcbio structure (with two extra options, ``--bed`` and
    ``--features`` plus their aliases), and passes the adjusted BED paths to
    summarize_targqc().
    """
    info(' '.join(sys.argv))
    info()

    bed_opt = (
        ['--bed', '--capture', '--amplicons'],
        dict(dest='bed',
             help='BED file to run targetSeq and Seq2C analysis on.'))
    features_opt = (
        ['--exons', '--exome', '--features'],
        dict(dest='features',
             help='Annotated CDS/Exons/Gene/Transcript BED file to make targetSeq exon/amplicon regions reports.'))

    cnf, bcbio_structure = bcbio_summary_script_proc_params(
        BCBioStructure.targqc_name,
        BCBioStructure.targqc_summary_dir,
        extra_opts=[bed_opt, features_opt])

    bed_fpath = adjust_path(cnf.bed)
    features_bed_fpath = adjust_path(cnf.features)

    # Thread count defaults to one per sample when cnf.threads is not set.
    summarize_targqc(cnf,
                     cnf.threads or len(bcbio_structure.samples),
                     cnf.output_dir,
                     bcbio_structure.samples,
                     bed_fpath=bed_fpath,
                     features_fpath=features_bed_fpath)
Esempio n. 5
0
    def set_dirpath(self, dirpath, az_project_name):
        """Set the project directory and derive all dependent paths from it.

        ``dirpath`` must exist (verified critically). When it contains a
        'merged' sub-directory, that directory is used both for FastQ files
        and FastQC output; otherwise 'fastq' and 'fastq/FastQC' are used.
        The remaining report and downsample paths are plain joins relative
        to ``dirpath``.
        """
        self.dirpath = dirpath
        self.az_project_name = az_project_name
        verify_dir(self.dirpath, is_critical=True)

        merged_dirpath = join(self.dirpath, 'merged')
        if verify_dir(merged_dirpath, silent=True):
            self.mergred_dir_found = True
            self.fastq_dirpath = merged_dirpath
            self.fastqc_dirpath = merged_dirpath
        else:
            self.mergred_dir_found = False
            self.fastq_dirpath = join(self.dirpath, 'fastq')
            self.fastqc_dirpath = join(self.fastq_dirpath, 'FastQC')
        info()

        self.comb_fastqc_fpath = join(self.fastqc_dirpath, 'FastQC.html')
        # Cleared first; both receive their final values a few lines below.
        self.downsample_targqc_report_fpath = None
        self.project_report_html_fpath = None

        self.downsample_metamapping_dirpath = join(
            self.dirpath, 'Downsample_MetaMapping')
        self.downsample_targqc_dirpath = join(
            self.dirpath, 'Downsample_TargQC')
        self.downsample_targqc_report_fpath = join(
            self.downsample_targqc_dirpath, 'targQC.html')
        self.project_report_html_fpath = join(
            self.dirpath, az_project_name + '.html')
Esempio n. 6
0
def finalize_one(cnf, qc_report_fpath, qc_plots_fpaths):
    """Log where the QC report and the QC plots were saved.

    When no plots were produced and matplotlib cannot be verified, a warning
    explains why. ``cnf`` is accepted but not used here.
    """
    if qc_report_fpath:
        info('Saved QC report to ' + qc_report_fpath)

    if not qc_plots_fpaths:
        if not verify_module('matplotlib'):
            warn('Warning: QC plots were not generated because matplotlib is not installed.')
        return

    info('Saved QC plots are in: ' + ', '.join(qc_plots_fpaths))
Esempio n. 7
0
def main():
    """Entry point: prepare the target BED and run Seq2C on the samples.

    Steps:
      * parse the arguments with proc_args();
      * unless ``cnf.prep_bed`` is False: fall back to ``cnf.genome.cds``
        when no BED was given, pass BEDs with fewer than 4 columns through
        prepare_beds(), and copy the resulting BED into the output directory
        (best effort: a failed copy is logged, not fatal);
      * verify the BED critically and call run_seq2c().
    """
    cnf, samples, bed_fpath, output_dir = proc_args(sys.argv)
    info('Processing ' + str(len(samples)) + ' samples')

    if cnf.prep_bed is not False:
        if not bed_fpath:
            info('No input BED is specified, using CDS instead from ' + str(cnf.genome.cds))
            bed_fpath = verify_bed(cnf.genome.cds, 'CDS bed file for ' + cnf.genome.name)

        # The file name is taken before prepare_beds() possibly replaces
        # bed_fpath, so the copy keeps the name of the input BED.
        seq2c_bed_fname = basename(bed_fpath)

        bed_cols = count_bed_cols(bed_fpath)
        if bed_cols < 4:
            check_genome_resources(cnf)
            _, _, _, bed_fpath = prepare_beds(cnf, None, None, bed_fpath)

        try:
            copyfile(bed_fpath, join(output_dir, seq2c_bed_fname))
        except (IOError, OSError):
            # On Python 2 shutil.copyfile reports unreadable/unwritable files
            # with IOError, which is not a subclass of OSError there; both
            # are caught so the copy stays best-effort.
            err(format_exc())
            info()
        else:
            info('Seq2C bed file is saved in ' + join(output_dir, seq2c_bed_fname))

    bed_fpath = verify_bed(bed_fpath, is_critical=True, description='Input BED file')
    info('Using target ' + bed_fpath)

    run_seq2c(cnf, output_dir, samples, bed_fpath, cnf.is_wgs)
    def _convert_vcf(inp_f, out_f):
        """Stream VCF records from ``inp_f`` through ``proc_rec_fun`` to ``out_f``.

        ``proc_rec_fun``, ``input_fpath``, ``args`` and ``kwargs`` are taken
        from the enclosing scope. Each raw record is wrapped in ``Record``
        together with its 0-based index; records for which the callback
        returns a truthy value are buffered and written in batches.
        """
        flush_threshold = 100000
        kept_total = 0
        pending = []

        reader = vcf_parser.Reader(inp_f)
        writer = vcf_parser.Writer(out_f, reader)

        rec_idx = 0
        raw_rec = next(reader, None)
        while raw_rec is not None:
            processed = proc_rec_fun(Record(raw_rec, input_fpath, rec_idx), *args, **kwargs)
            if processed:
                pending.append(processed)
                kept_total += 1

            # Flush in batches so the buffer never grows without bound.
            if len(pending) >= flush_threshold:
                writer.write_records(pending)
                info('Written lines: ' + str(kept_total))
                pending = []

            rec_idx += 1
            raw_rec = next(reader, None)

        # Write whatever is left after the last full batch.
        writer.write_records(pending)
        info('Written lines: ' + str(kept_total))
def leave_main_sample(cnf, vcf_fpath, samplename):
    """Reduce a VCF to the single sample column belonging to ``samplename``.

    When get_sample_column_index() returns None the input path is returned
    untouched. Otherwise the '#CHROM' header line and every record line are
    cut down to their first nine columns plus the selected sample column;
    other '#' lines pass through unchanged. The rewrite is done by
    iterate_file() with the '1sm' suffix.

    Returns the resulting path, or None when the output fails verify_file().
    """
    index = get_sample_column_index(vcf_fpath, samplename)
    if index is None:
        return vcf_fpath

    info('Keeping SAMPLE only for the first sample (' + samplename + ')')

    sample_col = 9 + index

    def _f(line, i):
        needs_cut = line and (line.startswith('#CHROM') or line[0] != '#')
        if not needs_cut:
            return line
        ts = line.split('\t')
        return '\t'.join(ts[:9] + [ts[sample_col]])

    vcf_fpath = iterate_file(cnf, vcf_fpath, _f, suffix='1sm')

    if verify_file(vcf_fpath):
        return vcf_fpath

    err('Error: leave_first_sample didnt generate output file.')
    return None
Esempio n. 10
0
def get_chr_lengths_from_seq(seq_fpath):
    """Return chromosome lengths for a reference sequence.

    ``seq_fpath`` may point to the FASTA file or to its '.fai' index. The
    index is preferred when it verifies; otherwise the FASTA itself is parsed
    with Biopython. critical() is called when neither file verifies.

    Returns:
        A list of ``(chromosome name, length)`` tuples in file order. The
        length is an ``int`` in both branches (the '.fai' branch used to
        return it as a string, unlike the FASTA branch).
    """
    chr_lengths = []

    if seq_fpath.endswith('.fai'):
        seq_fpath = splitext(seq_fpath)[0]
    fai_fpath = seq_fpath + '.fai'

    if verify_file(fai_fpath, silent=True):
        info('Reading genome index file (.fai) to get chromosome lengths')
        with open(adjust_path(fai_fpath), 'r') as handle:
            for line in handle:
                fields = line.split()
                if fields:
                    # First two .fai columns: sequence name and its length.
                    chr_lengths.append((fields[0], int(fields[1])))
    elif verify_file(seq_fpath, silent=True):
        info('Reading genome sequence (.fa) to get chromosome lengths')
        with open(adjust_path(seq_fpath), 'r') as handle:
            # Imported lazily so Biopython is needed only when no index exists.
            from Bio import SeqIO
            for record in SeqIO.parse(handle, 'fasta'):
                chr_lengths.append((record.id, len(record.seq)))
    else:
        critical('Can\'t find ' + seq_fpath + ' and ' + seq_fpath + '.fai')
    return chr_lengths
def _get_gene_transcripts_id(cnf):
    """Map gene IDs and transcript IDs to gene symbols.

    Reads the tab-separated file at ``cnf.genome.all_transcripts`` (nine
    columns per line, attributes in the last one), keeps only lines whose
    feature column is 'transcript', and takes 'gene_name', 'gene_id' and
    'transcript_id' from the attribute column.

    critical() is called when the file is not configured or does not verify.

    Returns:
        (genes_dict, transcripts_dict): gene_id -> gene symbol and
        transcript_id -> gene symbol.
    """
    genes_dict = dict()
    transcripts_dict = dict()

    if not cnf.genome.all_transcripts:
        critical('File with transcripts and genes ID ' + cnf.genome.name + ' was not found! Heatmaps cannot be created.')
    if not verify_file(cnf.genome.all_transcripts):
        critical('File with transcripts and genes ID ' + cnf.genome.name + ' at ' + cnf.genome.all_transcripts + ' was not found! Heatmaps cannot be created.')

    info('Getting transcripts ID and genes ID from ' + cnf.genome.all_transcripts)

    with open_gzipsafe(cnf.genome.all_transcripts) as f:
        for l in f:
            if l.startswith('#'):
                continue
            chrom, _, feature, start, end, _, strand, _, props_line = l.replace('\n', '').split('\t')
            if feature != 'transcript':
                continue
            try:
                # Attributes look like 'key value; key value; ...'.
                _prop_dict = dict((t.strip().split(' ')[0], ' '.join(t.strip().split(' ')[1:]))
                                  for t in props_line.split(';') if t.strip())
            except ValueError:
                sys.stderr.write(format_exc())
                sys.stderr.write(l)
                # Skip the line: without this, the code below would run with
                # the attributes of the previous line (or with none at all).
                continue

            gene_symbol = _rm_quotes(_prop_dict['gene_name'])
            gene_id = _rm_quotes(_prop_dict['gene_id'])
            transcript_id = _rm_quotes(_prop_dict['transcript_id'])
            genes_dict[gene_id] = gene_symbol
            transcripts_dict[transcript_id] = gene_symbol
    return genes_dict, transcripts_dict
Esempio n. 12
0
def print_sample_tracks_info(sample_name, project_name, bam_link, bigwig_link,
                             vcf_link, jbrowse_tracks_fpath):
    """Append the jBrowse track sections for one sample to a tracks file.

    Three sections are always appended to ``jbrowse_tracks_fpath``: BAM
    alignments, BAM-based SNP coverage and BigWig coverage. A fourth section
    for variants is appended only when ``vcf_link`` is truthy.

    The sections are written with ``file.write`` rather than the
    Python-2-only ``print >> file`` statement, so the function also works
    under Python 3; the bytes written are the same.
    """
    fields = dict(sample_name=sample_name,
                  project_name=project_name,
                  bam_link=bam_link,
                  bigwig_link=bigwig_link,
                  vcf_link=vcf_link)

    sections = [
        '\n[ tracks.{sample_name} ]\n'
        '\nstoreClass     = JBrowse/Store/SeqFeature/BAM'
        '\nurlTemplate    = {bam_link}'
        '\nbaiUrlTemplate = {bam_link}.bai'
        '\nchunkSizeLimit = 100000000'
        '\nmaxHeight      = 10000'
        '\ncategory = {project_name}'
        '\ntype = JBrowse/View/Track/Alignments2'
        '\nkey  = {sample_name}\n',

        '\n[ tracks.{sample_name}_cov ]\n'
        '\nstoreClass     = JBrowse/Store/SeqFeature/BAM'
        '\nurlTemplate    = {bam_link}'
        '\nbaiUrlTemplate = {bam_link}.bai'
        '\nchunkSizeLimit = 100000000'
        '\ncategory = {project_name}'
        '\ntype = SNPCoverage'
        '\nkey  = {sample_name}_coverage_bam\n',

        '\n[ tracks.{sample_name}_bigwig ]\n'
        '\nstoreClass     = JBrowse/Store/SeqFeature/BigWig'
        '\nurlTemplate    = {bigwig_link}'
        '\ncategory = {project_name}'
        '\ntype = JBrowse/View/Track/Wiggle/XYPlot'
        '\nautoscale = local'
        '\nkey  = {sample_name}_coverage\n',
    ]
    if vcf_link:
        sections.append(
            '\n[ tracks.{sample_name}_vcf ]\n'
            '\nstoreClass     = JBrowse/Store/SeqFeature/VCFTabix'
            '\nurlTemplate    = {vcf_link}'
            '\ncategory = {project_name}'
            '\ntype = JBrowse/View/Track/CanvasVariants'
            '\nkey  = {sample_name}_variants\n')

    with open(jbrowse_tracks_fpath, 'a') as tracks:
        for section in sections:
            # The extra '\n' is the newline the print statement used to add.
            tracks.write(section.format(**fields) + '\n')
    info(sample_name + ' was successfully exported to jBrowse!')
Esempio n. 13
0
def join_vcf2txt_results(cnf, vcf_fpath_by_sample, vcf2txt_out_fpath):
    """Concatenate the per-sample vcf2txt outputs into ``vcf2txt_out_fpath``.

    The per-sample output path is ``vcf2txt_out_fpath`` with the VCF base
    name added as a suffix (via add_suffix()). The files are read in the
    order of ``vcf_fpath_by_sample``; the first line (header) is kept only
    from the first file. Writing goes through file_transaction().

    Returns ``vcf2txt_out_fpath`` when it verifies, otherwise None.
    """
    info('WGS; running vcftxt separately for each sample to save memory.')
    vcf2txt_outputs_by_vcf_fpath = OrderedDict()
    for vcf_fpath in vcf_fpath_by_sample.values():
        suffix = splitext(basename(vcf_fpath))[0]
        vcf2txt_outputs_by_vcf_fpath[vcf_fpath] = add_suffix(vcf2txt_out_fpath, suffix)
        info()

    found_num = str(len(vcf2txt_outputs_by_vcf_fpath))
    total_num = str(len(vcf_fpath_by_sample))
    info('Joining vcf2txt ouputs... (' + found_num + ' out of ' + total_num +
         ' successful), ' + 'writing to ' + vcf2txt_out_fpath)

    with file_transaction(cnf.work_dir, vcf2txt_out_fpath) as tx:
        with open(tx, 'w') as out:
            for file_idx, sample_output_fpath in enumerate(
                    vcf2txt_outputs_by_vcf_fpath.values()):
                info('   Reading ' + sample_output_fpath)
                with open(sample_output_fpath) as inp:
                    for line_idx, line in enumerate(inp):
                        # Keep the header line of the first file only.
                        if line_idx != 0 or file_idx == 0:
                            out.write(line)

    if not verify_file(vcf2txt_out_fpath):
        return None
    info('Saved ' + vcf2txt_out_fpath)
    return vcf2txt_out_fpath
Esempio n. 14
0
def __final_seq2c_scripts(cnf, read_stats_fpath, combined_gene_depths_fpath, output_fpath):
    """Run the two final Seq2C Perl scripts: cov2lr.pl, then lr2gene.pl.

    cov2lr.pl writes to '<output base name>.cov2lr.tsv' in ``cnf.work_dir``;
    lr2gene.pl reads that file and writes to ``output_fpath``. When
    ``cnf.controls`` is set, '-c <controls>' is passed to cov2lr.pl and '-c'
    to lr2gene.pl. ``cnf.seq2c_opts`` is appended to the lr2gene.pl call.

    Returns the result of the second call(), or None when either output file
    fails verify_file().
    """
    cov2lr = get_script_cmdline(cnf, 'perl', join('Seq2C', 'cov2lr.pl'), is_critical=True)
    cov2lr_output = join(cnf.work_dir, splitext(basename(output_fpath))[0] + '.cov2lr.tsv')

    if cnf.controls:
        controls, lr2gene_opt = '-c ' + cnf.controls, '-c'
    else:
        controls, lr2gene_opt = '', ''

    cmdline = '{cov2lr} -a {controls} {read_stats_fpath} {combined_gene_depths_fpath}'.format(
        cov2lr=cov2lr,
        controls=controls,
        read_stats_fpath=read_stats_fpath,
        combined_gene_depths_fpath=combined_gene_depths_fpath)
    call(cnf, cmdline, cov2lr_output, exit_on_error=False)
    info()

    if not verify_file(cov2lr_output):
        return None

    seq2c_opts = cnf.seq2c_opts or ''

    lr2gene = get_script_cmdline(cnf, 'perl', join('Seq2C', 'lr2gene.pl'), is_critical=True)
    cmdline = '{lr2gene} {lr2gene_opt} {seq2c_opts} {cov2lr_output}'.format(
        lr2gene=lr2gene,
        lr2gene_opt=lr2gene_opt,
        seq2c_opts=seq2c_opts,
        cov2lr_output=cov2lr_output)
    res = call(cnf, cmdline, output_fpath, exit_on_error=False)
    info()

    return res if verify_file(output_fpath) else None
Esempio n. 15
0
def __call(cnf, cmdline, output_fpath=None):
    """Run ``cmdline`` through the shell and return its exit code.

    Args:
        cnf: config object; only ``cnf.debug`` is read. In debug mode the
            command is logged and stderr is inherited, otherwise stderr is
            discarded to /dev/null.
        cmdline: the shell command line to execute.
        output_fpath: when given, stdout is redirected into this file
            (opened for writing, so an existing file is truncated).

    Returns:
        The return code reported by subprocess.call().
    """
    stdout = open(output_fpath, 'w') if output_fpath else None
    stderr = None
    try:
        if cnf.debug:
            info(cmdline)
        else:
            stderr = open('/dev/null', 'w')
        return subprocess.call(cmdline, shell=True, stdout=stdout, stderr=stderr, stdin=None)
    finally:
        # Close the handles opened here; they used to leak on every call.
        for handle in (stdout, stderr):
            if handle is not None:
                handle.close()
def set_up_log(cnf,
               proc_name=None,
               project_name=None,
               project_fpath=None,
               output_dir=None):
    """Configure the module-level ``logger`` and choose the log file.

    The logger always receives the process name, project name, project path
    (falling back to ``output_dir``), the e-mail address from ``cnf.email``
    with quotes removed, and the SMTP host.

    When ``cnf.log_dir`` is set, the log file is
    '[<proc_name>_][<cnf.sample>_]log.txt' inside it. An existing log file is
    renamed to '<log>.<modification time>' first; failures of that rename are
    ignored on purpose. The chosen path is stored in both ``logger.log_fpath``
    and ``cnf.log``.
    """
    logger.proc_name = proc_name
    logger.project_name = project_name
    logger.project_fpath = project_fpath or output_dir
    logger.cnf_address = remove_quotes(cnf.email) if cnf.email else ''
    logger.smtp_host = cnf.smtp_host

    if not cnf.log_dir:
        return

    name_parts = []
    if proc_name:
        name_parts.append(proc_name + '_')
    if cnf.sample:
        name_parts.append(cnf.sample + '_')
    name_parts.append('log.txt')
    log_fpath = join(cnf.log_dir, ''.join(name_parts))

    if file_exists(log_fpath):
        modified_at = datetime.datetime.fromtimestamp(os.stat(log_fpath).st_mtime)
        mv_log_fpath = log_fpath + '.' + modified_at.strftime("%Y-%m-%d_%H-%M-%S")
        try:
            if isfile(mv_log_fpath):
                os.remove(mv_log_fpath)
            if not isfile(mv_log_fpath):
                os.rename(log_fpath, mv_log_fpath)
        except OSError:
            # Best effort: keep logging into the old file if it can't be moved.
            pass

    info('log_fpath: ' + log_fpath)
    info()
    logger.log_fpath = cnf.log = log_fpath
Esempio n. 17
0
def _submit_job(cnf,
                step,
                sample_name='',
                wait_for_steps=None,
                threads=1,
                is_critical=True,
                **kwargs):
    """Build the command line for ``step`` and submit it as a job.

    The tool path comes from get_system_path() for the step's interpreter
    and script. The step's ``param_line`` template is filled from ``kwargs``
    plus ``sample_name``.

    Returns False when the tool path cannot be resolved, otherwise whatever
    submit_job() returns.
    """
    tool_cmdline = get_system_path(cnf,
                                   step.interpreter,
                                   step.script,
                                   is_critical=is_critical)
    if not tool_cmdline:
        return False

    format_args = dict(kwargs, sample_name=sample_name)
    cmdline = ' '.join([tool_cmdline, step.param_line.format(**format_args)])

    info(step.name)

    job = submit_job(cnf,
                     cmdline,
                     job_name=step.job_name(sample_name),
                     wait_for_steps=wait_for_steps,
                     threads=threads)

    info()
    return job
Esempio n. 18
0
def _symlink_vcfs(callers, datestamp_var_dirpath):
    """Symlink the filtered VCFs of every caller/sample into the sample dir.

    For each sample the filtered VCF is looked up by caller name. When it
    verifies, the '.gz', '.idx' and '.gz.tbi' companions of its uncompressed
    name are symlinked into ``sample.dirpath`` if they verify. Afterwards
    BCBioStructure.move_vcfs_to_var() is called for the sample in any case.
    ``datestamp_var_dirpath`` is accepted but not used here.

    Returns a list of ``[sample name, caller name, vcf path]`` entries for
    the VCFs that did not verify.
    """
    errory = []
    for caller in callers:
        info(caller.name)
        for sample in caller.samples:
            info(sample.name)

            filt_vcf_fpath = sample.find_filt_vcf_by_callername(caller.name)
            if verify_file(filt_vcf_fpath):
                if filt_vcf_fpath.endswith('.gz'):
                    base_filt_fpath = filt_vcf_fpath[:-3]
                else:
                    base_filt_fpath = filt_vcf_fpath

                for ext in ('.gz', '.idx', '.gz.tbi'):
                    fpath = base_filt_fpath + ext
                    if verify_file(fpath, silent=True):
                        _symlink_to_dir(fpath, sample.dirpath)
            else:
                errory.append([sample.name, caller.name, filt_vcf_fpath])

            BCBioStructure.move_vcfs_to_var(sample)

    return errory
Esempio n. 19
0
def annotate_target(cnf, target_bed):
    """Annotate ``target_bed`` by running the external annotate_bed.py script.

    The output path is the intermediate file name of ``target_bed`` with the
    'ann' suffix. It is returned right away when no
    ``bed_annotation_features`` is configured, or when can_reuse() says the
    existing output is up to date. Otherwise the features BED is verified
    critically, annotate_bed.py is located on PATH (critical() if missing)
    and executed, and the result is passed through remove_comments().

    NOTE(review): with no ``bed_annotation_features`` configured, the
    returned path is not created by this function -- confirm that callers
    expect this.
    """
    output_fpath = intermediate_fname(cnf, target_bed, 'ann')
    if not cnf.genome.bed_annotation_features:
        return output_fpath
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    # Called for the critical check only; the command line below does not
    # use the returned path.
    verify_bed(
        cnf.genome.bed_annotation_features,
        is_critical=True,
        description='bed_annotation_features in system config')

    annotate_bed_py = which('annotate_bed.py')
    if not annotate_bed_py:
        critical(
            'Error: annotate_bed.py not found in PATH, please install TargQC.')

    template = '{annotate_bed_py} {target_bed} --work-dir {cnf.work_dir} -g {cnf.genome.name} ' \
               '-o {output_fpath} --canonical'
    cmdline = template.format(annotate_bed_py=annotate_bed_py,
                              target_bed=target_bed,
                              cnf=cnf,
                              output_fpath=output_fpath)
    call(cnf, cmdline, output_fpath, stdout_to_outputfile=False)

    return remove_comments(cnf, output_fpath)
Esempio n. 20
0
def launch_bedcoverage_hist(work_dir,
                            bed,
                            bam,
                            chr_lengths_fpath,
                            bedcov_output_fpath=None,
                            bedtools='bedtools'):
    """Run ``bedtools coverage -hist`` for ``bam`` over ``bed``.

    When ``bedcov_output_fpath`` is not given, the output goes to
    '<bed base>__<bam base>_bedcov_output.txt' in ``work_dir``. An input
    whose name ends with 'bam' is first converted with bam_to_bed_nocnf().
    The command-line form depends on bedtools_version(): for a version value
    of 24 or higher the sorted mode with a genome file is used.

    The command is executed with os.system(); the outcome is only logged and
    nothing is returned.
    """
    if not bedcov_output_fpath:
        bedcov_fname = (splitext_plus(basename(bed))[0] + '__' +
                        splitext_plus(basename(bam))[0] + '_bedcov_output.txt')
        bedcov_output_fpath = join(work_dir, bedcov_fname)

    if bam.endswith('bam'):
        bam = bam_to_bed_nocnf(bam, bedtools)
    verify_file(bam,
                is_critical=True,
                description='BAM to BED conversion result')

    v = bedtools_version(bedtools)
    if v and v >= 24:
        template = '{bedtools} coverage -sorted -g {chr_lengths_fpath} -a {bed} -b {bam} -hist'
    else:
        template = '{bedtools} coverage -a {bam} -b {bed} -hist'
    cmdline = template.format(bedtools=bedtools,
                              chr_lengths_fpath=chr_lengths_fpath,
                              bed=bed,
                              bam=bam)
    cmdline += ' > ' + bedcov_output_fpath
    info(cmdline)
    os.system(cmdline)

    if verify_file(bedcov_output_fpath):
        info('Done, saved to ' + bedcov_output_fpath)
    else:
        err('Error, result is non-existent or empty')
Esempio n. 21
0
def parse_variants(fpath):
    """Collect the set of genes seen for each sample in a variants table.

    ``fpath`` is a TAB-separated file whose first line is a header that must
    contain the 'Sample' and 'Gene' columns; when one is missing, a warning
    is logged and an empty dict is returned.

    Line endings are stripped before splitting, so the columns are found and
    their values are clean even when they are the last column of the table.
    Rows too short to hold both columns (e.g. blank lines) are skipped.

    Returns:
        dict mapping sample name -> set of gene names.
    """
    sample_column_name = 'Sample'
    gene_column_name = 'Gene'

    genes_per_sample = dict()
    with open(fpath) as f:
        header = f.readline().rstrip('\r\n').split('\t')
        for column_name in (sample_column_name, gene_column_name):
            if column_name not in header:
                warn('"' + column_name + '" is not found in ' + fpath +
                     ' header, skipping this file!')
                return genes_per_sample
        sample_column_id = header.index(sample_column_name)
        gene_column_id = header.index(gene_column_name)
        min_fields = max(sample_column_id, gene_column_id) + 1

        for line in f:
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < min_fields:
                continue
            sample = fields[sample_column_id]
            gene = fields[gene_column_id]
            genes_per_sample.setdefault(sample, set()).add(gene)

    info('Found info for %d samples:' % len(genes_per_sample))
    for k, v in genes_per_sample.items():
        info('\t%s (%d unique genes)' % (k, len(v)))
    return genes_per_sample
Esempio n. 22
0
    def __init__(self, dirpath, az_prjname_by_subprj, samplesheet=None):
        """Set up the NextSeq500 dataset structure.

        ``az_prjname_by_subprj`` is either a single project name (a string)
        used for every sub-project, or a mapping from sub-project name to
        project name. When the mapping has no entry for a sub-project, its
        first value is used if there is only one sub-project; with several
        sub-projects a warning is issued and that sub-project is skipped.

        For every remaining project the directory layout is set relative to
        ``self.unaligned_dirpath`` and the output directories of its samples
        are initialised.
        """
        info('Parsing the NextSeq500 project structure')
        self.kind = 'nextseq500'
        DatasetStructure.__init__(self,
                                  dirpath,
                                  az_prjname_by_subprj,
                                  samplesheet=samplesheet)
        info('az_prjname_by_subprj: ' + str(az_prjname_by_subprj))

        verify_dir(self.unaligned_dirpath, is_critical=True)

        single_name_given = isinstance(az_prjname_by_subprj, basestring)
        for pname, project in self.project_by_name.items():
            if single_name_given:
                az_proj_name = az_prjname_by_subprj
            else:
                az_proj_name = az_prjname_by_subprj.get(pname)

            if az_proj_name is None:
                if len(self.project_by_name) > 1:
                    warn(
                        'Warn: cannot correspond subproject ' + pname +
                        ' and project names and JIRA cases. '
                        'Please, follow the SOP for multiple-project run: http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Pre+Processing+QC+Reporting'
                    )
                    continue
                # Single sub-project: take the only name available.
                az_proj_name = az_prjname_by_subprj.values()[0]

            project.set_dirpath(self.unaligned_dirpath, az_proj_name)
            for sample in project.sample_by_name.values():
                sample.source_fastq_dirpath = project.dirpath
                sample.set_up_out_dirs(project.fastq_dirpath,
                                       project.fastqc_dirpath,
                                       project.downsample_targqc_dirpath)

        self.basecall_stat_html_reports = self.__get_basecall_stats_reports()

        self.get_fastq_regexp_fn = get_nextseq500_regexp
Esempio n. 23
0
    def _proc_file(inp_f, out_f, ctx=None):
        """Copy ``inp_f`` to ``out_f``, passing each line through ``proc_line_fun``.

        ``proc_line_fun`` comes from the enclosing scope and is called with
        the line (newline removed), its 0-based index and, when ``ctx`` is
        truthy, ``ctx``. A None result drops the line; any other result is
        written followed by a newline. Lines that are empty once the newline
        is removed are copied as they are. Output is buffered and written in
        batches.
        """
        flush_threshold = 1000 * 1000
        total_written = 0
        pending = []

        for i, line in enumerate(inp_f):
            stripped = line.replace('\n', '')
            if not stripped:
                pending.append(line)
                total_written += 1
            else:
                extra_args = (ctx,) if ctx else ()
                result = proc_line_fun(stripped, i, *extra_args)
                if result is not None:
                    pending.append(result + '\n')
                    total_written += 1

            if len(pending) >= flush_threshold:
                out_f.writelines(pending)
                info('Written lines: ' + str(total_written))
                pending = []

        # Write the remainder that did not fill a whole batch.
        out_f.writelines(pending)
        info('Written lines: ' + str(total_written))
Esempio n. 24
0
def write_to_sqlite(work_dir,
                    jira_case,
                    project_list_fpath,
                    country_id,
                    project_name,
                    samples_num=None,
                    analysis_dirpath=None,
                    html_report_url=None):
    """Insert or update the record of a project in the SQLite project list.

    The record is looked up in the ``project`` table by ``PID`` and ``Name``
    (both equal to ``project_name``). When it exists, the collected fields
    are written with UPDATE, otherwise a new row is added with INSERT.

    'Name' and 'HTML_report_path' are overwritten only when
    ``analysis_dirpath`` is given (i.e. running after bcbio) or when the
    existing record has no value for them.

    NOTE(review): the original statement was a non-executable placeholder
    ("IF EXISTS ... UPDATE project SET (...) ..."), so the column layout is
    an assumption: this implementation expects the ``project`` table to
    have a column for every field name used below plus ``PID`` -- confirm
    against the real schema.

    Args:
        work_dir: accepted for interface compatibility; not used here.
        jira_case: optional object providing url, description, summary,
            data_hub, type, department, division, assignee and reporter.
        project_list_fpath: path to the SQLite database file.
        country_id: suffix of the 'Analyses_directory_<id>' column ('US' is
            used instead when is_local() is true).
        project_name: project name, also used as the PID.
        samples_num: optional number of samples.
        analysis_dirpath: optional analysis directory.
        html_report_url: optional URL of the HTML report.
    """
    def _quote_ident(name):
        # Column names are partly built from country_id, so quote them.
        return '"' + name.replace('"', '""') + '"'

    info('Reading project list ' + project_list_fpath)
    pid = project_name

    conn = sqlite3.connect(project_list_fpath)
    try:
        conn.row_factory = sqlite3.Row
        c = conn.cursor()

        # Values always go in as bound parameters, never via string formatting.
        c.execute('SELECT * FROM project WHERE PID=? AND Name=?',
                  (pid, project_name))
        row = c.fetchone()
        existing = dict(zip(row.keys(), row)) if row is not None else dict()

        d = dict()
        if analysis_dirpath:
            d['Analyses_directory_' +
              (country_id if not is_local() else 'US')] = analysis_dirpath
        # Update only if running after bcbio, or no value there at all.
        if project_name and (
                analysis_dirpath or not __unquote(existing.get('Name') or '')):
            d['Name'] = project_name
        if html_report_url and (
                analysis_dirpath or
                not __unquote(existing.get('HTML_report_path') or '')):
            d['HTML_report_path'] = html_report_url

        if jira_case:
            d['JIRA_URL'] = jira_case.url
            d['Updated_By'] = getpass.getuser()
            # NOTE(review): guarded by .description but stores .summary, as
            # in the original -- confirm this is intended.
            if jira_case.description:
                d['Description'] = jira_case.summary
            if jira_case.data_hub:
                d['Data_Hub'] = jira_case.data_hub
            if jira_case.type:
                d['Type'] = jira_case.type
            if jira_case.department:
                d['Department'] = jira_case.department
            if jira_case.division:
                d['Division'] = jira_case.division
            if jira_case.assignee:
                d['Assignee'] = jira_case.assignee
            if jira_case.reporter:
                d['Reporter'] = jira_case.reporter

        if samples_num:
            d['Sample_Number'] = str(samples_num)

        d['Datestamp'] = timestamp()

        if row is not None:
            columns = sorted(d)
            assignments = ', '.join(_quote_ident(col) + '=?' for col in columns)
            c.execute('UPDATE project SET ' + assignments +
                      ' WHERE PID=? AND Name=?',
                      [d[col] for col in columns] + [pid, project_name])
        else:
            d.setdefault('PID', pid)
            columns = sorted(d)
            c.execute('INSERT INTO project (' +
                      ', '.join(_quote_ident(col) for col in columns) +
                      ') VALUES (' + ', '.join(['?'] * len(columns)) + ')',
                      [d[col] for col in columns])
        conn.commit()
    finally:
        conn.close()
Esempio n. 25
0
def proc_args(argv):
    """Parse the Seq2C command line and build the run configuration.

    NOTE: ``argv`` is accepted for interface compatibility, but the options
    are read from ``sys.argv`` (OptionParser.parse_args() default).

    The single positional argument is either a sample-to-BAM listing (read
    with read_samples()) or, when it ends with '.bam' or several arguments
    are given, the inputs are resolved with find_bams().

    Side effects: sets ``logger.is_debug``, creates the output directory,
    fills in ``cnf.project_name`` / ``cnf.proc_name``, calls set_up_dirs(),
    and (unless ``cnf.only_summary``) verifies the qsub runner. Exits with
    status 1 after printing the usage when no positional argument is given.

    Returns:
        (cnf, samples, target_bed, output_dir) where ``samples`` is a sorted
        list of TargQC_Sample objects and ``target_bed`` is the verified
        ``--bed`` path or None.
    """
    info(' '.join(sys.argv))
    info()

    # The usage line used to advertise a nonexistent '--contols' option.
    description = 'This script generates target QC reports for each BAM provided as an input. ' \
                  'Usage: ' + basename(__file__) + ' sample2bam.tsv --bed target.bed --controls sample1:sample2 -o results_dir'
    parser = OptionParser(description=description, usage=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('-o', dest='output_dir', metavar='DIR', default=join(os.getcwd(), 'seq2c'))
    parser.add_option('--bed', dest='bed', help='BED file to run Seq2C analysis')
    parser.add_option('-c', '--controls', dest='controls', help='Optional control sample names for Seq2C. For multiple controls, separate them using :')
    parser.add_option('--seq2c-opts', dest='seq2c_opts', help='Options for the final lr2gene.pl script.')
    parser.add_option('--no-prep-bed', dest='prep_bed', help=SUPPRESS_HELP, action='store_false', default=True)

    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    if len(args) == 0:
        parser.print_usage()
        sys.exit(1)
    if len(args) == 1 and not args[0].endswith('.bam'):
        sample_names, bam_fpaths = read_samples(verify_file(args[0], is_critical=True, description='Input sample2bam.tsv'))
        bam_by_sample = OrderedDict()
        for s, b in zip(sample_names, bam_fpaths):
            bam_by_sample[s] = b
    else:
        bam_by_sample = find_bams(args)

    # Without a target BED the run is treated as whole-genome.
    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)
    check_genome_resources(cnf)

    cnf.output_dir = adjust_path(cnf.output_dir)
    verify_dir(dirname(cnf.output_dir), is_critical=True)
    safe_mkdir(cnf.output_dir)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'Seq2C'
    set_up_dirs(cnf)

    samples = [
        source.TargQC_Sample(name=s_name, dirpath=join(cnf.output_dir, s_name), bam=bam_fpath)
            for s_name, bam_fpath in bam_by_sample.items()]
    info('Samples: ')
    for s in samples:
        info('  ' + s.name)
    samples.sort(key=lambda _s: _s.key_to_sort())

    target_bed = verify_bed(cnf.bed, is_critical=True) if cnf.bed else None

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner: critical('Error: qsub-runner is not provided is sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    return cnf, samples, target_bed, cnf.output_dir
Esempio n. 26
0
def _tracks(cnf, track_fpath, input_fpath):
    """Annotate a VCF with a track file via ``vcfannotate``.

    The INFO key added by ``vcfannotate`` is named after the track file
    (its base name without extensions).  After annotation, the value of
    that key is rewritten on every data line to ``TRUE`` (non-empty value)
    or ``FALSE`` (empty value).

    :param cnf: run configuration; ``cnf.reuse_intermediate`` is read here,
        and ``cnf`` is passed through to the helper calls.
    :param track_fpath: path to the track file.
    :param input_fpath: path to the VCF to annotate.
    :return: ``None`` if the track file cannot be verified or the
        ``vcfannotate`` executable is not found; the existing output path
        when it is being reused; the unverified output path if the
        annotated file fails verification; otherwise the result of
        ``verify_vcf(..., is_critical=True)`` on the final file.
    """
    if not verify_file(track_fpath):
        return None

    field_name = splitext_plus(basename(track_fpath))[0]

    step_greetings('Intersecting with ' + field_name)

    output_fpath = intermediate_fname(cnf, input_fpath, field_name)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    toolpath = get_system_path(cnf, 'vcfannotate')
    if not toolpath:
        err('WARNING: Skipping annotation with tracks: vcfannotate '
            'executable not found, you probably need to specify path in system_config, or '
            'run load bcbio:  . /group/ngs/bin/bcbio-prod.sh"')
        return None

    cmdline = '{toolpath} -b {track_fpath} -k {field_name} {input_fpath}'.format(
        toolpath=toolpath,
        track_fpath=track_fpath,
        field_name=field_name,
        input_fpath=input_fpath)

    assert input_fpath
    output_fpath = call_subprocess(cnf,
                                   cmdline,
                                   input_fpath,
                                   output_fpath,
                                   stdout_to_outputfile=True,
                                   overwrite=True)
    if not verify_vcf(output_fpath):
        err('Error: tracks resulted ' + str(output_fpath) + ' for ' +
            track_fpath)
        return output_fpath

    # Set TRUE or FALSE for tracks
    def proc_line(line, i):
        # Header lines and lines that do not mention the key pass through.
        if field_name not in line or line.startswith('#'):
            return line

        fields = line.split('\t')
        info_items = []
        for attr in fields[7].split(';'):
            # Split on the first '=' only: a value that itself contains '='
            # must stay intact instead of being dropped when re-joining.
            pair = attr.split('=', 1)
            if pair[0] == field_name and len(pair) > 1:
                pair[1] = 'TRUE' if pair[1] else 'FALSE'
            # A flag without a value is a one-element pair and is written
            # back unchanged.
            info_items.append('='.join(pair))
        fields[7] = ';'.join(info_items)
        return '\t'.join(fields)

    assert output_fpath
    output_fpath = iterate_file(cnf, output_fpath, proc_line, suffix='trk')
    return verify_vcf(output_fpath, is_critical=True)
Esempio n. 27
0
 def __init(self):
     """Open ``self.__file_path`` with mode ``self.__option`` (UTF-8).

     On success the file object is stored in ``self.__file``.  If opening
     raises ``FileNotFoundError``, ``IOError`` or ``ValueError``, the error
     is logged, ``self.__is_exists_file`` is set to ``False`` and the
     exception is not propagated.
     """
     logger.info("read/write file : " + self.__file_path + " , option : " + self.__option)
     try:
         self.__file = open(self.__file_path, self.__option, encoding='utf-8')
     except (FileNotFoundError, IOError, ValueError) as error:
         self.__is_exists_file = False
         # The exception must be converted explicitly: ``str + exception``
         # raises TypeError, which would escape from this handler.
         logger.error(self.__file_path + " is error : " + str(error))
Esempio n. 28
0
def add_project_to_exac(cnf):
    """Run ``manage.py add_project <project name> <genome name>`` for ExAC.

    The command is executed through ``call``; the project name and genome
    name are taken from ``cnf.project_name`` and ``cnf.genome.name``.
    """
    info('Adding project to ExAC database')

    # Interpreter under ``exac_venv_dir`` by default; plain ``python`` when
    # ``is_local()`` reports a local run.
    python_bin = join(exac_venv_dir, 'bin', 'python')
    if is_local():
        python_bin = 'python'

    manage_py = join(exac_code_dir, 'manage.py')
    command_parts = [python_bin, manage_py, 'add_project',
                     cnf.project_name, cnf.genome.name]
    call(cnf, ' '.join(command_parts))
Esempio n. 29
0
def get_padded_bed_file(cnf, bed, genome, padding):
    """Produce a padded version of ``bed`` with ``bedtools slop``.

    :param cnf: run configuration, used to locate ``bedtools`` and to name
        the intermediate file.
    :param bed: path to the input BED file (``-i``).
    :param genome: value passed to ``bedtools slop`` as ``-g``.
    :param padding: value passed to ``bedtools slop`` as ``-b``.
    :return: path of the intermediate file the command output is sent to.
    """
    info('Making bed file for padded regions...')
    slop_cmd = '{bedtools} slop -i {bed} -g {genome} -b {padding}'.format(
        bedtools=get_system_path(cnf, 'bedtools'),
        bed=bed,
        genome=genome,
        padding=padding)
    padded_fpath = intermediate_fname(cnf, bed, 'padded')
    call(cnf, slop_cmd, padded_fpath)
    return padded_fpath
Esempio n. 30
0
def _split_reference_by_priority(cnf, features_bed_fpath):
    """Split the features BED into one filtered ``BedTool`` per feature type.

    :param cnf: run configuration (not used here).
    :param features_bed_fpath: path to the BED file whose 7th column
        (index 6) holds the feature type.
    :return: list of ``(feature, BedTool)`` pairs in the order
        CDS, Exon, Transcript, Gene.
    """
    features = ['CDS', 'Exon', 'Transcript', 'Gene']
    info('Splitting the reference file into ' + ', '.join(features))

    features_and_beds = []
    for feature in features:
        # Bind the current feature as a default argument.  A plain closure
        # over the loop variable is evaluated late, so a filter that is
        # consumed after the loop (pybedtools' ``filter`` is streaming)
        # would compare every record against the last feature only.
        by_feature = BedTool(features_bed_fpath).filter(
            lambda x, feature=feature: x[6] == feature)
        features_and_beds.append((feature, by_feature))
    return features_and_beds
Esempio n. 31
0
def _make_targetcov_symlinks(samples):
    """Link every sample's detailed TargetCov TXT one directory level up.

    For each sample the link is placed in the parent of the directory that
    holds ``sample.targetcov_detailed_txt`` and keeps the same file name.
    An existing path at the link location is removed first.
    """
    for sample in samples:
        detailed_txt = sample.targetcov_detailed_txt
        link_dirpath = dirname(dirname(detailed_txt))
        link_fpath = join(link_dirpath, basename(detailed_txt))

        if exists(link_fpath):
            os.unlink(link_fpath)
        symlink_plus(detailed_txt, link_fpath)
        info('TargetCov TXT symlink saved to ' + link_fpath)
Esempio n. 32
0
def _preprocess(cnf, bed_fpath, work_dirpath, chrom_order):
    """Normalise chromosome names of a BED file and split off control regions.

    Header lines (``#``, ``track``, ``browser``) are collected into
    ``bed_params.header`` (prefixed with ``#`` when needed).  Data lines are
    converted to hg-style chromosome names; control regions are stored in
    ``bed_params.controls`` and all other lines are written to the output.

    :param cnf: run configuration (not used here).
    :param bed_fpath: path to the input BED file.
    :param work_dirpath: directory for the intermediate output file.
    :param chrom_order: mapping from chromosome name to its sort order.
    :return: ``(output_fpath, bed_params)``.
    """
    bed_params = BedParams()
    output_fpath = __intermediate_fname(work_dirpath, bed_fpath, 'prep')
    info('preprocessing: ' + bed_fpath + ' --> ' + output_fpath)

    with open(bed_fpath, 'r') as in_f, open(output_fpath, 'w') as out_f:
        for line in in_f:
            if line.startswith(('#', 'track', 'browser')):  # header
                header_line = line if line.startswith('#') else '#' + line
                bed_params.header.append(header_line)
                continue

            # every data line must need the same number of columns
            n_cols = BedParams.calc_n_cols_needed(line)
            if bed_params.n_cols_needed is not None and n_cols != bed_params.n_cols_needed:
                critical('number and type of columns should be the same on all lines!')
            bed_params.n_cols_needed = n_cols

            columns = line.split('\t')
            if line.startswith('chr'):  # hg chr names
                if bed_params.GRCh_names:
                    critical('mixing of GRCh and hg chromosome names!')
                bed_params.GRCh_names = False
                if line.startswith('chrMT'):  # common misprint, correcting chrMT --> chrM
                    processed_line = '\t'.join(['chrM'] + columns[1:])
                else:
                    processed_line = line
            elif columns[0] in BedParams.GRCh_to_hg:  # GRCh chr names
                if bed_params.GRCh_names is not None and not bed_params.GRCh_names:
                    critical('mixing of GRCh and hg chromosome names!')
                bed_params.GRCh_names = True
                processed_line = '\t'.join([BedParams.GRCh_to_hg[columns[0]]] + columns[1:])
            else:
                critical('incorrect chromosome name!')

            entries = processed_line.strip().split('\t')
            chrom, start, end = entries[0], int(entries[1]), int(entries[2])
            region = Region(chrom, chrom_order.get(chrom), start, end)

            if not region.is_control():
                out_f.write(processed_line)
                continue

            # control regions are kept aside instead of being written out
            if len(entries) > 3:
                symbol = entries[3]
            else:
                symbol = '{0}:{1}-{2}'.format(chrom, start, end)
            region.set_symbol(symbol)
            region.rest = entries[4:] if len(entries) > 4 else None
            bed_params.controls.append(region)

    return output_fpath, bed_params
Esempio n. 33
0
def _annotate(bed_fpath, work_dirpath, cnf):
    """Annotate a BED file against RefSeq first, then Ensembl.

    Each database is applied by running the ``annotate_bed`` script.  After
    every database except the last, the result is split with ``grep`` into
    lines ending in ``.`` (not annotated) and all other lines (annotated);
    only the not-annotated lines are passed on to the next database.
    Intermediate files are removed unless ``cnf.debug`` is set.

    :param bed_fpath: path to the BED file to annotate.
    :param work_dirpath: directory for intermediate files.
    :param cnf: run configuration (``genome.features``, ``genome.ensembl``,
        ``genome`` and ``debug`` are read).
    :return: list of paths to the annotated files, one per database.
    """
    references = [('RefSeq', cnf.genome.features), ('Ensembl', cnf.genome.ensembl)]
    last_ref_idx = len(references) - 1
    annotated_files = []
    input_fpath = bed_fpath

    def _grep_to_file(grep_template, src_fpath, dest_fpath):
        # Run the grep command on src_fpath, sending stdout to dest_fpath.
        with open(dest_fpath, 'w') as out:
            grep_cmd = grep_template.format(output_fpath=src_fpath)
            if cnf.debug:
                info(grep_cmd + ' > ' + dest_fpath)
            subprocess.call(grep_cmd, shell=True, stdout=out)

    for ref_idx, (db_name, db_bed_fpath) in enumerate(references):
        db_suffix = db_name.lower()
        output_fpath = __intermediate_fname(work_dirpath, bed_fpath, 'ann_' + db_suffix)
        info('annotating based on {db_name}: {bed_fpath} --> {output_fpath}'.format(
            db_name=db_name, bed_fpath=bed_fpath, output_fpath=output_fpath))
        annotate_bed_py = sys.executable + ' ' + splitext(annotate_bed.__file__)[0] + '.py'

        cmdline = '{annotate_bed_py} {input_fpath} --reference {db_bed_fpath} -o {output_fpath} --genome {cnf.genome}'.format(
            annotate_bed_py=annotate_bed_py,
            input_fpath=input_fpath,
            db_bed_fpath=db_bed_fpath,
            output_fpath=output_fpath,
            cnf=cnf)
        __call(cnf, cmdline)

        if ref_idx < last_ref_idx:
            if cnf.debug:
                info("filtering annotated and not annotated regions into separate files:")
            only_annotated_bed = __intermediate_fname(work_dirpath, bed_fpath, 'only_ann_' + db_suffix)
            not_annotated_bed = __intermediate_fname(work_dirpath, bed_fpath, 'not_ann_' + db_suffix)
            _grep_to_file(r'grep -v -E "\.$" {output_fpath}', output_fpath, only_annotated_bed)
            _grep_to_file(r'grep -E "\.$" {output_fpath}', output_fpath, not_annotated_bed)
            if not cnf.debug:
                os.remove(output_fpath)
            # annotated lines are the result; the rest feeds the next database
            output_fpath = only_annotated_bed
            input_fpath = not_annotated_bed

        annotated_files.append(output_fpath)
        if ref_idx != 0 and not cnf.debug:
            os.remove(input_fpath)

    return annotated_files
Esempio n. 34
0
def _read_bed_regions(bed_fpath, chrom_order):
    """Parse every line of a tab-separated BED file into a ``Region``.

    The 4th column, when present, becomes the region symbol; otherwise a
    ``chrom:start-end`` label is used.  Columns after the 4th are stored in
    ``region.rest`` (``None`` when there are none).

    :param bed_fpath: path to the BED file (passed through ``adjust_path``).
    :param chrom_order: mapping from chromosome name to its sort order.
    :return: list of ``Region`` objects in file order.
    """
    regions = []
    with open(adjust_path(bed_fpath)) as f:
        for line in f:
            entries = line.strip().split('\t')
            chrom = entries[0]
            start = int(entries[1])
            end = int(entries[2])
            r = Region(chrom, chrom_order.get(chrom), start, end)
            r.set_symbol(entries[3] if len(entries) > 3 else '{0}:{1}-{2}'.format(chrom, start, end))
            r.rest = entries[4:] if len(entries) > 4 else None
            regions.append(r)
    return regions


def _postprocess(input_fpath, annotated_fpaths, bed_params, output_bed_fpath, cnf, chrom_order):
    '''
    Merges annotation results with the input regions and writes the final BED.

    1. Sorts.
    2. Chooses appropriate number of columns (4 or 8 for BEDs with primers).
    3. Removes duplicates.

    :param input_fpath: preprocessed input BED (regions to be output).
    :param annotated_fpaths: BED files produced by annotation; every
        non-control input region must occur in them at least once.
    :param bed_params: header, control regions, column count and naming
        style collected during preprocessing.
    :param output_bed_fpath: path of the BED file to write.
    :param cnf: run configuration (key_genes, hgnc, output_grch, output_hg
        and debug are read).
    :param chrom_order: mapping from chromosome name to its sort order.
    '''
    info('postprocessing (sorting, cutting, removing duplicates)')

    with open(adjust_path(cnf.key_genes), 'r') as f:
        key_genes = [line.strip() for line in f]
    approved_genes = []
    if cnf.hgnc:
        with open(adjust_path(cnf.hgnc), 'r') as f:
            f.readline()  # header
            approved_genes = [line.split('\t')[0] for line in f]

    # Class-level settings on Region; they are assigned before any Region is
    # created below, as in the original statement order.
    Region.GRCh_names = bed_params.GRCh_names
    if cnf.output_grch:
        Region.GRCh_names = True
        if cnf.debug and not bed_params.GRCh_names:
            info('Changing chromosome names from hg-style to GRCh-style.')
    if cnf.output_hg:
        Region.GRCh_names = False
        if cnf.debug and bed_params.GRCh_names:
            info('Changing chromosome names from GRCh-style to hg-style.')
    Region.n_cols_needed = bed_params.n_cols_needed
    Region.key_genes = key_genes
    Region.approved_genes = approved_genes

    input_regions = set(_read_bed_regions(input_fpath, chrom_order))  # we want only unique regions

    annotated_regions = []
    for annotated_fpath in annotated_fpaths:
        annotated_regions.extend(_read_bed_regions(annotated_fpath, chrom_order))

    # starting to output result
    with open(adjust_path(output_bed_fpath), 'w') as f:
        for line in bed_params.header:
            f.write(line)

        annotated_regions.sort()
        # ``i`` walks the sorted annotated regions in lockstep with the
        # sorted input regions.
        # NOTE(review): this relies on Region equality/ordering ignoring the
        # symbol (same position == same region) -- confirm in Region.
        i = 0
        prev_region = None
        not_a_gene_count = 0
        solid_regions = []  # boundaries of runs of unambiguously named regions
        prev_is_solid = False
        all_regions = []  # Region, or list of candidate Regions when ambiguous
        for cur_region in sorted(list(input_regions) + bed_params.controls):
            if not cur_region.is_control():
                assert annotated_regions[i] == cur_region, str(cur_region) + ' != ' + str(annotated_regions[i]) + '(i=%d)' % i
                if annotated_regions[i].symbol != '.':
                    cur_region.set_symbol(annotated_regions[i].symbol)
                else:
                    # Consecutive unannotated regions on one chromosome share
                    # the same not_a_gene_N label.
                    if prev_region is None or \
                       prev_region.chrom != cur_region.chrom or not prev_region.symbol.startswith("not_a_gene"):
                        not_a_gene_count += 1
                    cur_region.set_symbol("not_a_gene_%d" % not_a_gene_count)
                i += 1
                ambiguous_regions = [cur_region]
                while i < len(annotated_regions) and annotated_regions[i] == cur_region:  # processing duplicates
                    if annotated_regions[i].symbol != '.' and annotated_regions[i].symbol != cur_region.symbol:
                        duplicate = copy.deepcopy(cur_region)
                        duplicate.set_symbol(annotated_regions[i].symbol)
                        # Priority: approved beats not_approved, key beats
                        # non-key; candidates of equal type stay ambiguous.
                        if duplicate.type == 'approved' and cur_region.type == 'not_approved':
                            cur_region = duplicate
                            ambiguous_regions = [cur_region]
                        elif annotated_regions[i].type == 'key' and cur_region.type != 'key':
                            cur_region = duplicate
                            ambiguous_regions = [cur_region]
                            if cnf.debug:
                                info('key gene priority over approved gene was used')
                        elif annotated_regions[i].type == cur_region.type:
                            ambiguous_regions.append(duplicate)
                    i += 1
                if len(ambiguous_regions) == 1:
                    if not prev_is_solid:
                        # first region of a run of unambiguous regions
                        solid_regions.append(cur_region)
                    prev_is_solid = True
                    all_regions.append(cur_region)
                else:
                    if prev_is_solid:
                        # region that closed the preceding unambiguous run
                        solid_regions.append(prev_region)
                    prev_is_solid = False
                    all_regions.append(ambiguous_regions)
            else:
                all_regions.append(cur_region)
            prev_region = cur_region

        # outputting results
        cur_solid_id = -1
        for entry in all_regions:
            if isinstance(entry, list):  # list of ambiguous regions
                cur_region = entry[0]
                # advance to the last solid region located before this entry
                while cur_solid_id + 1 < len(solid_regions) and cur_region > solid_regions[cur_solid_id + 1]:
                    cur_solid_id += 1
                found = False
                # prefer the candidate named like the previous solid region
                # on the same chromosome ...
                if cur_solid_id >= 0 and cur_region > solid_regions[cur_solid_id] \
                        and cur_region.chrom == solid_regions[cur_solid_id].chrom:
                    prev_solid = solid_regions[cur_solid_id]
                    for cur_region in entry:
                        if cur_region.symbol == prev_solid.symbol:
                            found = True
                            if cnf.debug:
                                info('gene name was chosen based on previous solid region')
                            break
                # ... then the one named like the next solid region ...
                if not found and cur_solid_id + 1 < len(solid_regions) and cur_region < solid_regions[cur_solid_id + 1] \
                        and cur_region.chrom == solid_regions[cur_solid_id + 1].chrom:
                    next_solid = solid_regions[cur_solid_id + 1]
                    for cur_region in entry:
                        if cur_region.symbol == next_solid.symbol:
                            found = True
                            if cnf.debug:
                                info('gene name was chosen based on next solid region')
                            break
                # ... and fall back to the first candidate.
                if not found:
                    cur_region = entry[0]
            else:
                cur_region = entry
            f.write(str(cur_region) + '\n')  # automatically outputs correct number of columns and GRCh/hg names
Esempio n. 35
0
def _read_args(args_list):
    """Parse command-line arguments of the BED standardisation script.

    Exactly one positional argument (the input BED file) and the
    ``-o/--output-bed`` option are required.  A temporary working directory
    is created for intermediate files.

    :param args_list: list of command-line arguments to parse.
    :return: ``(input_bed_fpath, output_bed_fpath, work_dirpath, cnf)``.
    """
    # Options that used to be defined here but were commented out (key
    # genes, approved genes, reference BEDs, bedtools path) were dropped.
    options = [
        (['-o', '--output-bed'], dict(
            dest='output_fpath')
         ),
        (['--debug'], dict(
            dest='debug',
            help='run in a debug more (verbose output, keeping of temporary files)',
            default=False,
            action='store_true')
         ),
        (['--output-hg'], dict(
            dest='output_hg',
            help='output chromosome names in hg-style (chrM, chr1, .., chr22, chrX, chrY)',
            default=False,
            action='store_true')
         ),
        (['--output-grch'], dict(
            dest='output_grch',
            help='output chromosome names in GRCh-style (1, .., 22, X, Y, MT)',
            default=False,
            action='store_true')
         ),
        (['-g', '--genome'], dict(
            dest='genome',
            default='hg19')
         ),
    ]

    parser = OptionParser(usage='usage: %prog [options] Input_BED_file -o Standardized_BED_file',
                          description='Scripts outputs a standardized version of input BED file. '
                                      'Standardized BED: 1) has 4 or 8 fields (for BEDs with primer info);'
                                      ' 2) has HGNC approved symbol in forth column if annotation is '
                                      'possible and not_a_gene_X otherwise;'
                                      ' 3) is sorted based on chromosome name -> start -> end;'
                                      ' 4) has no duplicated regions (regions with the same chromosome, start and end), '
                                      'the only exception is _CONTROL_ regions.')
    for opt_names, opt_kwargs in options:
        parser.add_option(*opt_names, **opt_kwargs)
    (opts, args) = parser.parse_args(args_list)

    if len(args) != 1:
        parser.print_help(file=sys.stderr)
        sys.exit(1)

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})

    # Without an output path the messages and file handling below cannot
    # work; fail with a usage error before creating the temporary directory.
    if not cnf.output_fpath:
        parser.error('output BED file is not specified (use -o / --output-bed)')

    # mkdtemp() creates the directory itself, so no existence check or
    # mkdir is needed afterwards.
    work_dirpath = tempfile.mkdtemp()
    info('Creating a temporary working directory ' + work_dirpath)

    input_bed_fpath = abspath(args[0])
    info('Input: ' + input_bed_fpath)

    output_bed_fpath = adjust_path(cnf.output_fpath)
    info('Writing to: ' + output_bed_fpath)

    # Only reported, not treated as fatal (behaviour kept as is).
    if cnf.output_grch and cnf.output_hg:
        info('you cannot specify --output-hg and --output-grch simultaneously!')
    info()

    return input_bed_fpath, output_bed_fpath, work_dirpath, cnf