def check_genome_resources(cnf):
    if cnf.genome is None:
        critical('Please specify the genome build (one of those available in ' +
                 cnf.sys_cnf +
                 ') using the --genome option (e.g., --genome hg38).')

    if not cnf.genomes:
        critical('"genomes" section is not specified in system config ' +
                 cnf.sys_cnf)

    info('Genome: ' + str(cnf.genome.name))

    for key in cnf.genome.keys():
        if key != 'name' and isinstance(cnf.genome[key], basestring):
            cnf.genome[key] = adjust_system_path(cnf.genome[key])

            if not verify_obj_by_path(cnf.genome[key], key, silent=True):
                gz_fpath = cnf.genome[key] + '.gz'
                if not cnf.genome[key].endswith('.gz') and verify_file(gz_fpath, silent=True):
                    cnf.genome[key] = gz_fpath

    if not cnf.genome.features or not cnf.genome.bed_annotation_features or not cnf.genome.cds:
        warn('Warning: "features", "bed_annotation_features" and "cds" must be specified '
             'in the system config (' + cnf.sys_cnf + ').')

    if not cnf.transcripts_fpath:
        cnf.transcripts_fpath = get_canonical_transcripts(cnf.genome.name, ensembl=True)
def main():
    info(' '.join(sys.argv))
    info()

    parser = OptionParser(usage='Usage: ' + basename(__file__) + ' --chr chr --vcf VCF_file --samples Sample1,Sample2 '
                                                                 '--bams BAM_file1,BAM_file2 -o Output_directory '
                                                                 '--features BED_file')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('-o', dest='output_dir')
    parser.add_option('--samples', dest='sample_names')
    parser.add_option('--bams', dest='bams')
    parser.add_option('--vcf', dest='vcf_fpath')
    parser.add_option('--chr', dest='chrom')
    parser.add_option('--features', dest='features', help='BED file with real CDS/Exon/Gene/Transcript regions with '
                                                          'annotations (default is "features" from the system config)')
    (opts, args) = parser.parse_args(sys.argv[1:])

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})
    cnf.verbose = False

    if not cnf.output_dir or not cnf.vcf_fpath or not cnf.chrom or not cnf.sample_names or not cnf.bams:
        critical(parser.usage)

    cnf.features = cnf.features or cnf.genome.features
    samples = [BaseSample(sample_name, None, bam=bam) for (sample_name, bam) in zip(cnf.sample_names.split(','), cnf.bams.split(','))]
    split_bams(cnf, samples, cnf.vcf_fpath)
    info('Done.')
Example #3
def main():
    info(' '.join(sys.argv))
    info()
    parser = OptionParser(
        usage='Usage: ' + basename(__file__) +
        ' --bed BED_file --bam BAM_file -g hg19 -o Output_BEDGRAPH_file '
        '--work-dir work_directory --chr chromosome')
    parser.add_option('-o', dest='output_dir')
    parser.add_option('--samples', dest='sample_names')
    parser.add_option('--bams', dest='bams')
    parser.add_option('--vcf', dest='vcf_fpath')
    parser.add_option('--chr', dest='chrom')
    parser.add_option('--bed', dest='bed', help='BED file.')
    parser.add_option('-g',
                      '--genome',
                      dest='chr_len_fpath',
                      help='File with chromosomes lengths.')
    parser.add_option('--work-dir', dest='work_dir', help='Work directory.')
    (opts, args) = parser.parse_args(sys.argv[1:])

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})

    if not cnf.output_dir or not cnf.bams or not cnf.sample_names:
        critical(parser.usage)

    samples = [
        BaseSample(sample_name, None, bam=bam)
        for (sample_name, bam) in zip(cnf.sample_names.split(','), cnf.bams.split(','))
    ]

    safe_mkdir(cnf.output_dir)
    safe_mkdir(cnf.work_dir)
    get_regions_coverage(cnf, samples)
    info('Done.')
Example #4
def proc_fastq(cnf, sample, l_fpath, r_fpath):
    if cnf.downsample_to:
        info('Downsampling the reads to ' + str(cnf.downsample_to))
        l_fpath, r_fpath = downsample(cnf,
                                      sample.name,
                                      l_fpath,
                                      r_fpath,
                                      cnf.downsample_to,
                                      output_dir=cnf.work_dir,
                                      suffix='subset')

    sambamba = get_system_path(cnf,
                               join(get_ext_tools_dirname(), 'sambamba'),
                               is_critical=True)
    bwa = get_system_path(cnf, 'bwa')
    bammarkduplicates = get_system_path(cnf, 'bammarkduplicates')
    if not (sambamba and bwa and bammarkduplicates):
        critical('sambamba, bwa, and bammarkduplicates are required to produce an aligned BAM')
    info()
    info('Aligning reads to the reference')
    bam_fpath = align(cnf, sample, l_fpath, r_fpath, sambamba, bwa,
                      bammarkduplicates, cnf.genome.bwa, cnf.is_pcr)
    bam_fpath = verify_bam(bam_fpath)
    if not bam_fpath:
        critical('Sample ' + sample.name + ' was not aligned successfully.')
    return bam_fpath
def _get_gene_transcripts_id(cnf):
    genes_dict = dict()
    transcripts_dict = dict()

    if not cnf.genome.all_transcripts:
        critical('File with transcript and gene IDs for ' + cnf.genome.name + ' is not specified! Heatmaps cannot be created.')
    if not verify_file(cnf.genome.all_transcripts):
        critical('File with transcript and gene IDs for ' + cnf.genome.name + ' at ' + cnf.genome.all_transcripts + ' was not found! Heatmaps cannot be created.')

    info('Reading transcript and gene IDs from ' + cnf.genome.all_transcripts)

    with open_gzipsafe(cnf.genome.all_transcripts) as f:
        for i, l in enumerate(f):
            if l.startswith('#'):
                continue
            chrom, _, feature, start, end, _, strand, _, props_line = l.replace('\n', '').split('\t')
            if feature != 'transcript':
                continue
            try:
                _prop_dict = dict((t.strip().split(' ')[0], ' '.join(t.strip().split(' ')[1:]))
                                  for t in props_line.split(';') if t.strip())
            except ValueError:
                sys.stderr.write(format_exc())
                sys.stderr.write(l)
                continue  # skip malformed attribute lines instead of using an unbound _prop_dict

            gene_symbol = _rm_quotes(_prop_dict['gene_name'])
            gene_id = _rm_quotes(_prop_dict['gene_id'])
            transcript_id = _rm_quotes(_prop_dict['transcript_id'])
            #gene = Gene(gene_symbol, chrom=chrom, gene_id=gene_id, transcript_id=transcript_id)
            genes_dict[gene_id] = gene_symbol
            transcripts_dict[transcript_id] = gene_symbol
    return genes_dict, transcripts_dict
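
A quick illustration of the GTF attribute parsing above, on a made-up properties field (the identifiers are examples, not taken from any real file):

# props_line = 'gene_id "ENSG00000141510"; gene_name "TP53"; transcript_id "ENST00000269305";'
# _prop_dict = dict((t.strip().split(' ')[0], ' '.join(t.strip().split(' ')[1:]))
#                   for t in props_line.split(';') if t.strip())
# -> {'gene_id': '"ENSG00000141510"', 'gene_name': '"TP53"', 'transcript_id': '"ENST00000269305"'}
# _rm_quotes() then strips the surrounding double quotes from each value.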
Example #6
def proc_args(argv):
    group1_name = 'Resistant'
    group2_name = 'Sensitive'

    description = 'This script finds genes with mutations present in (almost) all samples of one group ' \
                  'and (almost) absent in the other group ' \
                  '(default group names: Resistant vs Sensitive). Input is PASS.txt files from bcbio-postproc.'
    parser = OptionParser(description=description)
    parser.add_option(
        '-n',
        '--num-samples-limit',
        dest='ns',
        default=1,
        type=int,
        help=
        'For each reported gene: max number of samples WITHOUT the gene in group1, '
        'max number of samples WITH the gene in group2')

    (opts, args) = parser.parse_args(argv)

    if len(args) == 0:
        critical('No PASS.txt files provided to input.')

    variants_fpaths = [fpath for fpath in args if file_exists(fpath)]
    return opts, [group1_name, group2_name], variants_fpaths
Example #7
def get_chr_lengths_from_seq(seq_fpath):
    chr_lengths = []

    if seq_fpath.endswith('.fai'):
        seq_fpath = splitext(seq_fpath)[0]

    if verify_file(seq_fpath + '.fai', silent=True):
        info('Reading genome index file (.fai) to get chromosome lengths')
        with open(adjust_path(seq_fpath + '.fai'), 'r') as handle:
            for line in handle:
                line = line.strip()
                if line:
                    fields = line.split()
                    # convert to int so both the .fai and .fa branches return the same types
                    chr_lengths.append((fields[0], int(fields[1])))
    elif verify_file(seq_fpath, silent=True):
        info('Reading genome sequence (.fa) to get chromosome lengths')
        with open(adjust_path(seq_fpath), 'r') as handle:
            from Bio import SeqIO
            reference_records = SeqIO.parse(handle, 'fasta')
            for record in reference_records:
                chrom = record.id
                chr_lengths.append((chrom, len(record.seq)))
    else:
        critical('Cannot find either ' + seq_fpath + ' or ' + seq_fpath + '.fai')
    return chr_lengths
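
For reference, a .fai index is tab-separated with the sequence name and length in the first two columns, which is all this function reads (the lengths below are the real hg19 values; the offsets are illustrative):

# chr1    249250621    52           60    61
# chr2    243199373    253404903    60    61
# get_chr_lengths_from_seq('hg19.fa') -> [('chr1', 249250621), ('chr2', 243199373), ...]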
Example #8
def annotate_target(cnf, target_bed):
    output_fpath = intermediate_fname(cnf, target_bed, 'ann')
    if not cnf.genome.bed_annotation_features:
        return output_fpath
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    features_bed = verify_bed(
        cnf.genome.bed_annotation_features,
        is_critical=True,
        description='bed_annotation_features in system config')

    # annotate_bed_py = get_system_path(cnf, 'python', join('tools', 'bed_processing', 'annotate_bed.py'))
    # bedtools = get_system_path(cnf, 'bedtools')

    annotate_bed_py = which('annotate_bed.py')
    if not annotate_bed_py:
        critical(
            'Error: annotate_bed.py not found in PATH, please install TargQC.')

    cmdline = '{annotate_bed_py} {target_bed} --work-dir {cnf.work_dir} -g {cnf.genome.name} ' \
              '-o {output_fpath} --canonical'.format(**locals())
    # cmdline = '{annotate_bed_py} {target_bed} --work-dir {cnf.work_dir} --reference {features_bed} ' \
    #           '--genome {cnf.genome.name} --sys-cnf {cnf.sys_cnf} --run-cnf {cnf.run_cnf} ' \
    #           '-o {output_fpath}'.format(**locals())
    call(cnf, cmdline, output_fpath, stdout_to_outputfile=False)

    output_fpath = remove_comments(cnf, output_fpath)

    return output_fpath
Example #9
def find_fastq_pairs_by_sample_names(fastq_fpaths, sample_names):
    fastq_by_sn = OrderedDict()

    for sn in sample_names:
        sn_fastq_fpaths = sorted(
            [f for f in fastq_fpaths if basename(f).startswith(sn + '_R')])
        if len(sn_fastq_fpaths) == 0:
            err('Error: no fastq found for ' + sn)
            fastq_by_sn[sn] = None
        elif len(sn_fastq_fpaths) > 2:
            critical('Error: more than 2 fastq files starting with ' + sn +
                     '_R: ' + ', '.join(sn_fastq_fpaths))
        elif len(sn_fastq_fpaths) == 1:
            warn('Warning: only a single fastq file was found for ' + sn +
                 '. Treating as single-end reads.')
            fastq_by_sn[sn] = [
                verify_file(sn_fastq_fpaths[0],
                            description='sn_fastq_fpaths[0] for ' + str(sn)),
                None
            ]
        else:
            fastq_by_sn[sn] = [
                verify_file(fpath,
                            description='fpath from sn_fastq_fpaths for ' +
                            str(sn)) for fpath in sn_fastq_fpaths
            ]

    return fastq_by_sn
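
The pairing relies on the common <sample>_R1/<sample>_R2 naming convention; a minimal sketch with hypothetical file names:

# fpaths = ['/data/S1_R1.fq.gz', '/data/S1_R2.fq.gz', '/data/S2_R1.fq.gz']
# find_fastq_pairs_by_sample_names(fpaths, ['S1', 'S2']) ->
#   OrderedDict: 'S1' -> ['/data/S1_R1.fq.gz', '/data/S1_R2.fq.gz']   # paired
#                'S2' -> ['/data/S2_R1.fq.gz', None]                  # single-end, with a warning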
Example #10
def run_vcf2txt_vardict2mut_for_samples(cnf,
                                        var_samples,
                                        output_dirpath,
                                        vcf2txt_out_fpath,
                                        caller_name=None,
                                        threads_num=1):

    # the passed-in threads_num is recomputed from the sample count and cnf.threads
    threads_num = min(len(var_samples), cnf.threads)
    info('Number of threads for filtering: ' + str(threads_num))

    safe_mkdir(output_dirpath)

    vcf_fpath_by_sample = {s.name: s.anno_vcf_fpath for s in var_samples}
    res = run_vcf2txt(cnf, vcf_fpath_by_sample, vcf2txt_out_fpath)
    if not res:
        err('vcf2txt run returned non-0')
        return None

    # vardict2mut_py = get_script_cmdline(cnf, 'python', join('scripts', 'post', 'vardict2mut.py'))
    # if not vardict2mut_py:
    #     critical('vardict2mut_py not found')

    info('Running vardict2mut')
    res = run_vardict2mut(
        cnf, vcf2txt_out_fpath,
        add_suffix(vcf2txt_out_fpath, variant_filtering.mut_pass_suffix))
    if not res:
        critical('vardict2mut.py run returned non-0')
    mut_fpath = res
    mut_fpath = convert_gpfs_path_to_url(mut_fpath)
    info()

    info('Done filtering with vcf2txt/vardict2mut, saved to ' + str(mut_fpath))
    return mut_fpath
Example #11
def read_samples(sample2bam_fpath):
    bam_fpaths = []
    sample_names = []
    bad_bam_fpaths = []

    info('Reading sample info from ' + sample2bam_fpath)
    with open(sample2bam_fpath) as f:
        for l in f:
            if l.startswith('#'):
                continue
            l = l.replace('\n', '')
            if not l:
                continue
            if len(l.split('\t')) == 2:
                sample_name, bam_fpath = l.split('\t')
            else:
                sample_name, bam_fpath = None, l
            # collect all bad BAMs and report them together at the end,
            # instead of aborting on the first one
            if not verify_bam(bam_fpath):
                bad_bam_fpaths.append(bam_fpath)
                continue
            bam_fpaths.append(bam_fpath)

            if sample_name is None:
                sample_name = basename(splitext(bam_fpath)[0])
                if sample_name.endswith('-ready'):
                    sample_name = sample_name.split('-ready')[0]
            sample_names.append(sample_name)
            info(sample_name + ': ' + bam_fpath)

    if bad_bam_fpaths:
        critical('Some BAM files cannot be found, are empty, or are not valid BAMs: ' + ', '.join(bad_bam_fpaths))

    return sample_names, bam_fpaths
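
The expected input is a TSV with an optional sample name column; '#' lines are skipped. A hypothetical sample2bam.tsv:

# #sample    bam
# Sample1    /data/Sample1-ready.bam
# Sample2    /data/Sample2-ready.bam
# /data/Sample3-ready.bam    <- no name column: 'Sample3' is derived from the file name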
Example #12
@contextmanager  # requires: from contextlib import contextmanager
def tx_tmpdir(base_dir, rollback_dirpath):
    """Context manager to create and remove a transactional temporary directory.
    """
    # tmp_dir_base = join(base_dir, 'tx', str(uuid.uuid4()))
    # unique_attempts = 0
    # while os.path.exists(tmp_dir_base):
    #     if unique_attempts > 5:
    #         break
    #     tmp_dir_base = join(base_dir, 'tx', str(uuid.uuid4()))
    #     time.sleep(1)
    #     unique_attempts += 1

    # if base_dir is not None:
    #     tmp_dir_base = os.path.join(base_dir, "tx")
    # else:
    #     tmp_dir_base = os.path.join(os.getcwd(), "tx")
    if exists(rollback_dirpath):
        critical(rollback_dirpath + ' already exists')

    tmp_dir = tempfile.mkdtemp(dir=base_dir)  # mkdtemp already creates the directory
    try:
        yield tmp_dir
    finally:
        if tmp_dir and exists(tmp_dir):
            os.rename(tmp_dir, rollback_dirpath)
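
A minimal usage sketch: work happens in a scratch directory that is renamed into place when the block exits (produce_outputs is a hypothetical worker; assumes the @contextmanager decorator shown above):

# with tx_tmpdir('/scratch', '/results/run1') as tx_dir:
#     produce_outputs(tx_dir)   # write into the temporary directory
# # on exit the temporary directory is renamed to /results/run1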
Example #13
def proc_args(argv):
    info(' '.join(sys.argv))
    info()

    description = 'This script generates target QC reports for each BAM provided as an input. ' \
                  'Usage: ' + basename(__file__) + ' sample2bam.tsv --bed target.bed --controls sample1:sample2 -o results_dir'
    parser = OptionParser(description=description, usage=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('-o', dest='output_dir', metavar='DIR', default=join(os.getcwd(), 'seq2c'))
    parser.add_option('--bed', dest='bed', help='BED file to run Seq2C analysis')
    parser.add_option('-c', '--controls', dest='controls', help='Optional control sample names for Seq2C. For multiple controls, separate them using :')
    parser.add_option('--seq2c-opts', dest='seq2c_opts', help='Options for the final lr2gene.pl script.')
    parser.add_option('--no-prep-bed', dest='prep_bed', help=SUPPRESS_HELP, action='store_false', default=True)

    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    if len(args) == 0:
        parser.print_usage()
        sys.exit(1)
    if len(args) == 1 and not args[0].endswith('.bam'):
        sample_names, bam_fpaths = read_samples(verify_file(args[0], is_critical=True, description='Input sample2bam.tsv'))
        bam_by_sample = OrderedDict()
        for s, b in zip(sample_names, bam_fpaths):
            bam_by_sample[s] = b
    else:
        bam_by_sample = find_bams(args)

    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)
    check_genome_resources(cnf)

    cnf.output_dir = adjust_path(cnf.output_dir)
    verify_dir(dirname(cnf.output_dir), is_critical=True)
    safe_mkdir(cnf.output_dir)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'Seq2C'
    set_up_dirs(cnf)

    samples = [
        source.TargQC_Sample(name=s_name, dirpath=join(cnf.output_dir, s_name), bam=bam_fpath)
            for s_name, bam_fpath in bam_by_sample.items()]
    info('Samples: ')
    for s in samples:
        info('  ' + s.name)
    samples.sort(key=lambda _s: _s.key_to_sort())

    target_bed = verify_bed(cnf.bed, is_critical=True) if cnf.bed else None

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner: critical('Error: qsub-runner is not provided in sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    return cnf, samples, target_bed, cnf.output_dir
Example #14
def get_system_path(cnf,
                    interpreter_or_name,
                    name=None,
                    extra_warning='',
                    suppress_warn=False,
                    is_critical=False):
    """ "name" can be:
        - key in system_into.yaml
        - relative path in the project (e.g. external/...)
        - anything in system path
    """
    interpreter = interpreter_or_name
    if name is None:
        name = interpreter_or_name
        interpreter = None

    if interpreter:
        if interpreter == 'java':
            return get_java_tool_cmdline(cnf,
                                         name,
                                         extra_warning,
                                         suppress_warn,
                                         is_critical=is_critical)

        return get_script_cmdline(cnf,
                                  interpreter,
                                  name,
                                  extra_warning=extra_warning,
                                  suppress_warn=suppress_warn,
                                  is_critical=is_critical)

    # IN SYSTEM CONFIG?
    if cnf and (cnf.resources is not None and name.lower() in cnf.resources
                and 'path' in cnf.resources[name.lower()]):

        tool_path = cnf.resources[name.lower()]['path']
        tool_path = adjust_system_path(tool_path)
        return verify_obj_by_path(tool_path, name, is_critical=is_critical)

    # IN PROJECT ROOT DIR? IN EXTERNAL?
    for dirpath in [code_base_path]:
        tool_path = join(dirpath, name)
        if exists(tool_path):
            return verify_obj_by_path(tool_path, name, is_critical=is_critical)

    # IN PATH?
    tool_path = which(name)
    if tool_path and exists(tool_path):
        return verify_obj_by_path(tool_path, name, is_critical=is_critical)

    msg = (name + ' was not found. You may either specify its path in the system '
           'config, or add it to your PATH environment variable. ' +
           extra_warning)
    if not suppress_warn:
        err(msg)
    if is_critical:
        critical(msg)
    return None
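
A few hedged call sketches for the three lookup modes described in the docstring (tool and script names are illustrative):

# sambamba = get_system_path(cnf, 'sambamba')                            # resources -> project dir -> $PATH
# script = get_system_path(cnf, 'perl', join('ext_tools', 'vcf-merge'))  # interpreter + script path
# bgzip = get_system_path(cnf, 'bgzip', is_critical=True)                # abort with an error if missing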
Example #15
def _proc_path(path):
    starts = {'/mnt/Datasets': '/ngs/oncology/datasets',
              '/mnt/HiSeq': '/ngs/oncology/datasets/HiSeq/',
              '/mnt/MiSeq': '/ngs/oncology/datasets/MiSeq/'}
    if not any(path.startswith(s) for s in starts.keys()):
        critical('Error: path ' + path + ' must start with one of ' + str(starts.keys()))
    for k, v in starts.iteritems():
        path = path.replace(k, v)
    return path
Example #16
def load_yaml_config(fpath):
    verify_file(fpath, is_critical=True)
    try:
        dic = load_yaml(open(fpath), Loader=Loader)
    except:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return dic
Example #17
def read_sample_names_from_vcf(vcf_fpath):
    with open_gzipsafe(vcf_fpath) as f:  # `with` so the handle is closed
        basic_fields = next(
            (l.strip()[1:].split() for l in f
             if l.strip().startswith('#CHROM')), None)
    if not basic_fields:
        critical('Error: no VCF header in ' + vcf_fpath)
    if len(basic_fields) < 9:
        return []
    return basic_fields[9:]
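
For context, sample names occupy the columns after the nine fixed VCF fields on the #CHROM header line, which is exactly what the slice extracts:

# #CHROM  POS  ID  REF  ALT  QUAL  FILTER  INFO  FORMAT  Sample1  Sample2
# -> basic_fields[9:] == ['Sample1', 'Sample2']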
Example #18
def main():
    if len(sys.argv) < 2:
        critical('Usage: ' + __file__ + ' path_to.fa')

    seq_fpath = sys.argv[1]
    seq_fpath = verify_file(seq_fpath, is_critical=True)
    chr_lengths = get_chr_lengths_from_seq(seq_fpath)

    for c, l in chr_lengths:
        sys.stdout.write(c + '\t' + str(l) + '\n')
Example #19
def vcf_one_per_line(cnf, vcf_fpath):
    info('Converting VCF to one-effect-per-line...')

    oneperline_vcf_fpath = intermediate_fname(cnf, vcf_fpath, 'opl')
    vcfoneperline_cmline = get_script_cmdline(cnf, 'perl', join('ext_tools', 'vcfOnePerLine.pl'))
    call(cnf, vcfoneperline_cmline, oneperline_vcf_fpath, stdin_fpath=vcf_fpath, exit_on_error=False)
    info()

    if not verify_file(oneperline_vcf_fpath):
        critical('Error: vcf_one_per_line didn\'t generate output file.')
    return oneperline_vcf_fpath
Example #20
def main(args):
    if len(args) < 2:
        critical('Usage: ' + __file__ +
                 ' InputRootDirectory OutputRootDirectory [Build=hg38]')

    inp_root = adjust_path(args[0])
    out_root = adjust_path(args[1])

    build = 'hg38'
    if len(args) >= 3:
        build = args[2]

    chain_fpath = chains[build.lower()]

    for inp_dirpath, subdirs, files in os.walk(inp_root):
        for fname in files:
            if fname.endswith('.bed'):
                inp_fpath = adjust_path(join(inp_dirpath, fname))
                print inp_fpath + ': ' + str(count_bed_cols(inp_fpath)) + ' columns'

                out_dirpath = adjust_path(join(out_root, relpath(inp_dirpath, inp_root)))
                safe_mkdir(out_dirpath)
                out_fpath = adjust_path(join(out_dirpath, fname))
                unlifted_fpath = adjust_path(join(out_dirpath, fname + '.unlifted'))

                cmdline = ''
                lift_input = inp_fpath

                with open(inp_fpath) as f:
                    fs = f.readline().split('\t')
                try:
                    int(fs[6])
                    int(fs[7])
                except (IndexError, ValueError):
                    info('Cutting ' + inp_fpath)
                    cmdline += 'cut -f1,2,3,4 "{inp_fpath}" > __cut; '
                    lift_input = '__cut'

                # lift over the cut file only if cutting actually happened
                cmdline += liftover_fpath + ' "{lift_input}" {chain_fpath} "{out_fpath}" "{unlifted_fpath}"'
                cmdline = cmdline.format(**locals())
                info(cmdline)
                os.system(cmdline)
                verify_file(out_fpath)
                if isfile(unlifted_fpath):
                    if getsize(unlifted_fpath) <= 0:
                        os.remove(unlifted_fpath)
                    else:
                        err('Some records were unlifted and saved to ' +
                            unlifted_fpath)
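
For reference, the generated shell command follows the standard UCSC liftOver signature (liftOver oldFile map.chain newFile unMapped); with the cut step it looks roughly like this (paths illustrative):

# cut -f1,2,3,4 "/in/sample.bed" > __cut; \
#     liftOver "__cut" hg19ToHg38.over.chain "/out/sample.bed" "/out/sample.bed.unlifted"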
Example #21
def vcf_merge(cnf, vcf_fpaths, combined_vcf_fpath):
    vcf_merge_cmdline = get_system_path(cnf, join('ext_tools', 'vcftools', 'scripts', 'vcf-merge'))
    if vcf_merge_cmdline is None:
        critical('No vcf_merge in path')

    cmdline = vcf_merge_cmdline + ' ' + ' '.join(vcf_fpaths)
    perl_module_dirpath = abspath(join(dirname(__file__), pardir, pardir, 'ext_modules', 'perl_modules'))
    os.environ['PERL5LIB'] = perl_module_dirpath

    res = call(cnf, cmdline, combined_vcf_fpath, exit_on_error=False)
    if not res:
        return None
    return combined_vcf_fpath
Example #22
def count_mutations_freq(cnf,
                         samples,
                         vcf2txt_fpaths,
                         suffix=variant_filtering.mut_pass_suffix):
    count_in_cohort_by_vark = defaultdict(int)
    total_varks = 0
    total_duplicated_count = 0
    total_records_count = 0
    for sample_i, (sample,
                   vcf2txt_fpath) in enumerate(zip(samples, vcf2txt_fpaths)):
        met_in_this_sample = set()
        processed_fpath = add_suffix(vcf2txt_fpath, suffix)
        if not isfile(processed_fpath):
            critical(processed_fpath +
                     ' does not exist; please, rerun VarFilter.')
        with open(processed_fpath) as f:
            for line_i, l in enumerate(f):
                if line_i > 0:
                    fs = l.replace('\n', '').split()
                    if not fs:
                        continue
                    chrom, pos, db_id, ref, alt = fs[1:6]
                    vark = ':'.join([chrom, pos, ref, alt])
                    if vark in met_in_this_sample:
                        if suffix == variant_filtering.mut_pass_suffix:
                            total_duplicated_count += 1
                    else:
                        count_in_cohort_by_vark[vark] += 1
                        if suffix == variant_filtering.mut_pass_suffix:
                            met_in_this_sample.add(vark)
                            total_varks += 1
                    total_records_count += 1

    if suffix == variant_filtering.mut_pass_suffix:
        info('Counted ' + str(len(count_in_cohort_by_vark)) +
             ' different variants ' + 'in ' + str(len(samples)) +
             ' samples with total ' + str(total_varks) + ' records')
        info('Duplicated varks across all samples: ' +
             str(total_duplicated_count) + ' out of total ' +
             str(total_records_count) +
             ' records. Duplicates were not counted into cohort frequencies.')

    freq_in_cohort_by_vark = dict()
    max_freq = 0
    for vark, count in count_in_cohort_by_vark.items():
        f = float(count) / len(samples)
        freq_in_cohort_by_vark[vark] = f
        if f > max_freq:
            max_freq = f

    if suffix == variant_filtering.mut_pass_suffix:
        info('Maximum frequency in cohort is ' + str(max_freq))
    return freq_in_cohort_by_vark, count_in_cohort_by_vark
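
A worked example of the cohort frequency (the vark key and counts are hypothetical): a variant seen in 2 of 4 samples, counted at most once per sample, gets frequency 0.5.

# count_in_cohort_by_vark['chr17:7577120:C:T'] == 2    # seen in samples A and B
# freq_in_cohort_by_vark['chr17:7577120:C:T'] == 2 / 4.0 == 0.5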
Example #23
def get_exac_dir(cnf):
    if cnf.genome.name.startswith('hg19'):
        cnf.genome.name = 'hg19'
    elif cnf.genome.name.startswith('hg38'):
        cnf.genome.name = 'hg38'
    else:
        critical(
            'Genome ' + str(cnf.genome.name) +
            ' is not supported. Supported genomes: hg19, hg19-noalt, hg38, hg38-noalt.'
        )
    exac_dir = join(exac_data_dir, cnf.genome.name)  # temporary dir
    return exac_dir
Example #24
def find_raw_fastq(self, get_regexp, suf='R1'):
    fastq_fpaths = [
        join(self.source_fastq_dirpath, fname)
        for fname in os.listdir(self.source_fastq_dirpath)
        if re.match(get_regexp(self, suf), fname)
    ]
    fastq_fpaths = sorted(fastq_fpaths)
    if not fastq_fpaths:
        critical('Error: no fastq files for the sample ' + self.name +
                 ' were found inside ' + self.source_fastq_dirpath)
    info(self.name + ': found raw fastq files ' + ', '.join(fastq_fpaths))
    return fastq_fpaths
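
get_regexp is a caller-supplied callback; a hypothetical implementation matching Illumina-style names such as Sample1_S1_L001_R1_001.fastq.gz:

# get_regexp = lambda smp, suf: re.escape(smp.name) + r'_.*' + suf + r'.*\.fastq(\.gz)?$'
# fastqs = sample.find_raw_fastq(get_regexp, suf='R1')   # sorted R1 files in source_fastq_dirpath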
Example #25
def bgzip_and_tabix(cnf, vcf_fpath, tabix_parameters='', **kwargs):
    gzipped_fpath = vcf_fpath + '.gz'
    tbi_fpath = gzipped_fpath + '.tbi'

    if cnf.reuse_intermediate and \
           file_exists(gzipped_fpath) and \
           file_exists(tbi_fpath) and getctime(tbi_fpath) >= getctime(gzipped_fpath):
        info('Up-to-date compressed VCF and index exist, reusing')
        return gzipped_fpath

    info('Compressing and tabixing VCF file, writing ' + gzipped_fpath + '(.tbi)')
    bgzip = get_system_path(cnf, 'bgzip')
    tabix = get_system_path(cnf, 'tabix')
    if not bgzip:
        err('Cannot index VCF because bgzip is not found in PATH or ' + cnf.sys_cnf)
    if not tabix:
        err('Cannot index VCF because tabix is not found in PATH or ' + cnf.sys_cnf)
    if not bgzip or not tabix:  # both tools are needed to compress and index
        return vcf_fpath

    retrying = False
    while True:
        if isfile(tbi_fpath): os.remove(tbi_fpath)
        if isfile(vcf_fpath):
            if isfile(gzipped_fpath):
                os.remove(gzipped_fpath)
            info('BGzipping VCF')
            cmdline = '{bgzip} {vcf_fpath}'.format(**locals())
            call(cnf, cmdline, None, **kwargs)
        else:
            if not verify_file(gzipped_fpath):
                err('Neither uncompressed ' + vcf_fpath + ' nor ' + gzipped_fpath + ' exist')
                return None

        info('Tabixing VCF')
        cmdline = '{tabix} {tabix_parameters} {gzipped_fpath}'.format(**locals())

        kwargs['exit_on_error'] = retrying  # abort only on the second (retry) attempt
        call(cnf, cmdline, **kwargs)
        if isfile(gzipped_fpath + '.tbi'):
            break
        if retrying:
            critical('Cannot tabix ' + vcf_fpath)
        if not isfile(vcf_fpath):
            call(cnf, 'gunzip ' + gzipped_fpath, None)
        retrying = True

    return gzipped_fpath
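
Usage sketch; note the retry design: if the first tabix attempt fails, the gzip is unpacked and the compress-and-index cycle runs once more before aborting.

# gz = bgzip_and_tabix(cnf, '/data/variants.vcf')
# # -> '/data/variants.vcf.gz', with '/data/variants.vcf.gz.tbi' alongside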
Example #26
def main():
    cnf = read_opts_and_cnfs(extra_opts=[
        (['--bam'], dict(dest='bam', help='path to the BAM file')),
        (['--bed', '--capture',
          '--amplicons'], dict(dest='bed', help='capture panel/amplicons')),
        (['--pcr'],
         dict(
             dest='pcr',
             action='store_true',
             help='deduplication was not performed, thus do not try to dedup')),
    ],
                             required_keys=['bam'],
                             file_keys=['bam', 'bed'],
                             key_for_sample_name='bam',
                             proc_name=BCBioStructure.qualimap_name)

    index_bam(cnf, cnf.bam)
    info('Using alignment ' + cnf.bam)

    bed = ''
    if cnf.bed:
        bed = ' -gff ' + cnf.bed + ' '
        info('Using amplicons/capture panel ' + cnf.bed)

    qualimap = get_system_path(cnf, 'qualimap', is_critical=True)  # aborts if qualimap is missing

    info()

    mem_m = get_qualimap_max_mem(cnf.bam)
    mem = str(int(mem_m)) + 'M'
    mem_cmdl = ' --java-mem-size=' + mem

    cmdline = (
        '{qualimap} bamqc --skip-duplicated -nt ' + str(cnf.threads) +
        mem_cmdl + ' -nr 5000 '
        '-bam {cnf.bam} -outdir {cnf.output_dir} {bed} -c -gd HUMAN').format(
            **locals())
    report_fpath = join(cnf.output_dir, 'qualimapReport.html')

    call(cnf,
         cmdline,
         output_fpath=report_fpath,
         stdout_to_outputfile=False,
         env_vars=dict(DISPLAY=None))

    info('Qualimap report: ' + str(report_fpath))
Example #27
def proc_opts():
    parser = OptionParser(description='')
    parser.add_option('--debug', dest='debug', action='store_true', default=False)  # opts.debug is read below
    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug
    if len(args) < 1:
        critical('First argument should be a root datasets dir')
    # if len(args) < 2:
    #     info('No dataset path specified, assuming it is the current working directory')
    #     dataset_dirpath = adjust_path(os.getcwd())
    #     jira_url = args[0]
    root_dirpath = verify_dir(args[0], is_critical=True, description='Dataset directory')  # /ngs/oncology/datasets/hiseq/150521_D00443_0159_AHK2KTADXX

    info(' '.join(sys.argv))

    return root_dirpath
Example #28
def process_one(cnf, output_dir, bam_fpath, features_bed,
                features_no_genes_bed):
    sample = TargQC_Sample(cnf.sample, output_dir, bed=cnf.bed, bam=cnf.bam)
    sample.l_fpath = cnf.l_fpath
    sample.r_fpath = cnf.r_fpath

    # if not sample.bam and sample.l_fpath and sample.r_fpath:
    #     sample.bam = proc_fastq(cnf, sample, verify_file(cnf.l_fpath), verify_file(cnf.r_fpath))

    if not bam_fpath:
        critical(sample.name + ': BAM file is required.')

    info('Using alignment ' + sample.bam)

    target_bed = verify_file(cnf.bed, is_critical=True) if cnf.bed else None
    bam_fpath = verify_file(sample.bam, is_critical=True)
    index_bam(cnf, bam_fpath)

    gene_keys_list = None
    if cnf.prep_bed is not False:
        info('Preparing the BED file.')
        features_bed, features_no_genes_bed, target_bed, seq2c_bed = prepare_beds(
            cnf, features_bed, target_bed)

        gene_keys_set, gene_keys_list, target_bed, features_bed, features_no_genes_bed = \
            extract_gene_names_and_filter_exons(cnf, target_bed, features_bed, features_no_genes_bed)
    else:
        info('The BED file is ready, skipping preparation.')
        gene_keys_set, gene_keys_list, _, _, _ = \
            extract_gene_names_and_filter_exons(cnf, target_bed, features_bed, features_no_genes_bed)

    picard_ins_size_hist(cnf, sample, bam_fpath, output_dir)

    avg_depth, gene_by_name_and_chrom, reports = make_targqc_reports(
        cnf, output_dir, sample, bam_fpath, features_bed,
        features_no_genes_bed, target_bed, gene_keys_list)

    # #if cnf.extended:
    # try:
    #     info('Generating flagged regions report...')
    #     flagged_report = generate_flagged_regions_report(cnf, cnf.output_dir, sample, avg_depth, gene_by_name_and_chrom)
    #     if not flagged_report:
    #         err('Flagged regions report was not generated')
    #         err()
    # except:
    #     err(format_exc())

    return reports
Example #29
def proc_args(argv):
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--bam'], dict(dest='bam', )),
        ],
        required_keys=['bam'],
        file_keys=['bam'],
    )

    if not cnf.bam:
        critical('No BAM file provided to input')
    if not cnf.genome:
        critical('Please, specify the --genome option (e.g. --genome hg19)')

    check_genome_resources(cnf)

    return cnf
Example #31
def _preprocess(cnf, bed_fpath, work_dirpath, chrom_order):
    bed_params = BedParams()
    output_fpath = __intermediate_fname(work_dirpath, bed_fpath, 'prep')
    info('preprocessing: ' + bed_fpath + ' --> ' + output_fpath)
    with open(bed_fpath, 'r') as in_f:
        with open(output_fpath, 'w') as out_f:
            for line in in_f:
                if line.startswith('#') or line.startswith('track') or line.startswith('browser'):  # header
                    bed_params.header.append(line if line.startswith('#') else '#' + line)
                else:
                    cur_ncn = BedParams.calc_n_cols_needed(line)
                    if bed_params.n_cols_needed is not None and cur_ncn != bed_params.n_cols_needed:
                        critical('number and type of columns should be the same on all lines!')
                    bed_params.n_cols_needed = cur_ncn
                    if line.startswith('chr'):
                        if bed_params.GRCh_names is not None and bed_params.GRCh_names:
                            critical('mixing of GRCh and hg chromosome names!')
                        bed_params.GRCh_names = False
                        if line.startswith('chrMT'):  # common misprint, correcting chrMT --> chrM
                            processed_line = '\t'.join(['chrM'] + line.split('\t')[1:])
                        else:
                            processed_line = line
                    elif line.split('\t')[0] in BedParams.GRCh_to_hg:  # GRCh chr names
                        if bed_params.GRCh_names is not None and not bed_params.GRCh_names:
                            critical('mixing of GRCh and hg chromosome names!')
                        bed_params.GRCh_names = True
                        processed_line = '\t'.join([BedParams.GRCh_to_hg[line.split('\t')[0]]] + line.split('\t')[1:])
                    else:
                        critical('incorrect chromosome name!')

                    entries = processed_line.strip().split('\t')
                    chrom = entries[0]
                    start = int(entries[1])
                    end = int(entries[2])
                    r = Region(chrom, chrom_order.get(chrom), start, end)
                    if r.is_control():
                        r.set_symbol(entries[3] if len(entries) > 3 else '{0}:{1}-{2}'.format(chrom, start, end))
                        r.rest = entries[4:] if len(entries) > 4 else None
                        bed_params.controls.append(r)
                    else:
                        out_f.write(processed_line)
    return output_fpath, bed_params
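
BedParams.GRCh_to_hg is assumed to map Ensembl/GRCh-style chromosome names to UCSC/hg-style ones, roughly:

# GRCh_to_hg = {'1': 'chr1', '2': 'chr2', ..., 'X': 'chrX', 'Y': 'chrY', 'MT': 'chrM'}
# A GRCh line '1<TAB>100<TAB>200' is rewritten to 'chr1<TAB>100<TAB>200'; mixing both
# naming styles in one BED file triggers critical().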