Esempio n. 1
0
def prepare_resource(datadir, parameters):
    """Load the CYP2D6 resource files and bundle them into one record.

    Args:
        datadir: Directory that contains the CYP2D6 data files.
        parameters: Object exposing a ``genome`` attribute; its value is
            interpolated into the genome-specific file names.

    Returns:
        A ``resource_info`` record built from, in order: the genome name,
        the parsed GMM parameters, the parsed region file, the SNP
        positions, the target-variant positions, the homology-region
        variant positions, a dict mapping each entry of ``HAPLOTYPE_VAR``
        to its positions from the haplotype file, the list of variant
        names, and the result of ``get_hap_table`` on the star table.

    Raises:
        Exception: If one of the required data files does not exist.
    """
    genome = parameters.genome
    region_file = os.path.join(datadir, "CYP2D6_region_%s.bed" % genome)
    snp_file = os.path.join(datadir, "CYP2D6_SNP_%s.txt" % genome)
    gmm_file = os.path.join(datadir, "CYP2D6_gmm.txt")
    star_table = os.path.join(datadir, "star_table.txt")
    variant_file = os.path.join(datadir,
                                "CYP2D6_target_variant_%s.txt" % genome)
    variant_homology_file = os.path.join(
        datadir, "CYP2D6_target_variant_homology_region_%s.txt" % genome)
    haplotype_file = os.path.join(datadir, "CYP2D6_haplotype_%s.txt" % genome)

    # The star table is parsed before the existence checks and is not part
    # of the required-file list, so a missing table is reported by
    # get_hap_table itself rather than by the check below.
    star_combinations = get_hap_table(star_table)

    required_files = [
        region_file,
        snp_file,
        variant_file,
        variant_homology_file,
        haplotype_file,
        gmm_file,
    ]
    for required_file in required_files:
        if not os.path.exists(required_file):
            raise Exception("File %s not found." % required_file)

    snp_db = get_snp_position(snp_file)
    var_db = get_snp_position(variant_file)
    var_homo_db = get_snp_position(variant_homology_file)

    haplotype_db = {}
    for variant in HAPLOTYPE_VAR:
        haplotype_db.setdefault(variant,
                                get_snp_position(haplotype_file, variant))

    # The variant name is the last whitespace-separated field of every
    # non-comment line; names from the target file come first, followed by
    # those from the homology-region file.
    var_list = []
    for path in (variant_file, variant_homology_file):
        with open(path) as f:
            for line in f:
                if line.startswith("#"):
                    continue
                fields = line.split()
                # Blank / whitespace-only lines have no fields; indexing
                # [-1] on them used to raise IndexError.
                if not fields:
                    continue
                var_list.append(fields[-1])

    gmm_parameter = parse_gmm_file(gmm_file)
    region_dic = parse_region_file(region_file)
    call_parameters = resource_info(
        genome,
        gmm_parameter,
        region_dic,
        snp_db,
        var_db,
        var_homo_db,
        haplotype_db,
        var_list,
        star_combinations,
    )
    return call_parameters
Esempio n. 2
0
def main():
    """Run the CYP2D6 star-allele caller on every sample in the manifest.

    Reads the run configuration from ``load_parameters()``, loads the
    CYP2D6 resource files shipped in the ``data`` directory next to this
    module, calls ``d6_star_caller`` for each BAM listed in the manifest
    (one path per line) and writes the collected calls to
    ``<outDir>/<prefix>.json`` and ``<outDir>/<prefix>.tsv``.

    A sample is skipped with a warning when its BAM file is missing or,
    if a count-file directory was given, when its count file is missing.

    Raises:
        Exception: If one of the required data files does not exist.
    """
    parameters = load_parameters()
    manifest = parameters.manifest
    outdir = parameters.outDir
    genome = parameters.genome
    prefix = parameters.prefix
    reference_fasta = parameters.reference
    threads = parameters.threads
    path_count_file = parameters.countFilePath
    logging.basicConfig(level=logging.DEBUG)

    # Prepare data files
    datadir = os.path.join(os.path.dirname(__file__), "data")
    region_file = os.path.join(datadir, "CYP2D6_region_%s.bed" % genome)
    snp_file = os.path.join(datadir, "CYP2D6_SNP_%s.txt" % genome)
    gmm_file = os.path.join(datadir, "CYP2D6_gmm.txt")
    # Pick the star-table directory. includeNewStar is tested last, so it
    # takes precedence when both options are set.
    table_path = "full_star_table"
    if parameters.knownFunction:
        table_path = "known_function_star_table"
    if parameters.includeNewStar:
        table_path = "include_new_star_table"
    star_table = os.path.join(datadir, table_path, "star_table.txt")
    variant_file = os.path.join(datadir, table_path,
                                "CYP2D6_target_variant_%s.txt" % genome)
    variant_homology_file = os.path.join(
        datadir, table_path,
        "CYP2D6_target_variant_homology_region_%s.txt" % genome)
    # Parsed before the existence checks; the star table is not part of
    # the required-file list below.
    star_combinations = get_hap_table(star_table)

    required_files = [
        region_file,
        snp_file,
        variant_file,
        variant_homology_file,
        gmm_file,
    ]
    for required_file in required_files:
        if not os.path.exists(required_file):
            raise Exception("File %s not found." % required_file)

    if not os.path.exists(outdir):
        os.makedirs(outdir)

    snp_db = get_snp_position(snp_file)
    var_db = get_snp_position(variant_file)
    var_homo_db = get_snp_position(variant_homology_file)

    # The variant name is the last whitespace-separated field of every
    # non-comment line; target-file names first, homology-region names
    # second.
    var_list = []
    for path in (variant_file, variant_homology_file):
        with open(path) as f:
            for line in f:
                if line.startswith("#"):
                    continue
                fields = line.split()
                # Blank / whitespace-only lines used to raise IndexError.
                if not fields:
                    continue
                var_list.append(fields[-1])

    gmm_parameter = parse_gmm_file(gmm_file)
    region_dic = parse_region_file(region_file)
    resource_info = namedtuple(
        "resource_info",
        "genome gmm_parameter region_dic snp_db var_db var_homo_db var_list star_combinations",
    )
    call_parameters = resource_info(
        genome,
        gmm_parameter,
        region_dic,
        snp_db,
        var_db,
        var_homo_db,
        var_list,
        star_combinations,
    )

    out_json = os.path.join(outdir, prefix + ".json")
    out_tsv = os.path.join(outdir, prefix + ".tsv")
    final_output = {}
    with open(manifest) as read_manifest:
        for line in read_manifest:
            bam_name = line.strip()
            # Skip empty manifest lines (e.g. a trailing newline) instead
            # of warning about a sample with an empty name.
            if not bam_name:
                continue
            # Sample id is the BAM base name without its extension.
            sample_id = os.path.splitext(os.path.basename(bam_name))[0]
            count_file = None
            if path_count_file is not None:
                count_file = os.path.join(path_count_file,
                                          sample_id + "_count.txt")

            bam_missing = not os.path.exists(bam_name)
            count_missing = (count_file is not None
                             and not os.path.exists(count_file))
            if bam_missing or count_missing:
                logging.warning("Input file for sample %s does not exist.",
                                sample_id)
                continue

            logging.info("Processing sample %s at %s", sample_id,
                         datetime.datetime.now())
            cyp2d6_call = d6_star_caller(bam_name, call_parameters,
                                         threads, count_file,
                                         reference_fasta)._asdict()
            # Use normalized coverage MAD across stable regions
            # as a sample QC measure.
            if cyp2d6_call["Coverage_MAD"] > MAD_THRESHOLD:
                logging.warning(
                    "Sample %s has uneven coverage. CN calls may be unreliable.",
                    sample_id,
                )
            # setdefault keeps the first call when a sample id repeats.
            final_output.setdefault(sample_id, cyp2d6_call)

    # Write to json
    logging.info("Writing to json at %s", datetime.datetime.now())
    with open(out_json, "w") as json_output:
        json.dump(final_output, json_output)

    # Write to tsv
    logging.info("Writing to tsv at %s", datetime.datetime.now())
    header = ["Sample", "Genotype", "Filter"]
    with open(out_tsv, "w") as tsv_output:
        tsv_output.write("\t".join(header) + "\n")
        for sample_id, final_call in final_output.items():
            output_per_sample = [
                sample_id,
                final_call["Genotype"],
                final_call["Filter"],
            ]
            tsv_output.write("\t".join(str(a)
                                       for a in output_per_sample) + "\n")
Esempio n. 3
0
def main():
    """Run the SMN copy-number caller on every sample in the manifest.

    Reads the run configuration from ``load_parameters()``, loads the SMN
    resource files shipped in the ``data`` directory next to this module,
    calls ``smn_cn_caller`` for each alignment file listed in the manifest
    (one path per line) and writes the collected calls to
    ``<outDir>/<prefix>.json`` and, via ``write_to_tsv``, to
    ``<outDir>/<prefix>.tsv``.

    When a count-file directory is configured, only the existence of the
    sample's count file is checked; the alignment file itself is checked
    only when no count-file directory is given. Samples failing the check
    are skipped with a warning.

    Raises:
        Exception: If one of the required data files does not exist.
    """
    parameters = load_parameters()
    manifest = parameters.manifest
    outdir = parameters.outDir
    genome = parameters.genome
    prefix = parameters.prefix
    threads = parameters.threads
    reference_fasta = parameters.reference
    path_count_file = parameters.countFilePath
    logging.basicConfig(level=logging.DEBUG)

    datadir = os.path.join(os.path.dirname(__file__), "data")
    # Region file to use
    region_file = os.path.join(datadir, "SMN_region_%s.bed" % genome)
    snp_file = os.path.join(datadir, "SMN_SNP_%s.txt" % genome)
    variant_file = os.path.join(datadir, "SMN_target_variant_%s.txt" % genome)
    gmm_file = os.path.join(datadir, "SMN_gmm.txt")

    for required_file in [region_file, snp_file, variant_file, gmm_file]:
        if not os.path.exists(required_file):
            raise Exception('File %s not found.' % required_file)

    if not os.path.exists(outdir):
        os.makedirs(outdir)

    snp_db = get_snp_position(snp_file)
    variant_db = get_snp_position(variant_file)
    gmm_parameter = parse_gmm_file(gmm_file)
    region_dic = parse_region_file(region_file)
    out_json = os.path.join(outdir, prefix + '.json')
    out_tsv = os.path.join(outdir, prefix + '.tsv')
    final_output = {}
    with open(manifest) as read_manifest:
        for line in read_manifest:
            bam_name = line.strip()
            # Skip empty manifest lines (e.g. a trailing newline) instead
            # of treating them as a sample with an empty name.
            if not bam_name:
                continue
            # Sample id is the alignment file base name without extension.
            sample_id = os.path.splitext(os.path.basename(bam_name))[0]
            count_file = None
            if path_count_file is not None:
                count_file = os.path.join(
                    path_count_file, sample_id + '_count.txt')

            if count_file is None and not os.path.exists(bam_name):
                logging.warning(
                    'Input alignment file for sample %s does not exist.',
                    sample_id)
                continue
            if count_file is not None and not os.path.exists(count_file):
                logging.warning(
                    'Input count file for sample %s does not exist', sample_id)
                continue

            logging.info(
                'Processing sample %s at %s', sample_id,
                datetime.datetime.now()
            )
            smn_call = smn_cn_caller(
                bam_name, region_dic, gmm_parameter,
                snp_db, variant_db, threads, count_file, reference_fasta
            )
            # Use normalized coverage MAD across stable regions
            # as a sample QC measure.
            if smn_call['Coverage_MAD'] > MAD_THRESHOLD:
                # Adjacent literals are concatenated by the parser; a
                # backslash continuation inside the string would embed the
                # next line's indentation in the message.
                logging.warning(
                    "Sample %s has uneven coverage. "
                    "CN calls may be unreliable.", sample_id)
            # setdefault keeps the first call when a sample id repeats.
            final_output.setdefault(sample_id, smn_call)

    # Write to json
    logging.info('Writing to json at %s', datetime.datetime.now())
    with open(out_json, 'w') as json_output:
        json.dump(final_output, json_output)

    # Write to tsv
    logging.info('Writing to tsv at %s', datetime.datetime.now())
    write_to_tsv(final_output, out_tsv)