Esempio n. 1
0
def main():
    """Phase haploid STR calls against a local genealogy and write a VCF.

    Command-line driven. The steps are:

    1. Extract the tree covering ``--pos`` from the ``--smc`` file.
    2. Read haploid STR genotypes from ``--vcf`` (optionally restricted to
       the samples listed in ``--samps``) and pair them into pseudodiploids
       via ``pair_gts``.
    3. Build the mutation model from ``--mu``, ``--beta`` and ``--pgeom`` and
       precompute its transition probabilities.
    4. Write the pair data and the factor graph to temporary files and run
       the ``str-imputer`` executable located next to this script on them.
    5. Parse the tool's stdout with ``process_phasing_result``, print the
       accuracy string and write ``<out>_strs.vcf``.

    Terminates via ``exit`` with an error message when a genotyped sample is
    missing from the tree, when the tree has an odd number of leaves, or
    when an observed allele falls outside the mutation model's range.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--smc", required=True, dest="smc", type=str,
                        help="SMC file for the region of interest")
    parser.add_argument("--pos", required=True, dest="pos", type=int,
                        help="Position of STR in region")
    parser.add_argument("--mu", required=True, dest="mu", type=float,
                        help="Mutation rate for mutation model")
    parser.add_argument("--beta", required=True, dest="beta", type=float,
                        help="Length constraint for mutation model")
    parser.add_argument("--pgeom", required=True, dest="pgeom", type=float,
                        help="Geometric parameter for mutation model")
    parser.add_argument("--out", required=True, dest="out", type=str,
                        help="Output path prefix for phased VCF (+ _strs.vcf)")
    parser.add_argument("--vcf", required=True, dest="vcf", type=str,
                        help="Input path for VCF containing haploid STR calls")
    parser.add_argument("--samps", required=False, dest="samps", type=str,
                        help="File containing list of samples to consider")
    parser.add_argument("--thresh", required=False, dest="thresh", type=float, default=0.5,
                        help="Posterior probability threshold required to report phasing")

    # Scaling factor from edge length to # of generations
    parser.add_argument("--gen_per_len", required=False, dest="gen_per_len", type=float, default=1.0)

    # Maximum TMRCA in generations
    parser.add_argument("--max_tmrca", required=False, dest="max_tmrca", type=int, default=25000)

    args = parser.parse_args()
    tree, leaf_names, leaf_indices = extract_newick_tree_from_smc(args.smc, args.pos)
    samples = read_sample_list(args.samps) if args.samps is not None else None

    # Read haploid STR genotypes
    nrepeats_dict, median_allele = read_haploid_str_gts(args.vcf, sample_set=samples)

    # Ensure that all of the sample nodes are contained within the tree
    for sample in nrepeats_dict:
        if sample not in leaf_indices:
            exit("ERROR: Sample %s not present in provided tree" % (sample))

    # Pair haploids to construct pseudodiploids; each entry is
    # (node_id_1, node_id_2, num_repeat_a, num_repeat_b)
    pair_data = pair_gts(nrepeats_dict, leaf_indices, pairs=None)

    # Only deal with tree for diploid individuals
    if len(tree.leaf_nodes()) % 2 != 0:
        exit("ERROR: Tree contains an odd number of leaves")

    # Construct the mutation model
    print("Constructing the mutation model")
    allele_range, max_step = determine_allele_range(args.max_tmrca, args.mu, args.beta, args.pgeom, 0, 0)
    min_allele = -allele_range - max_step
    max_allele = allele_range + max_step
    mut_model = OUGeomSTRMutationModel(args.pgeom, args.mu, args.beta, allele_range, max_step=max_step)
    print("Min allele = %d, Max allele = %d" % (min_allele, max_allele))

    # Ensure that the observed genotypes are within the allele range
    observed_alleles = [allele for pair in pair_data for allele in (pair[2], pair[3])]
    if observed_alleles:
        min_obs_allele = min(observed_alleles)
        max_obs_allele = max(observed_alleles)
        if min_obs_allele < min_allele or max_obs_allele > max_allele:
            exit("ERROR: Observed allele not within mutation model's allele range: (%d, %d)" % (min_obs_allele, max_obs_allele))

    # Precompute the transition probabilities
    print("Calculating the transition probabilities")
    optimizer = MATRIX_OPTIMIZER(mut_model.trans_matrix, mut_model.min_n)
    optimizer.precompute_results()

    # Every temporary path is registered here so the finally-clause can
    # remove it even if writing the graph or running the tool fails.
    temp_paths = []
    try:
        # Write out fixed paired information. mkstemp returns an already-open
        # descriptor; wrap it instead of reopening the path so it is not leaked.
        pairs_fd, pairs_file = tempfile.mkstemp()
        temp_paths.append(pairs_file)
        with os.fdopen(pairs_fd, "w") as output:
            for pair in pair_data:
                # Alleles are shifted by min_allele so they are written 0-based
                output.write("%d\t%d\t%d\t%d\n" % (pair[0], pair[1], pair[2] - min_allele, pair[3] - min_allele))

        # Write out the factor graph file. write_factor_graph takes a path,
        # so the descriptor from mkstemp is closed immediately.
        graph_fd, graph_file = tempfile.mkstemp()
        os.close(graph_fd)
        temp_paths.append(graph_file)
        write_factor_graph(tree, optimizer, pair_data, min_allele, max_allele, args.gen_per_len, graph_file)

        # Run the C++ package that lives alongside this script
        phase_cmd_path = os.path.dirname(os.path.realpath(__file__)) + "/str-imputer"
        phase_cmd = [phase_cmd_path, graph_file, pairs_file]
        proc = subprocess.Popen(phase_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        # TO DO: Utilize stderr messages to ensure convergence
        stdout, _ = proc.communicate()
        res = stdout.strip()
    finally:
        # Best-effort removal, mirroring the previous `rm -f` semantics
        for path in temp_paths:
            try:
                os.remove(path)
            except OSError:
                pass

    # Assess accuracy, output the statistics and determine the new genotypes associated with each sample
    phased_repeat_dict, accuracy_string = process_phasing_result(res, leaf_names, min_allele, min_confidence=args.thresh)
    print(accuracy_string)

    # Construct a new VCF containing the phased alleles
    write_vcf(args.vcf, phased_repeat_dict, median_allele, args.out + "_strs.vcf")
Esempio n. 2
0
def main():
    """Phase or impute STR genotypes against a local genealogy.

    Command-line driven; exactly one of ``--phase`` or ``--impute`` must be
    given. The steps are:

    1. Extract the tree covering ``--pos`` from the ``--smc`` file.
    2. Read STR genotypes for ``--chrom``/``--pos`` from ``--vcf`` (diploid
       calls when ``--diploid`` is set, haploid otherwise) and build the
       pair data via ``pair_gts``. For diploid input the ``<sample>_1`` and
       ``<sample>_2`` nodes are paired.
    3. Build the mutation model from ``--mu``, ``--beta`` and ``--pgeom`` and
       precompute its transition probabilities.
    4. Write the pair data, the ids of tree leaves absent from the VCF, and
       the factor graph to temporary files, then run the ``Phaser``
       executable located next to this script.
    5. Parse the tool's stdout and write ``<out>_phased_strs.vcf`` or
       ``<out>_imputed_strs.vcf``.

    Terminates via ``exit`` with an error message when the mode flags are
    invalid, a genotyped sample is missing from the tree, the tree has an
    odd number of leaves, or an observed allele falls outside the mutation
    model's range.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--smc", required=True, dest="smc", type=str,
                        help="SMC file for the region of interest")
    parser.add_argument("--chrom", required=True, dest="chrom", type=str,
                        help="STR's Chromosome")
    parser.add_argument("--pos", required=True, dest="pos", type=int,
                        help="Position of STR in region")
    parser.add_argument("--mu", required=True, dest="mu", type=float,
                        help="Mutation rate for mutation model")
    parser.add_argument("--beta", required=True, dest="beta", type=float,
                        help="Length constraint for mutation model")
    parser.add_argument("--pgeom", required=True, dest="pgeom", type=float,
                        help="Geometric parameter for mutation model")
    parser.add_argument("--out", required=True, dest="out", type=str,
                        help="Output path prefix for imputed/phased VCF")
    parser.add_argument("--vcf", required=True, dest="vcf", type=str,
                        help="VCF containing STR calls")
    parser.add_argument("--samps", required=False, dest="samps", type=str,
                        help="File containing list of samples to consider")
    parser.add_argument("--thresh", required=False, dest="thresh", type=float, default=0.5,
                        help="Posterior probability threshold required to report phasing")
    parser.add_argument("--phase", required=False, dest="phase", action="store_true", default=False,
                        help="Output phasing statistics")
    parser.add_argument("--impute", required=False, dest="impute", action="store_true", default=False,
                        help="Output imputation statistics")
    parser.add_argument("--diploid", required=False, dest="diploid", action="store_true", default=False,
                        help="VCF contains diploid STR calls (instead of haploid calls)")

    # Scaling factor from edge length to # of generations
    parser.add_argument("--gen_per_len", required=False, dest="gen_per_len", type=float, default=1.0)

    # Maximum TMRCA in generations
    parser.add_argument("--max_tmrca", required=False, dest="max_tmrca", type=int, default=25000)

    args = parser.parse_args()

    # Both flags are booleans, so equality means "neither" or "both"
    if args.phase == args.impute:
        exit("ERROR: Exactly one of --phase or --impute must be specified. Exiting...")

    tree, leaf_names, leaf_indices = extract_newick_tree_from_smc(args.smc, args.pos)
    samples = read_sample_list(args.samps) if args.samps is not None else None

    # Read STR genotypes
    read_gts = read_diploid_str_gts if args.diploid else read_haploid_str_gts
    nrepeats_dict, median_allele = read_gts(args.vcf, args.chrom, args.pos, sample_set=samples)

    # Ensure that all of the sample nodes are contained within the tree
    for sample in nrepeats_dict:
        if sample not in leaf_indices:
            exit("ERROR: Sample %s not present in provided tree" % (sample))

    if args.diploid:
        # Pair _1 and _2 nodes together. Built as a real list so it can be
        # traversed more than once regardless of Python version.
        sample_names = list(set(name.split("_")[0] for name in nrepeats_dict))
        pairs = [(name + "_1", name + "_2") for name in sample_names]
        pair_data = pair_gts(nrepeats_dict, leaf_indices, pairs=pairs)
    else:
        # Pair haploids to construct pseudodiploids; each entry is
        # (node_id_1, node_id_2, num_repeat_a, num_repeat_b)
        pair_data = pair_gts(nrepeats_dict, leaf_indices, pairs=None)

    # Only deal with tree for even number of chromosomes
    if len(tree.leaf_nodes()) % 2 != 0:
        exit("ERROR: Tree contains an odd number of leaves")

    # Construct the mutation model
    print("Constructing the mutation model")
    allele_range, max_step = determine_allele_range(args.max_tmrca, args.mu, args.beta, args.pgeom, 0, 0)
    min_allele = -allele_range - max_step
    max_allele = allele_range + max_step
    mut_model = OUGeomSTRMutationModel(args.pgeom, args.mu, args.beta, allele_range, max_step=max_step)
    print("Min allele = %d, Max allele = %d" % (min_allele, max_allele))

    # Ensure that the observed genotypes are within the allele range
    observed_alleles = [allele for pair in pair_data for allele in (pair[2], pair[3])]
    if observed_alleles:
        min_obs_allele = min(observed_alleles)
        max_obs_allele = max(observed_alleles)
        if min_obs_allele < min_allele or max_obs_allele > max_allele:
            exit("ERROR: Observed allele not within mutation model's allele range: (%d, %d)" % (min_obs_allele, max_obs_allele))

    # Precompute the transition probabilities
    print("Calculating the transition probabilities")
    optimizer = MATRIX_OPTIMIZER(mut_model.trans_matrix, mut_model.min_n)
    optimizer.precompute_results()

    # Every temporary path is registered here so the finally-clause can
    # remove it even if writing the graph or running the tool fails.
    temp_paths = []
    try:
        # Write out paired information for known diploid GTs. mkstemp returns
        # an already-open descriptor; wrap it rather than leaking it.
        pairs_fd, pairs_file = tempfile.mkstemp()
        temp_paths.append(pairs_file)
        with os.fdopen(pairs_fd, "w") as output:
            for pair in pair_data:
                # Alleles are shifted by min_allele so they are written 0-based
                output.write("%d\t%d\t%d\t%d\n" % (pair[0], pair[1], pair[2] - min_allele, pair[3] - min_allele))

        # Write out ids for any leaves not included in the VCF (for potential imputation queries)
        ids_fd, ids_file = tempfile.mkstemp()
        temp_paths.append(ids_file)
        with os.fdopen(ids_fd, "w") as output:
            for leaf_name, leaf_id in leaf_indices.items():
                if leaf_name not in nrepeats_dict:
                    output.write("%d\n" % (leaf_id))

        # Write out the factor graph file. write_factor_graph takes a path,
        # so the descriptor from mkstemp is closed immediately.
        graph_fd, graph_file = tempfile.mkstemp()
        os.close(graph_fd)
        temp_paths.append(graph_file)
        write_factor_graph(tree, optimizer, pair_data, min_allele, max_allele, args.gen_per_len, graph_file)

        # Run the C++ package that lives alongside this script
        # TO DO: Utilize stderr messages to ensure convergence
        cmd_path = os.path.dirname(os.path.realpath(__file__)) + "/Phaser"
        if args.phase:
            cmd = [cmd_path, "--factor-graph", graph_file, "--pair-file", pairs_file]
        else:
            # Exactly one mode is set (checked above), so this is --impute
            cmd = [cmd_path, "--factor-graph", graph_file, "--id-file", ids_file]
        print("Running message-passing tool")

        # stderr is intentionally not captured, so the tool's diagnostics
        # pass straight through to this process's stderr.
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
        res = stdout.strip()
    finally:
        # Best-effort removal, mirroring the previous `rm -f` semantics
        for path in temp_paths:
            try:
                os.remove(path)
            except OSError:
                pass

    write_vcf_fn = write_diploid_vcf if args.diploid else write_haploid_vcf

    if args.phase:
        # Determine the new genotypes associated with each sample; the
        # accuracy string is returned as well but is currently not reported.
        phased_repeat_dict, accuracy_string = process_phasing_result(res, leaf_names, min_allele, min_confidence=args.thresh)

        # Construct a new VCF containing the phased alleles
        write_vcf_fn(args.vcf, args.chrom, args.pos, phased_repeat_dict, median_allele, args.out + "_phased_strs.vcf")
    else:
        # Determine the most probable posterior genotype for each sample
        imputed_repeat_dict, posterior_dict, dosage_dict = process_imputation_result(res, leaf_names, min_allele, max_allele)
        write_vcf_fn(args.vcf, args.chrom, args.pos, imputed_repeat_dict, median_allele, args.out + "_imputed_strs.vcf",
                     posteriors=posterior_dict, dosages=dosage_dict)