def main():
    description = "Pick out the multi-exon genes from a dataset and generate families."
    args = parse_arguments(description, ["features_file", "genome", "dataset", "fasta"])
    [features_file, genome, dataset, fasta] = [args.features_file, args.genome, args.dataset, args.fasta]

    #set up global feature set and get relevant sequence features from it
    fs = Feature_Set(features_file, genome)
    fs.set_dataset(dataset)
    exons = fs.get_exons()
    exon_numbers = fs.get_exon_numbers(exons)

    output_fasta_name = "{0}_multiexon.fasta".format(fasta[:-6])

    #get multi-exon genes
    multi_exon = [i for i in exon_numbers if exon_numbers[i] > 1]

    #create a new feature set for multi-exon genes only
    fs_new = Feature_Set(features_file, genome)
    fs_new.create_dataset("{0}_multiexon".format(dataset), input_list = multi_exon)
    fs_new.set_dataset("{0}_multiexon".format(dataset))

    #also write a fasta with the ORF sequences
    names, seqs = rw.read_fasta(fasta)
    seqs = [seqs[pos] for pos, i in enumerate(names) if i in multi_exon]
    names = [i for i in names if i in multi_exon]
    rw.write_to_fasta(names, seqs, output_fasta_name)

    #find paralogous families
    transcripts = fs_new.get_transcripts()
    gene_name_dict = fs_new.get_gene_name_dict(transcripts)

    conservation.find_families(output_fasta_name, "general/{0}_multiexon".format(dataset))
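
# A hypothetical invocation of this script (the script and file names below are
# placeholders; the positional arguments follow the parse_arguments list above):
#   python pick_multiexon_genes.py Homo_sapiens.gtf hg38 my_dataset my_dataset_CDS.fasta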
def main():

    description = "Write the median motif lengths of a series of motif sets to file."
    args = parse_arguments(description, ["input_file", "output_file"])
    [input_file, output_file] = [args.input_file, args.output_file]
    
    #parse motifs from FASTA
    names, motifs = rw.read_fasta(input_file)
    motifs = [i.split("|") for i in motifs]
    motif_lengths = [[len(j) for j in i] for i in motifs]
    #write out and print the median motif length of each set
    with open(output_file, "w") as file:
        for pos, lengths_list in enumerate(motif_lengths):
            file.write("{0}\t{1}\n".format(names[pos], np.median(lengths_list)))
            print(np.median(lengths_list))
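
# A minimal sketch of the expected input (hypothetical contents): each FASTA
# record holds one motif set, with the motifs joined by pipes, e.g.
#   >ESE_set
#   GAAGAA|TCCTC|CTCAA
# for which the script would write "ESE_set\t5.0" (the median of lengths 6, 5 and 5).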
def CpG_frequency(fasta, hits, controls):
    '''
    Compare the CpG frequency at hit vs control sites.
    '''
    #parse fasta into dictionary
    names, seqs = rw.read_fasta(fasta)
    seqs = {names[i]: seqs[i] for i in range(len(names))}
    hit_site_counter = 0
    hit_CpG_counter = 0
    control_site_counter = 0
    control_CpG_counter = 0
    #for each transcript
    for name in hits:
        seq = seqs[name]
        #get all dinucleotides in hits/controls
        current_true_dints = [
            seq[i - 1:i + 1] for i in hits[name] if i != 0
        ] + [seq[i:i + 2] for i in hits[name] if i != (len(seq) - 1)]
        current_control_dints = [
            seq[i - 1:i + 1] for i in controls[name] if i != 0
        ] + [seq[i:i + 2] for i in controls[name] if i != (len(seq) - 1)]
        #store total number of sites
        hit_site_counter = hit_site_counter + len(current_true_dints)
        control_site_counter = control_site_counter + len(
            current_control_dints)
        #check how many are CpG/GpC
        hit_CpG_counter = hit_CpG_counter + len(
            [i for i in current_true_dints if i == "CG" or i == "GC"])
        control_CpG_counter = control_CpG_counter + len(
            [i for i in current_control_dints if i == "CG" or i == "GC"])
    #calculate overall frequency
    hit_freq = hit_CpG_counter / hit_site_counter
    control_freq = control_CpG_counter / control_site_counter
    print("Hit CpG frequency: {0}.".format(hit_freq))
    print("Control CpG frequency: {0}.".format(control_freq))
    return (hit_freq, control_freq)
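
# A worked toy example for CpG_frequency (hypothetical FASTA and positions;
# sites are 0-based indices into each transcript sequence):
#   with seq "ACGT" and hits = {"t1": [1]}, the hit dinucleotides are "AC"
#   (seq[0:2]) and "CG" (seq[1:3]), so one of two is CpG and hit_freq == 0.5;
#   control positions are scored in exactly the same way.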
def main():  
    parser = argparse.ArgumentParser(description="Prepare a clean dataset of protein-coding genes.")
    parser.add_argument("features_file_name", type = str, help = "name of GTF file with genome features")
    parser.add_argument("ortholog_features_file_name", type = str, help = "name of GTF file with genome features for the orthologous genome")
    parser.add_argument("genome", type = str, help = "genome assembly name")
    parser.add_argument("ortholog_genome", type = str, help = "ortholog genome assembly name")
    parser.add_argument("dataset_name", type = str, help = "dataset name")
    parser.add_argument("ortholog_dataset_name", type = str, help = "ortholog dataset name")
    parser.add_argument("orthologs_file_name", type = str, help = "csv with orthologous pairs")
    parser.add_argument("dS_threshold", type = float, help = "csv with orthologus pair")
    parser.add_argument("alignment_folder", type = str, help = "folder where phy alignment files will be stored")
    parser.add_argument("raw_orth_seq_file", type = str, help = "file with the raw ortholog CDS sequences (downloaded via ensembl biomart)")

    args = parser.parse_args()
    [features_file_name, ortholog_features_file_name, genome, ortholog_genome,
     dataset_name, ortholog_dataset_name, orthologs_file_name, dS_threshold,
     alignment_folder, raw_orth_seq_file] = [args.features_file_name,
                                             args.ortholog_features_file_name,
                                             args.genome, args.ortholog_genome,
                                             args.dataset_name, args.ortholog_dataset_name,
                                             args.orthologs_file_name, args.dS_threshold,
                                             args.alignment_folder, args.raw_orth_seq_file]
    make_dir(alignment_folder)
    trans_id_pattern = re.compile(r"ENS\w*T\d*")
    ids_to_keep = []
    #loop over an ensembl GTF file
    with open(features_file_name) as features_file:
        #skip the metadata
        for i in range(5):
            features_file.readline()
            
        for i in features_file:
            #only consider features that have been localized to chromosomes and that are from protein-coding genes
            if "PATCH" not in i and "gene_biotype \"protein_coding\"" in i and i[0] in "123456789XY" and i[1] in "0123456789XY\t":
                trans_id_obj = re.search(trans_id_pattern, i)
                if trans_id_obj:
                    trans_id = trans_id_obj.group(0)
                    #store the transcript ID
                    ids_to_keep.append(trans_id)

    #make a list of the unique transcript IDs you got in the previous step
    ids_to_keep = list(set(ids_to_keep))

    #create a feature set object from the transcript IDs,
    #that is to say, make a file that has all the associated gene feature annotations
    fs = Feature_Set(features_file_name, genome)
    #the dataset only needs to be created if it didn't exist previously
##    fs.create_dataset(dataset_name, input_list = ids_to_keep)
    fs.set_dataset(dataset_name)
    print("Created dataset with {0} transcripts.".format(len(fs.names)))
    #this file will have the mappings between genes from the focal species and genes from the orthologous species
    final_pairs_file_name = "general/{0}_{1}_pc_pairs.csv".format(genome, ortholog_genome)

    CDS = fs.get_CDS()
    CDS = {i: CDS[i] for i in CDS if CDS[i]}
    #write the full ORF sequences of the genes to FASTA, filtering based on reading frame integrity. Also check that
    #there are no premature termination codons.
    fs.write_full_CDS(CDS, check_ORF = True, bare_name = True, PTC_check = True)

    ids_to_keep = rw.read_fasta("{0}_{1}_full_CDS.fasta".format(fs.features_file_name[:-4], fs.dataset))[0]

    print("{0} transcripts pass the check for ORF integrity.".format(len(ids_to_keep)))

    transcripts = fs.get_transcripts()
    transcripts = {i: transcripts[i] for i in ids_to_keep}

    #for genes with several associated transcript IDs, only keep the longest.   
    gene_name_dict = fs.get_gene_name_dict(transcripts)
    ids_to_keep = []
    for gene in gene_name_dict:
        current_CDS = [CDS[j] for j in gene_name_dict[gene]]
        current_lengths = [sum([j[0][3] - j[0][2] + 1 for j in k]) for k in current_CDS]
        id_to_keep = gene_name_dict[gene][current_lengths.index(max(current_lengths))]
        ids_to_keep.append(id_to_keep)

    print("After only keeping one transcript per gene (the longest), {0} transcripts remain.".format(len(ids_to_keep)))

    #this is a file that has the orthologs of your genes from Ensembl BioMart
    orth_data = rw.read_many_fields(orthologs_file_name, ",")
    #make a dictionary for the gene-to-ortholog mapping
    pairs_dict = {}

    for line in orth_data:
        if line[1] not in pairs_dict:
            pairs_dict[line[1]] = []
        pairs_dict[line[1]].append(line[2])

    #only keep genes for which there is an ortholog in the comparator species
    #transcript identifiers
    ids_to_keep = [i for i in ids_to_keep if i in pairs_dict]

    #gene identifiers
    orth_ids_to_keep = list(pairs_dict.values())
    orth_ids_to_keep = list(set(flatten(orth_ids_to_keep)))

    #create a feature set for the other species based on the genes that are orthologous to the genes in your focal set
    orth_fs = Feature_Set(ortholog_features_file_name, ortholog_genome)
##    orth_fs.create_dataset(ortholog_dataset_name, input_list = orth_ids_to_keep, input_type = "gene")
    orth_fs.set_dataset(ortholog_dataset_name)
    orth_CDS = orth_fs.get_CDS()
    orth_CDS = {i: orth_CDS[i] for i in orth_CDS if orth_CDS[i]}
    #write the ortholog ORFs to FASTA. Filter based on reading frame integrity and PTC content.
    orth_fs.write_full_CDS(orth_CDS, check_ORF = True, bare_name = True, PTC_check = True)
    orth_full_CDS_file = "{0}_{1}_full_CDS.fasta".format(ortholog_features_file_name[:-4], ortholog_dataset_name)

    #in some cases, if the genome assembly for the ortholog is not very good, it can take forever to get the sequences using faidx.
    #In that case, you can get the sequences via biomart. Uncomment the code below!
##    rw.write_names(list(orth_CDS.keys()), "general/{0}_trans_IDs.txt".format(ortholog_dataset_name))
##    with open(raw_orth_seq_file) as file:
##        raw_orth_seq = "".join(file)
##    raw_orth_seq = re.sub("([A-Z])\n([A-Z])", "\\1\\2", raw_orth_seq)
##    raw_orth_seq = raw_orth_seq.split("\n")
##    raw_orth_seq = [i for i in raw_orth_seq if len(i) > 0]
##    raw_orth_names = [i for i in raw_orth_seq if i[0] == ">"]
##    raw_orth_seq = [i for i in raw_orth_seq if i[0] != ">"]

##    with open(orth_full_CDS_file, "w") as file:
##        for pos, seq in enumerate(raw_orth_seq):
##            ORF_check = check_ORF_integrity(seq, PTC_check = True)
##            if ORF_check[0]:
##                file.write("{0}\n".format(raw_orth_names[pos]))
##                file.write("{0}\n".format(seq))
##            else:
##                print(pos)
##                print(ORF_check[1])
##                print(raw_orth_names[pos])
##                print(seq)
##                print("\n")            

    #read in the full ORF sequences from both species
    CDS_names, CDS_seq = rw.read_fasta("{0}_{1}_full_CDS.fasta".format(fs.features_file_name[:-4], fs.dataset))
    orth_CDS_names, orth_CDS_seq = rw.read_fasta(orth_full_CDS_file)

    orth_transcripts = orth_fs.get_transcripts()

    orth_gene_name_dict = orth_fs.get_gene_name_dict(orth_transcripts)

    final_pairs = {}
    counter = 0
    #loop over the remaining genes
    for i in ids_to_keep:
        if counter%1000 == 0:
            print(counter)
        counter = counter + 1
        #get the IDs of the orthologous genes in the ortholog species
        orth_ids = pairs_dict[i]
        #get all the associated transcript identifiers
        orth_ids_trans = flatten([orth_gene_name_dict[j] for j in orth_ids if j in orth_gene_name_dict])
        CDS = CDS_seq[CDS_names.index(i)]
        orth_CDS = []
        ids_to_remove = []
        #get all the ortholog ORF sequences
        for j in orth_ids_trans:
            try:
                current_CDS = orth_CDS_seq[orth_CDS_names.index(j)]
                orth_CDS.append(current_CDS)
            #this is because some of the transcripts produced from the gene might be non-coding or have a wonky ORF and therefore not appear in the CDS fasta
            except ValueError:
                ids_to_remove.append(j)
        orth_ids_trans = [j for j in orth_ids_trans if j not in ids_to_remove]
        #check that the sequence from the focal species aligns to an ortholog with dN/dS below 0.5 and dS below the specified threshold
        if orth_ids_trans:
            conservation_check = keep_conserved_pc(i, orth_ids_trans, CDS, orth_CDS, dS_threshold, alignment_folder)
            if conservation_check[0]:
                #also store which ortholog transcript gave the lowest dS in the alignment
                final_pairs[i] = conservation_check[1]
            
    print("After filtering by conservation, {0} transcripts remain.".format(len(list(final_pairs.values()))))
    #write the final retained ortholog gene pairs to file
    with open(final_pairs_file_name, "w") as file:
        output_writer = csv.writer(file, delimiter = ",")
        for i in final_pairs:
            output_writer.writerow([i, final_pairs[i]])

    print("Wrote ortholog pairs to {0}.".format(final_pairs_file_name))

    #write the remaining ORF sequences to fasta
    CDS_seq = [i for pos, i in enumerate(CDS_seq) if CDS_names[pos] in final_pairs]
    CDS_names = [i for i in CDS_names if i in final_pairs]
    rw.write_to_fasta(CDS_names, CDS_seq, "general/filtered_{0}_wo_low_omega.fasta".format(dataset_name))

    #create a feature set with the remaining genes
    filtered_fs = Feature_Set(features_file_name, genome)
    filtered_fs.create_dataset("filtered_{0}".format(dataset_name), input_list = list(final_pairs.keys()))
    print("All done.")
def main():

    description = "Calculate the combined density of a set of motif sets."
    args = parse_arguments(description, ["motifs_file_name", "summary_file_name", "dataset_name", "correspondances_file_name", "alignment_folder_name", "output_folder_name", "output_file_name", "n_sim", "features_file_name", "genome", "families_file_name", "fasta_name", "ND_column", "output_suffix", "validity_folder_name", "negative_ND", "new_filters", "upper_quarter", "lower_quarter", "full_set", "gene_families", "newer_filters", "baseml"], ints = [7, 12], flags = [15, 16, 17, 18, 19, 20, 21, 22])
    [motifs_file_name, summary_file_name, dataset_name, correspondances_file_name,
     alignment_folder_name, output_folder_name, output_file_name, n_sim,
     features_file_name, genome, families_file_name, fasta_name, ND_column,
     output_suffix, validity_folder_name, negative_ND, new_filters, upper_quarter,
     lower_quarter, full_set, gene_families, newer_filters, baseml] = [args.motifs_file_name,
     args.summary_file_name, args.dataset_name, args.correspondances_file_name,
     args.alignment_folder_name, args.output_folder_name, args.output_file_name,
     args.n_sim, args.features_file_name, args.genome, args.families_file_name,
     args.fasta_name, args.ND_column, args.output_suffix, args.validity_folder_name,
     args.negative_ND, args.new_filters, args.upper_quarter, args.lower_quarter,
     args.full_set, args.gene_families, args.newer_filters, args.baseml]

    #make a dictionary with RBPs as keys and ND/p values as values.
    if summary_file_name != "None":
        summary_data = rw.read_many_fields(summary_file_name, "\t")
        #because some of the files are tab-separated, while others are comma-separated and have a header row
        if len(summary_data[0]) == 1:
            summary_data = rw.read_many_fields(summary_file_name, ",")
            summary_data = summary_data[1:]

        summary_dict = list_to_dict(summary_data, 0, ND_column, floatify = True)
            
    #make a dictionary with RBPs as keys and lists of associated motifs as values        
    motifs = rw.read_motifs(motifs_file_name)

    #if you only want to be using a subset of the motifs
    if not full_set:
        #which RBPs fulfill the necessary information content criteria?
        validity = rw.read_many_fields("{0}/sufficient_information_fraction05.csv".format(validity_folder_name), "\t")
        validity = list_to_dict(validity, 0, 1)
        #motifs with negative ND
        if negative_ND:
            motifs = [motifs[RBP] for RBP in motifs if (summary_dict[RBP] < 0) and (validity[RBP] == "True")]
        #the most significantly enriched motifs
        elif upper_quarter:
            motifs = [motifs[RBP] for RBP in motifs if (summary_dict[RBP] < 0.1) and (validity[RBP] == "True")]
        #the most significantly depleted motifs
        elif lower_quarter:
            motifs = [motifs[RBP] for RBP in motifs if (summary_dict[RBP] > 0.9) and (validity[RBP] == "True")]
        #motifs with positive ND
        else:
            motifs = [motifs[RBP] for RBP in motifs if (summary_dict[RBP] >= 0) and (validity[RBP] == "True")]

    #shove all the remaining motifs into a great big flattened and uniquified bag
    motifs = list(set(flatten(list(motifs.values()))))

    make_dir(output_folder_name)

    #prepare a Feature_Set object (a genome GTF associated with a particular genome assembly and a set of transcript identifiers)
    if features_file_name != "None":
        fs = Feature_Set(features_file_name, genome)
        fs.set_dataset(dataset_name)
        transcripts = fs.get_transcripts()
        CDS = fs.get_CDS()
        #paralogous families
        families = rw.read_families(families_file_name)
        #the families file might use gene identifiers, whereas the Feature_Set object uses transcript identifiers
        if gene_families:
            families = fs.convert_families_to_ENST(families, transcripts)
        fs.add_families(families)
        #pick a random member from each paralogous family
        picked_trans = fs.pick_random_members()
        names = rw.read_fasta(fasta_name)[0]
        if picked_trans[0] not in names:
            picked = [fs.convert_between_ENST_and_ENSG(i, transcripts, "ENSG") for i in picked_trans]
        else:
            picked = picked_trans
        print(len(picked))
    else:
        picked = None

    if baseml:
        method = "baseml"
    else:
        method = "gy"

    #write the input data for the conservation analysis into a file
    input_dict_file_name = "temp_data/temp_{0}.txt".format(random.random())
    conservation.input_dict_for_dS(correspondances_file_name, alignment_folder_name, fasta_name, input_dict_file_name, picked = picked)
    with open(output_file_name, "w") as file:
        file.write(",".join(["real_dS", "mean_sim_dS", "norm_dS", "p", "motif_number"]))
        file.write("\n")
        #make n_sim simulant sets for the motifs, filtering the simulants based on different sets of criteria
        if new_filters:
            simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, seed = 1)
        elif newer_filters:
            simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, seed = 1, no_duplicates = True, concat = False)               
        else:
            simulants = nc.make_simulants(motifs, n_sim, seed = 100)
        #file where the simulants dS values will be stored
        sim_output_file_name = "{0}/{1}_sim_ds.csv".format(output_folder_name, output_suffix)
        #calculate dS within motifs and simulants
        output_dict = conservation.dS_from_hits(motifs, alignment_folder_name, input_dict_file_name, n_sim = n_sim, simulants = simulants, sim_output_file_name = sim_output_file_name, method = method)
        print(output_dict)
        print("\n")
        #write to output file
        if output_dict is not None:
            file.write(",".join([str(output_dict["dS"]), str(output_dict["mean simulated dS"]), str(output_dict["normalized dS"]), str(output_dict["effective p"]), str(len(motifs))]))
        else:
            file.write(",".join([str(None), str(None), str(None), str(None), str(None)]))
    os.remove(input_dict_file_name)
def test_get_GC4(self):
    names, sequences = rw.read_fasta("tests/test_get_GC4_input.fasta")
    expected = [1.0, 1.0, 1.0, 0.5, 2/3]
    phases = [2, 0, 0, 1, 0]
    observed = [get_GC4(sequences[i], phases[i]) for i in range(len(sequences))]
    self.assertEqual(observed, expected)
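
# GC4 is the GC content at fourfold degenerate (third codon) positions. An
# illustrative sketch (hypothetical call, assuming phase 0 means the sequence
# starts in-frame): get_GC4("GGATTT", 0) should presumably return 0.0, because
# GGA (glycine, GGN) contributes one fourfold degenerate site occupied by A,
# while TTT (phenylalanine) contributes none.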
def main():
    parser = argparse.ArgumentParser(description="Calculate the conservation level of a series of RBP motifs.")
    parser.add_argument("features_file_name", type = str, help = "name of GTF file with genome features")
    parser.add_argument("dataset_name", type = str, help = "dataset name")
    parser.add_argument("genome", type = str, help = "genome assembly name")
    parser.add_argument("RBP_file_name", type = str, help = "name of file with RBP motifs")
    parser.add_argument("correspondances_file_name", type = str, help = "name of file with correspondances between genes in dataset and orthologs")
    parser.add_argument("fasta_file_name", type = str, help = "name of fasta file with the sequences")
    parser.add_argument("families_file_name", type = str, help = "name of file that contains families")
    parser.add_argument("output_file_name", type = str, help = "file for output data")
    parser.add_argument("output_folder_name", type = str, help = "folder that will contain simulated dS scores")
    parser.add_argument("alignment_folder_name", type = str, help = "name of folder that contains alignments")
    parser.add_argument("n_sim", type = int, help = "number of simulants")
    parser.add_argument("--valid_file", nargs = "?", const = "False")
    parser.add_argument("--gene_families", action = "store_true", help = "does the families file use gene identifiers?")
    parser.add_argument("--markov", dest = "markov", action = "store_true", help = "Should simulants be generated using a Markov model?")
    parser.add_argument("--new_filters", dest = "new_filters", action = "store_true", help = "Should simulants be generated using the old method but capping mononucleotide runs and removing existing motifs?")
    parser.add_argument("--newer_filters", dest = "newer_filters", action = "store_true", help = "Like new_filters but without concatenation and without allowing duplicates within simulant sets.")
    parser.add_argument("--goldman_yang", dest = "goldman_yang", action = "store_true", help = "Should Goldman & Yang's method be used for calculating dS?")
    parser.add_argument("--baseml", dest = "baseml", action = "store_true", help = "Should baseml be used instead of codeml?")
    args = parser.parse_args()
    [features_file_name, dataset_name, genome, RBP_file_name, correspondances_file_name,
     fasta_file_name, families_file_name, output_file_name, output_folder_name,
     alignment_folder_name, n_sim, valid_file, gene_families, markov, new_filters,
     newer_filters, goldman_yang, baseml] = [args.features_file_name, args.dataset_name,
     args.genome, args.RBP_file_name, args.correspondances_file_name, args.fasta_file_name,
     args.families_file_name, args.output_file_name, args.output_folder_name,
     args.alignment_folder_name, args.n_sim, args.valid_file, args.gene_families,
     args.markov, args.new_filters, args.newer_filters, args.goldman_yang, args.baseml]

    #pick a random member from each paralogous family
    if features_file_name != "None":
        fs = Feature_Set(features_file_name, genome)
        fs.set_dataset(dataset_name)
        #transcripts are needed both for the family conversion and the ID conversion below
        transcripts = fs.get_transcripts()
        families = rw.read_families(families_file_name)
        #if the families file uses gene identifiers rather than transcript identifiers
        if gene_families:
            families = fs.convert_families_to_ENST(families, transcripts)
        fs.add_families(families)
        picked_trans = fs.pick_random_members()
        #if the fasta uses gene identifiers but the feature set uses transcript identifiers
        names = rw.read_fasta(fasta_file_name)[0]
        if picked_trans[0] not in names:
            picked = [fs.convert_between_ENST_and_ENSG(i, transcripts, "ENSG") for i in picked_trans]
        else:
            picked = picked_trans
        print(len(picked))
    else:
        picked = None

    motif_dict = rw.read_motifs(RBP_file_name)

    #valid_file says which proteins pass information content criteria. Only analyze the ones that do.
    if not valid_file:
        validity = rw.read_many_fields("{0}/sufficient_information_fraction05.csv".format(output_folder_name), "\t")
        validity = list_to_dict(validity, 0, 1)
    elif valid_file == "None":
        validity = {i: "True" for i in motif_dict}
    else:
        validity = rw.read_many_fields(valid_file, "\t")        
        validity = list_to_dict(validity, 0, 1)
    protein_names = sorted([name for name in list(motif_dict.keys()) if validity[name] == "True"])

    #whether to use baseml, codeml (Goldman & Yang) or yn00.
    if baseml:
        method = "baseml"
    elif goldman_yang:
        method = "gy"
    else:
        method = "yn"

    #write the input data for the conservation analysis to file
    input_dict_file_name = "temp_data/temp_{0}.txt".format(random.random())
    conservation.input_dict_for_dS(correspondances_file_name, alignment_folder_name, fasta_file_name, input_dict_file_name, picked = picked)
    with open(output_file_name, "w") as file:
        file.write(",".join(["protein_name", "real_dS", "mean_sim_dS", "norm_dS", "p", "motif_number"]))
        file.write("\n")
        for protein in protein_names:
            print(protein)
            motifs = motif_dict[protein]
            #use one of several different methods to generate simulant motifs
            if markov:
                simulants = nc.make_simulants_markov(motifs, n_sim)
            elif new_filters:
                simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True)
            elif newer_filters:
                simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, no_duplicates = True, concat = False, seed = 1)               
            else:
                simulants = nc.make_simulants(motifs, n_sim)
            sim_output_file_name = "{0}/{1}_sim_ds.csv".format(output_folder_name, protein)
            #determine the conservation parameters of the current protein
            output_dict = conservation.dS_from_hits(motifs, alignment_folder_name, input_dict_file_name, n_sim = n_sim, simulants = simulants, sim_output_file_name = sim_output_file_name, method = method)
            print(output_dict)
            print("\n")
            if output_dict is not None:
                file.write(",".join([protein, str(output_dict["dS"]), str(output_dict["mean simulated dS"]), str(output_dict["normalized dS"]), str(output_dict["effective p"]), str(len(motifs))]))
            else:
                file.write(",".join([protein, str(None), str(None), str(None), str(None), str(None)]))
            file.write("\n")
    os.remove(input_dict_file_name)
def main():

    parser = argparse.ArgumentParser(description="Calculate the density of a series of RBP motifs in any type of sequence.")
    parser.add_argument("RBP_file_name", type = str, help = "name of file with RBP motifs")
    parser.add_argument("output_folder_name", type = str, help = "name of folder that will contain analysis results")
    parser.add_argument("output_file_name", type = str, help = "name of file that will contain analysis results")
    parser.add_argument("input_file_name", type = str, help = "name of fasta file with the sequences")
    parser.add_argument("n_sim", type = int, help = "number of simulants")
    parser.add_argument("features_file_name", type = str, help = "name of GTF file")
    parser.add_argument("genome", type = str, help = "genome name")
    parser.add_argument("dataset_name", type = str, help = "dataset name")
    parser.add_argument("families_file_name", type = str, help = "families file name")
    parser.add_argument("--simulants_within", dest = "simulants_within", action = "store_true", help = "Should simulants be generated only from dinucleotides within each particular motif?")
    parser.add_argument("--sequence_control", dest = "sequence_control", action = "store_true", help = "Should shuffled sequences be used as control?")
    parser.add_argument("--remove_stops", dest = "remove_stops", action = "store_true", help = "Should simulant motifs not incldue motifs that contain stop codon sequences? (boolean)")
    parser.add_argument("--markov", dest = "markov", action = "store_true", help = "Should simulants be generated using a Markov model?")
    parser.add_argument("--new_filters", dest = "new_filters", action = "store_true", help = "Should simulants be generated using the old method but capping mononucleotide runs and removing existing motifs?")
    parser.add_argument("--no_concat", dest = "no_concat", action = "store_true", help = "Should a density be calculated for each gene?")
    parser.add_argument("--newer_filters", dest = "newer_filters", action = "store_true", help = "Like new_filters, but also not allowing duplicates in the simulants and without concatenation.")
    parser.add_argument("--two_seqs", dest = "two_seqs", action = "store_true", help = "Set to true if the sequence fasta has two sequences separated by a pipe in each line.")

    args = parser.parse_args()
    [RBP_file_name, output_folder_name, output_file_name, input_file_name, n_sim,
     features_file_name, genome, dataset_name, families_file_name, simulants_within,
     sequence_control, remove_stops, markov, new_filters, no_concat, newer_filters,
     two_seqs] = [args.RBP_file_name, args.output_folder_name, args.output_file_name,
                  args.input_file_name, args.n_sim, args.features_file_name, args.genome,
                  args.dataset_name, args.families_file_name, args.simulants_within,
                  args.sequence_control, args.remove_stops, args.markov, args.new_filters,
                  args.no_concat, args.newer_filters, args.two_seqs]
    make_dir(output_folder_name)

    #if you want to average over families
    if features_file_name != "None":
        fs = Feature_Set(features_file_name, genome)
        fs.set_dataset(dataset_name)
        families = rw.read_families(families_file_name)
        fs.add_families(families)
    else:
        fs = None

    #if concat, sum motif hit base counts across sequences and divide by the total sequence length,
    #otherwise produce a density estimate separately for each sequence and use the median as the final statistic
    if no_concat:
        concat = False
    else:
        concat = True

    #read in RBP motifs
    RBPs, motifs = rw.read_fasta(RBP_file_name)
    with open(output_file_name, "w") as output_file:
        for pos, RBP in enumerate(RBPs):
            curr_motifs = motifs[pos].split("|")
            #if, as control, you want to shuffle the codons within sequences
            if sequence_control:
                #an integer here (rather than a list of simulant motifs) presumably
                #tells nc.get_sequence_set_density how many shuffled controls to use
                current_simulants = 3
                output_suffix = "_sequence_control"
            #if, as control, you want to calculate the density of simulant motifs
            else:
                #generate simulant motifs, applying different sets of filters onto the simulant motifs
                output_suffix = ""
                if simulants_within:
                    current_simulants = nc.make_simulants_within(curr_motifs, n_sim)
                elif markov:
                    current_simulants = nc.make_simulants_markov(curr_motifs, n_sim, remove_stops = remove_stops, remove_existing = True)
                elif new_filters:
                    current_simulants = nc.make_simulants(curr_motifs, n_sim, remove_stops = remove_stops, remove_existing = True, cap_runs = True)
                elif newer_filters:
                    current_simulants = nc.make_simulants(curr_motifs, n_sim, remove_stops = remove_stops, remove_existing = True, cap_runs = True, no_duplicates = True, concat = False)                   
                else:
                    current_simulants = nc.make_simulants(curr_motifs, n_sim, remove_stops = remove_stops)
            #get raw density, normalized density, p, Z... for current RBP
            current_dict = nc.get_sequence_set_density(input_file_name, None, curr_motifs, current_simulants, n_sim,
                                                       "{0}/{1}_{2}_density.csv".format(output_folder_name, RBP, output_suffix),
                                                       "{0}/{1}_{2}_sim_density.csv".format(output_folder_name, RBP, output_suffix),
                                                       "{0}/{1}_{2}_positions.csv".format(output_folder_name, RBP, output_suffix),
                                                       "{0}/{1}_{2}_sim_positions".format(output_folder_name, RBP, output_suffix),
                                                       concat = concat, positions = False, feature_set = fs, two_seqs = two_seqs)
            if concat:
                current_record = [RBP, str(current_dict["density"]), str(np.mean(current_dict["simulated densities"])), str(current_dict["ND"]), str(current_dict["effective p"]), str(current_dict["Z"]), str(current_dict["depletion p"]), str(len(curr_motifs)), str(current_dict["simulant sd"])]
            else:
                current_record = [RBP, str(current_dict["median density"]), str(np.mean(current_dict["simulated densities"])), str(current_dict["median ND"]), str(current_dict["effective p"]), str(current_dict["Z"]), str(current_dict["depletion p"]), str(len(curr_motifs)), str(current_dict["simulant sd"])]
            output_file.write("\t".join(current_record))
            output_file.write("\n")
            print(current_record)
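
# For reference, the tab-separated output columns written above are: RBP,
# density (median density with --no_concat), mean simulated density, ND
# (median ND with --no_concat), effective p, Z, depletion p, motif number
# and simulant sd.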
def main():
    description = "Run INSIGHT on a set of sequences and a set of sites."
    args = parse_arguments(description, ["fasta", "genome", "features_file", "families_file", "suffix", "dataset", "output_folder", "freq_threshold", "n", "hit_file", "control_file", "SNP_file_name_prefix", "CDS_SNP_file_name_prefix", "MSA_file_name_prefix", "trial_file", "trials", "hit_degen_file", "control_degen_file", "hit_reduce", "control_reduce", "new_SNPs", "new_MSA", "shuffle", "nonsyn_hits", "remove_GT", "big_tree"], floats = [7, 18, 19], ints = [8, 15], flags = [20, 21, 22, 23, 24, 25])
    [fasta, genome, features_file, families_file, suffix, dataset, general_output_folder,
     freq_threshold, n, hit_file, control_file, SNP_file_name_prefix, CDS_SNP_file_name_prefix,
     MSA_file_name_prefix, trial_file, trials, hit_degen_file, control_degen_file,
     hit_reduce, control_reduce, new_SNPs, new_MSA, shuffle, nonsyn_hits, remove_GT,
     big_tree] = [args.fasta, args.genome, args.features_file, args.families_file,
     args.suffix, args.dataset, args.output_folder, args.freq_threshold, args.n,
     args.hit_file, args.control_file, args.SNP_file_name_prefix, args.CDS_SNP_file_name_prefix,
     args.MSA_file_name_prefix, args.trial_file, args.trials, args.hit_degen_file,
     args.control_degen_file, args.hit_reduce, args.control_reduce, args.new_SNPs,
     args.new_MSA, args.shuffle, args.nonsyn_hits, args.remove_GT, args.big_tree]
    output_folder = "{0}/{1}_{2}".format(general_output_folder, dataset, suffix)

    names, seqs = rw.read_fasta(fasta)

    #prepare feature set and family information
    fs = Feature_Set(features_file, genome)
    fs.set_dataset(dataset)
    if families_file == "None":
        conservation.find_families(fasta, "general/{0}".format(dataset))
        families_file = "general/{0}_families.txt".format(dataset)
    families = rw.read_families(families_file)
    fs.add_families(families)

    make_dir(output_folder)

    general_folder = "DFE/for_everybody"
    make_dir(general_folder)
    if MSA_file_name_prefix == "None":
        MSA_file_name_prefix = "{0}/{1}_MSA".format(general_folder, dataset)

    #read in degeneracy information
    if hit_degen_file != "None":
        degen_hits = parse_degen(hit_degen_file)
        degen_controls = parse_degen(control_degen_file)
    else:
        degen_hits = None
        degen_controls = None

    #get relevant genome features
    transcripts = fs.get_transcripts()
    CDSs = fs.get_CDS()
    lengths = fs.get_lengths(CDSs, CDS = True)
    #filter out sex chromosomes from the analysis
    sex_chromosomes = ["X", "Y"]
    chrom_dict = {i: transcripts[i][0] for i in transcripts if transcripts[i][0] not in sex_chromosomes}
    chroms = list(set(list(chrom_dict.values())))

    clean_names = ["h**o", "pan", "pongo", "macaca"]

    #if you're running several trials
    #if just one, it'll still make a single trial file
    if trial_file == "None":
        trial_file = "{0}_{1}_{2}.txt".format(trial_file, suffix, trials)
        

    with open(trial_file, "w") as o_file:
        print(suffix)
        #output file header
        o_file.write("rho\teta\tgamma\tDp\tPw\talpha\ttau\trhose\tetase\tgammase\trholl\tetall\tgammall\n")
        for trial in range(trials):
            print("==========TRIAL {0}==========\n".format(trial))


            #get INSIGHT input data as a string based on divergence and SNP data
            hit_output, neutral_output, chroms_to_keep, hit_counts, control_counts = get_MSA(chroms, chrom_dict, control_file, hit_file, CDSs, lengths, names, seqs, clean_names, freq_threshold, dataset, suffix, genome, output_folder, general_folder, n, SNP_file_name_prefix, CDS_SNP_file_name_prefix, MSA_file_name_prefix, new_SNPs, new_MSA, shuffle, remove_GT, big_tree, hit_reduce = hit_reduce, control_reduce = control_reduce,  degen_hits = degen_hits, degen_controls = degen_controls)

            print("Writing output files...")
            neutral_output_file = "{0}/{1}_{2}_{3}_neutral_input.txt".format(output_folder, dataset, suffix, trial)
            hit_output_file = "{0}/{1}_{2}_{3}_hit_input.txt".format(output_folder, dataset, suffix, trial)
            write_output_file(neutral_output_file, neutral_output, n)
            write_output_file(hit_output_file, hit_output, n)

            print("Running INSIGHT...")
            conservation.INSIGHT(neutral_output_file, hit_output_file, freq_threshold, "../Software/INSIGHT", "{0}_{1}".format(dataset, suffix))

            print("Counting positions on chromosomes...")
            with open("{0}/{1}_{2}_pos_per_chrom.csv".format(output_folder, dataset, suffix), "w") as file:
                file.write("chrom\thits\tcontrols\n")
                for chrom in sorted(chroms_to_keep):
                    file.write("{0}\t{1}\t{2}\n".format(chrom, hit_counts[chrom], control_counts[chrom]))

            INSIGHT_output = "../Software/INSIGHT/{0}_{1}.ins.log".format(dataset, suffix)
            #parse the INSIGHT output and do simple significance testing
            try:
                parsed_output = parse_INSIGHT_output(INSIGHT_output)
                estimates = parsed_output["estimates"]
                SE = parsed_output["SEs"]
                lls = parsed_output["chi_sq"]

                print("\n")
                print("Chisq statistics: {0}".format(" ".join([str(i) for i in lls])))
                rho_pL = scipy.stats.chi2.sf(lls[0], 3)
                print("pL(rho): {0}".format(rho_pL))
                eta_pL = scipy.stats.chi2.sf(lls[1], 1)
                print("pL(eta): {0}".format(eta_pL))
                gamma_pL = scipy.stats.chi2.sf(lls[2], 1)
                print("pL(gamma): {0}".format(gamma_pL))
                
                lls = "\t".join([str(i) for i in lls])
                estimates = "\t".join(estimates)
                SE = "\t".join(SE)
                o_file.write(estimates)
                o_file.write("\t")
                o_file.write(SE)
                o_file.write("\t")
                o_file.write(lls)
                o_file.write("\n")
            #skip trials where INSIGHT failed to produce a full output
            except IndexError:
                print("Skipping...")
                pass
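
# Note: each successful trial writes 13 tab-separated fields, matching the header
# above: seven point estimates (rho..tau), three standard errors and the three
# likelihood-ratio statistics.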
def main():

    description = "Construct a site frequency spectrum that only considers motif-disrupting SNPs."
    args = parse_arguments(description, ["fasta", "output_file", "motif_file", "anc_file", "control_file", "SNPs_file", "N", "old_motif_format", "human", "ancestral"], ints = [6], flags = [7, 8, 9])
    [fasta, output_file, motif_file, anc_file, control_file, SNPs_file, N,
     old_motif_format, human, ancestral] = [args.fasta, args.output_file, args.motif_file,
     args.anc_file, args.control_file, args.SNPs_file, args.N, args.old_motif_format,
     args.human, args.ancestral]

    names, seqs = rw.read_fasta(fasta)

    #I use two different formats for storing sequence motifs,
    #so we need to know which one it is
    if old_motif_format:
        motifs = rw.read_names(motif_file)[1:]
        print(len(motifs))
    else:
        motifs = rw.read_motifs(motif_file)
        motifs = sorted(list(set(flatten(list(motifs.values())))))

    #get the lengths of the motifs and compile lookahead regexes
    #that recognize the whole motif but only store the position of the first base
    #these will be needed when searching for the motifs
    motif_lengths = [len(i) for i in motifs]
    motif_regex = nc.motif_to_regex(motifs)

    #I'm gonna treat CG and GC as two 2-bp motifs and use the same code as when
    #searching for, say, ESE motifs
    CG_2mers = ["CG", "GC"]
    CG_lengths = [2, 2]
    CG_regex = nc.motif_to_regex(CG_2mers)

    motifs = [list(i) for i in motifs]

    if ancestral:
        anc_pos = rw.read_pos(anc_file)

    #read in hit and control positions
    controls = rw.read_pos(control_file)
    hit_file = re.sub("controls", "hits", control_file)
    hits = rw.read_pos(hit_file)

    #read in SNP data
    SNPs = rw.read_many_fields(SNPs_file, "\t")
    #the field at index 2 in the SNPs file contains positions that need to be discarded from analysis because they contain unanalyzable SNP data
    to_remove = list_to_dict(SNPs, 0, 2)
    to_remove = {i: to_remove[i].split(",") for i in to_remove}
    to_remove = {i: [int(j) for j in to_remove[i] if j not in ["error", ""]] for i in to_remove}
    SNPs = list_to_dict(SNPs, 0, 1)

    #all the SNPs associated to a transcript
    full_SNPs = {}
    #disruptive SNPs only
    clean_SNPs = {}
    minor_alleles = {}

    #the number of hit positions where, say, a T could theoretically substitute to an A (i.e. all T positions)
    transitions_total = {i: {j: 0 for j in nc._canon_bases_} for i in nc._canon_bases_}
    #the same as above but only counting those substitutions that would turn a motif into a non-motif
    transitions_disr = {i: {j: 0 for j in nc._canon_bases_} for i in nc._canon_bases_}

    #this block of code filters the true SNPs to only leave those that are disruptive
    #and also calculates the probability of being disruptive for all potential SNPs
    with open("{0}_degen.txt".format(hit_file), "w") as hit_degen_file:
        counter = 0
        for trans in names:
            counter = update_counter(counter, 1000)
            if trans in controls:
                if trans in SNPs:
                    trans_SNPs = SNPs[trans]
                else:
                    trans_SNPs = []
                trans_SNPs, clean_SNPs, full_SNPs, minor_alleles = parse_SNPs(trans_SNPs, clean_SNPs, full_SNPs, minor_alleles, trans)
                current_seq = seqs[names.index(trans)]
                fourfold_pos = nc.get_4fold_deg(current_seq)
                #CpG filtering
                if human:
                    CG_pos = nc.get_motif_set_density(CG_regex, CG_lengths, current_seq, concat = True)["positions"]
                    fourfold_pos = [i for i in fourfold_pos if i not in CG_pos]
                if ancestral:
                    fourfold_pos = [i for i in fourfold_pos if i not in anc_pos[trans]]
                all_sites, clean_SNPs, transitions_total, transitions_disr, hit_degen_file = check_disruption(motif_regex, current_seq, motifs, motif_lengths, fourfold_pos, full_SNPs, clean_SNPs, minor_alleles, trans, transitions_total, transitions_disr, hit_degen_file, to_remove)
                hit_degen_file.write("\n")

    to_remove = {i: [j for j in to_remove[i] if j not in full_SNPs[i]] for i in to_remove if i in controls}

    hit_SFS = get_SFS(hits, clean_SNPs, to_remove, N)

    transitions = get_transitions(transitions_disr, transitions_total)
    print(transitions)

    #this block randomly assigns certain SNPs at simulant positions to be disruptive,
    #with the probability of that happening proportional to the frequency with which potential substitutions
    #of that nucleotide composition would be disruptive for true (motif) sites
    with open("{0}_degen.txt".format(control_file), "w") as control_degen_file:
        control_SNPs = {}
        counter = 0
        for trans in controls:
            control_degen_file.write("{0}\t".format(trans))
            counter = update_counter(counter, 1000)
            control_SNPs[trans] = {}
            trans_SNPs = full_SNPs[trans]
            current_seq = seqs[names.index(trans)]
            for site in controls[trans]:
                if trans not in to_remove or site not in to_remove[trans]:
                    ref_allele = current_seq[site]
                    disrupt_bases = get_disrupt_bases(ref_allele, transitions)
                    control_degen_file.write("{0}:{1},".format(site, "|".join(disrupt_bases)))
                    if site in trans_SNPs:
                        minor_allele = minor_alleles[trans][site]
                        if minor_allele in disrupt_bases:
                            control_SNPs[trans][site] = trans_SNPs[site]
            control_degen_file.write("\n")

    control_SFS = get_SFS(controls, control_SNPs, to_remove, N)

    with open(output_file, "w") as file:
        file.write("{0}\n".format(N))
        file.write(" ".join([str(i) for i in hit_SFS]))
        file.write("\n")
        file.write(" ".join([str(i) for i in control_SFS]))
        file.write("\n")    
def main():

    description = "Calculate the conservation of k-mers that are a single point mutation away from being part of a set of motifs."
    args = parse_arguments(description, ["motifs_file_name", "summary_file_name", "output_folder_name", "p_column", "alignment_folder_name", "correspondances_file_name", "output_file_name", "dataset_name", "features_file_name", "n_sim", "output_suffix", "sequences_file_name", "families_file_name", "genome", "by_RBP"], ints = [3, 9], flags = [14])
    [motifs_file_name, summary_file_name, output_folder_name, p_column, alignment_folder_name,
     correspondances_file_name, output_file_name, dataset_name, features_file_name, n_sim,
     output_suffix, sequences_file_name, families_file_name, genome, by_RBP] = [args.motifs_file_name,
     args.summary_file_name, args.output_folder_name, args.p_column, args.alignment_folder_name,
     args.correspondances_file_name, args.output_file_name, args.dataset_name,
     args.features_file_name, args.n_sim, args.output_suffix, args.sequences_file_name,
     args.families_file_name, args.genome, args.by_RBP]

    RBPs = rw.read_motifs(motifs_file_name)

    #only leave those RBPs that pass information content criteria
    validity = rw.read_many_fields("{0}/sufficient_information_fraction05.csv".format(output_folder_name), "\t")
    validity = list_to_dict(validity, 0, 1)
    RBPs = {i: RBPs[i] for i in RBPs if validity[i] == "True"}

    #if you're not doing this by RBP, pool motifs from the most significantly depleted sets
    if not by_RBP:
        summary_data = rw.read_many_fields(summary_file_name, "\t")
        if len(summary_data[0]) == 1:
            summary_data = rw.read_many_fields(summary_file_name, ",")    
        summary_dict = list_to_dict(summary_data, 0, p_column, floatify = True)            
        RBPs = {i: RBPs[i] for i in RBPs if summary_dict[i] > 0.9}
        motifs = list(set(flatten(list(RBPs.values()))))
        RBPs = {"all": motifs}

    #randomly pick one gene from each paralogous family
    fs = Feature_Set(features_file_name, genome)
    fs.set_dataset(dataset_name)
    transcripts = fs.get_transcripts()
    families = rw.read_families(families_file_name)
    families = fs.convert_families_to_ENST(families, transcripts)
    fs.add_families(families)
    picked_from_families = fs.pick_random_members()
    gene_name_dict = fs.get_gene_name_dict(transcripts)
    picked = [fs.convert_between_ENST_and_ENSG(i, gene_name_dict, "ENSG") for i in picked_from_families]

    names, CDS = rw.read_fasta(sequences_file_name)

    #make a dictionary where the keys are genes from the focal species and the values are orthologs from another species
    correspondances = rw.read_many_fields(correspondances_file_name, ",")
    correspondance_dict = {}
    for i in correspondances:
        correspondance_dict[i[0]] = i[1]

    output_dict = {}

    #loop over the RBPs
    for protein in sorted(RBPs):

        #fetch the current motifs
        print(protein)
        motifs = RBPs[protein]
        print("There are {0} motifs.".format(len(motifs)))
        #generate all unique motifs that are a single base substitution away from one of the motifs but are not actually in the set
        neighbours = nc.get_neighbours(motifs)
        print("There are {0} neighbours.".format(len(neighbours)))            

        #make simulants for the motifs. don't allow simulants to be part of the set of neighbours.
        simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, exclude = neighbours, no_duplicates = True, concat = False)

        neighbour_lengths = [len(i) for i in neighbours]        
        neighbours = nc.motif_to_regex(neighbours)

        #determine the true frequency at which fourfold degenerate sites that are a single substitution away from a motif in human actually contain the base that
        #would give rise to the motif in the orthologous species
        site_number = 0
        mutation_score = 0
        motifs = [list(i) for i in motifs]
        true_result = run_in_parallel(picked, ["foo", correspondance_dict, alignment_folder_name, CDS, names, motifs, neighbours, neighbour_lengths], get_mutation_to_motif) 
        for i in true_result:
            current = i.get()
            site_number = site_number + current[0]
            mutation_score = mutation_score + current[1]
        if site_number > 0:
            real_fraction = mutation_score/site_number
        else:
            real_fraction = None
        print("Real fraction:")
        print(real_fraction)

        neighbours = ""      
        sim_site_numbers = np.zeros((n_sim))
        sim_mutation_scores = np.zeros((n_sim))

        #obtain this estimate also for each simulant set
        #I'm doing this in this awkward manner because I don't have enough RAM to hold all the simulated neighbours in memory at once
        for sim in range(n_sim):
            if sim%10 == 0:
                print(sim)
            current_simulants = simulants[sim]
            current_neighbours = nc.get_neighbours(current_simulants)
            current_neighbour_lengths = [len(i) for i in current_neighbours]        
            current_neighbours = nc.motif_to_regex(current_neighbours)
            current_simulants = [list(i) for i in current_simulants]
            current_result = run_in_parallel(picked, ["foo", correspondance_dict, alignment_folder_name, CDS, names, current_simulants, current_neighbours, current_neighbour_lengths], get_mutation_to_motif)
            for i in current_result:
                current = i.get()
                sim_site_numbers[sim] = sim_site_numbers[sim] + current[0]
                sim_mutation_scores[sim] = sim_mutation_scores[sim] + current[1]

        #normalize the real fraction, calculate p
        sim_fractions = np.divide(sim_mutation_scores, sim_site_numbers)
        sim_fractions = [i for i in sim_fractions if i != np.inf]
        p = ms.calc_eff_p(real_fraction, sim_fractions, greater = False)
        norm_fraction = ms.normalize(real_fraction, sim_fractions) 

        output_dict[protein] = [protein, mutation_score, site_number, real_fraction, np.mean(sim_fractions), p, norm_fraction]
        print(output_dict[protein])
        
    with open(output_file_name, "w") as output_file:
        #write header to output file
        output_file.write("protein\tmutation score\tsite number\treal fraction\tmean sim fraction\tp\tnormalized fraction\n")
        #write the rest of the output data
        for protein in sorted(list(output_dict.keys())):
            to_write = output_dict[protein]
            to_write = [str(i) for i in to_write]
            output_file.write("\t".join(to_write))
            output_file.write("\n")
def get_CpG_dicts(CDSs, chroms, MSA_file_name_prefix, lengths, clean_names, phylip_data, fasta, anc_CG_file_name, high_CG_file_name, fs, macaque_anc = False, pseudoCG = False, comprehensive = False, subst_model = None, return_tuples = False, regions = False):
    '''
    Get two dictionaries, one that says for each transcript which positions are CpG/GpC in macaque
    and one which positions were likely CpG/GpC in the human-macaque ancestor.
    '''
    names, seqs = rw.read_fasta(fasta)
    #if you're gonna determine ancestral CpG positions from scratch rather than reading them in from an existing file
    #if you want to have the name of the file determined automatically
    if (not anc_CG_file_name) or (anc_CG_file_name == "None"):
        new_CG = True
        phy_file = "temp_data/temp_anc_CG{0}.txt".format(random.random())
    #if you want to give the file a name yourself
    elif not os.path.exists(anc_CG_file_name):
        new_CG = True
    else:
        new_CG = False

    if new_CG:
        print("Will get new CpG data...")
        if len(phylip_data) < 8 and comprehensive:
            print("Comprehensive CpG filtering only in big tree mode!")
            raise Exception
        #if you want to pretend some other dinucleotides are CpG
        if pseudoCG:
            CG_kmers = [r"C[\-]*T", r"A[\-]*G"]
        #the hyphens are there in case the two nucleotides are separated by an indel
        else:
            CG_kmers = [r"C[\-]*G", r"G[\-]*C"]
        CG_kmers = [re.compile(i) for i in CG_kmers]
        macaque_CG_dict = {}

        anc_CG_concat_full = [[[""]], [[""]]]
        tuples_mapping_dict_full = {}

        for chrom in chroms:

            print(chrom)

            #only leave those CDSs that are on the current chromosome
            current_CDSs = {i: CDSs[i] for i in CDSs if CDSs[i][0][0][0] == chrom}
            coords_file = "temp_data/coords_file{0}.txt".format(random.random())

            #check if the MSA is already at the specified location, otherwise retrieve it
            MSA_file = "{0}_{1}.txt".format(MSA_file_name_prefix, chrom)
            if not os.path.isfile(MSA_file):
                print("Obtaining MSA...")
                eo.get_MSA_gene_list(current_CDSs, coords_file, "EPO", "primates", 85, "homo_sapiens", MSA_file)
                os.remove(coords_file)
                eo.flush_tables("localhost", "mysql", "fackel")
            MSA_raw = eo.parse_MSA_output(MSA_file)
            if high_CG_file_name != "None":
                high_CG = rw.read_many_fields(high_CG_file_name, "\t")
                high_CG = {i[0]: [int(j) for j in i[1:]] for i in high_CG}
            else:
                high_CG = None
            #get concatenated sequences (for determining ancestral CpG positions) and macaque CpG information for this chromosome
            anc_CG_concat, macaque_CG_dict, tuples_mapping_dict = get_CpG_dicts_core(MSA_raw, lengths, phylip_data, CG_kmers, macaque_anc, macaque_CG_dict, high_CG, comprehensive = comprehensive, subst_model = subst_model)
            remove_file(coords_file)
            #add that information to the global dictionaries
            anc_CG_concat_full, tuples_mapping_dict_full = update_anc_CG(anc_CG_concat_full, anc_CG_concat, tuples_mapping_dict_full, tuples_mapping_dict)
            
        phy_files = write_anc_CG(anc_CG_concat_full, anc_CG_file_name, clean_names, macaque_CG_dict)
        pp_file = anc_CG_file_name

    else:
        print("Will read in existing CpG data...")
        pp_file = None
        phy_files = "None"
        high_CG = None
        tuples_mapping_dict_full = None
        macaque_CG_file_name = "{0}_macaque.txt".format(anc_CG_file_name[:-4])
        macaque_CG_dict = rw.read_many_fields(macaque_CG_file_name, "\t")
        macaque_CG_dict = [i for i in macaque_CG_dict if len(i) == 2]
        macaque_CG_dict = list_to_dict(macaque_CG_dict, 0, 1)
        macaque_CG_dict = {i: [int(j) for j in macaque_CG_dict[i].split(",") if j != ""] for i in macaque_CG_dict}
    anc_CG_dict = get_ancestral_CG(pp_file, subst_model, phy_files, "DFE/UCSC_model.mod", tuples_mapping_dict_full, anc_CG_file_name, high_CG = high_CG, macaque = macaque_anc, comprehensive = comprehensive)
    #phy_files is the string "None" when existing CpG data was read in above
    if phy_files != "None":
        [remove_file(i) for i in phy_files]
    #if you're looking at exon cores/flanks rather than full CDSs
    if regions:
        #you need to have matching bed/fasta files for this to work (with the records in the same order)
        bed = fasta.replace("fasta", "bed")
        transcripts = fs.get_transcripts()
        #for each flank/core, figure out what positions it covers in the full CDS
        mapping_dict = conservation.map_regions_to_CDS(fasta, bed, fs, transcripts, CDSs, trans_ids = True)
        anc_CG_dict = region_CpG(mapping_dict, anc_CG_dict)
    if return_tuples:
        return(anc_CG_dict, macaque_CG_dict, tuples_mapping_dict_full)
    else:
        return(anc_CG_dict, macaque_CG_dict)
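
# A hedged usage sketch (all file names are placeholders; fs is a Feature_Set
# built as in the scripts above):
#   anc_CG_dict, macaque_CG_dict = get_CpG_dicts(
#       CDSs, chroms, "DFE/for_everybody/hs_MSA", lengths, clean_names,
#       phylip_data, "hs_CDS.fasta", "hs_anc_CG.txt", "None", fs,
#       macaque_anc = True)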