Code example #1
 def test_sort_bed(self):
     infile = "tests/sort_bed_input.bed"
     expected_file = "tests/sort_bed_expected.bed"
     observed_file = "tests/sort_bed_observed.bed"
     hk.remove_file(observed_file)
     sort_bed(infile, observed_file)
     expected = rw.read_many_fields(expected_file, "\t")
     observed = rw.read_many_fields(observed_file, "\t")
     self.assertEqual(expected, observed)
Code example #2
 def test_intersect_bed_no_dups(self):
     A_file = "tests/intersect_bed_no_dups_input_A.bed"
     B_file = "tests/intersect_bed_no_dups_input_B.bed"
     expected_file = "tests/intersect_bed_no_dups_expected.bed"
     observed_file = "tests/intersect_bed_no_dups_observed.bed"
     hk.remove_file(observed_file)
     intersect_bed(A_file, B_file, output_file=observed_file, no_dups=True)
     expected = rw.read_many_fields(expected_file, "\t")
     observed = rw.read_many_fields(observed_file, "\t")
     self.assertEqual(expected, observed)
Code example #3
def main():
    parser = argparse.ArgumentParser(description="Calculate the conservation of a series of RBP motifs in exon cores and flanks.")
    parser.add_argument("features_file_name", type = str, help = "name of GTF file with genome features")
    parser.add_argument("dataset_name", type = str, help = "dataset name")
    parser.add_argument("genome", type = str, help = "genome assembly name")
    parser.add_argument("RBP_file_name", type = str, help = "name of file with RBP motifs")
    parser.add_argument("correspondances_file_name", type = str, help = "name of file with correspondances between genes in dataset and orthologs")
    parser.add_argument("fasta_file_prefix", type = str, help = "prefix for fasta files with the sequences")
    parser.add_argument("output_file_name", type = str, help = "file for output data")
    parser.add_argument("output_folder_name", type = str, help = "folder that will contain simulated dS scores")
    parser.add_argument("alignment_folder_name", type = str, help = "name of folder that contains alignments")
    parser.add_argument("n_sim", type = int, help = "number of simulants")
    parser.add_argument("--markov", dest = "markov", action = "store_true", help = "Should simulants be generated using a Markov model?")
    parser.add_argument("--new_filters", dest = "new_filters", action = "store_true", help = "Should simulants be generated using the sampling method but removing existing motifs and capping mononucleotide runs?")
    parser.add_argument("--goldman_yang", dest = "goldman_yang", action = "store_true", help = "Should the Goldman & Yang method (rather tahn Yang & Nielsen) be used for calculating dS?")
    parser.add_argument("--validity", dest = "validity", action = "store_true", help = "Should RBPs be filtered based on information content?")

    args = parser.parse_args()
    [features_file_name, dataset_name, genome, RBP_file_name, correspondances_file_name, output_folder_name, fasta_file_prefix, output_file_name, output_folder_name, alignment_folder_name, n_sim, markov, new_filters, goldman_yang, validity] = [args.features_file_name, args.dataset_name, args.genome, args.RBP_file_name, args.correspondances_file_name, args.output_folder_name, args.fasta_file_prefix, args.output_file_name, args.output_folder_name, args.alignment_folder_name, args.n_sim, args.markov, args.new_filters, args.goldman_yang, args.validity]   

    #make dictionary with RBPs as keys and lists of associated motifs as values
    motif_dict = rw.read_motifs(RBP_file_name)

    uf_fasta = "{0}_uf.fasta".format(fasta_file_prefix)
    df_fasta = "{0}_df.fasta".format(fasta_file_prefix)
    c_fasta = "{0}_c.fasta".format(fasta_file_prefix)

    fs = Feature_Set(features_file_name, genome)
    fs.set_dataset(dataset_name)
    transcripts = fs.get_transcripts()
    gene_name_dict = fs.get_gene_name_dict(transcripts)
    CDS = fs.get_CDS()

    #prepare the dictionary that is going to be necessary for mapping between exonic subregions and full CDSs
    regions_dict = {}
    regions_dict["gene name dict"] = gene_name_dict
    regions_dict["CDS"] = CDS
    regions_bed_file_names = [i[:-6] + ".bed" for i in [uf_fasta, c_fasta, df_fasta]]
    regions_dict["regions bed file"] = regions_bed_file_names
    regions_dict["fastas"] = [uf_fasta, c_fasta, df_fasta]

    #leave only those RBPs that pass the information content cutoff
    if validity:
        validity_5 = rw.read_many_fields("{0}/sufficient_information_fraction05_fiveprime.csv".format(output_folder_name), "\t")
        validity_core = rw.read_many_fields("{0}/sufficient_information_fraction05_core.csv".format(output_folder_name), "\t")
        validity_3 = rw.read_many_fields("{0}/sufficient_information_fraction05_threeprime.csv".format(output_folder_name), "\t")
        validity_5 = list_to_dict(validity_5, 0, 1)
        validity_core = list_to_dict(validity_core, 0, 1)
        validity_3 = list_to_dict(validity_3, 0, 1)
        protein_names = sorted([name for name in list(motif_dict.keys()) if validity_5[name] == "True" and validity_3[name] == "True" and validity_core[name] == "True"])
    else:
        protein_names = sorted(list(motif_dict.keys())) 

    #run conservation analysis
    do_dS_calc(protein_names, motif_dict, uf_fasta, df_fasta, c_fasta, n_sim, output_folder_name, correspondances_file_name, alignment_folder_name, output_file_name, regions_dict, markov, new_filters, goldman_yang)
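To make the validity filter above concrete, here is a minimal sketch of the data structures it assumes; the RBP names and motifs are hypothetical, and only the two-column name/True-False layout of the sufficient_information files is implied by the code:

#hypothetical validity tables, as they look after list_to_dict(rows, 0, 1)
validity_5 = {"RBP_A": "True", "RBP_B": "False"}
validity_core = {"RBP_A": "True", "RBP_B": "True"}
validity_3 = {"RBP_A": "True", "RBP_B": "True"}
motif_dict = {"RBP_A": ["GGAGGA"], "RBP_B": ["TAGGGT"]}

#an RBP is kept only if it passes the information content cutoff in all three exon regions
protein_names = sorted([name for name in motif_dict if validity_5[name] == "True"
                        and validity_core[name] == "True" and validity_3[name] == "True"])
#protein_names == ["RBP_A"]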
Code example #4
 def test_intersect_bed_hit_count_unsorted(self):
     A_file = "tests/intersect_bed_hit_count_unsorted_input_A.bed"
     B_file = "tests/intersect_bed_hit_count_unsorted_input_B.bed"
     expected_file = "tests/intersect_bed_hit_count_unsorted_expected.bed"
     observed_file = "tests/intersect_bed_hit_count_unsorted_observed.bed"
     hk.remove_file(observed_file)
     intersect_bed(A_file,
                   B_file,
                   output_file=observed_file,
                   hit_count=True,
                   no_dups=False)
     expected = rw.read_many_fields(expected_file, "\t")
     observed = rw.read_many_fields(observed_file, "\t")
     self.assertEqual(expected, observed)
Code example #5
def get_pp(outroot, subst_model, phy_file, model_file, separate_to_concat_mapping, combined_dict, tuples_mapping, min_inf = None, parse_output = True):
    '''
    Get prior probabilities for all the bases at the different sites in an MSA. Note that for phyloFit these
    are posterior probabilities but they are priors for INSIGHT.
    '''
    #you don't want to compute a tree, just get the posterior probabilities for an existing tree
    #hence all the flags from --post-probs onwards
    arguments = ["phyloFit", "--init-model", model_file, "--out-root", outroot, "--subst-mod", subst_model,
                           "--msa-format", "PHYLIP", "--post-probs", "--scale-only", "--no-rates", "--no-freqs", phy_file]
    if min_inf:
        arguments.extend(["-I", min_inf])
    results = run_process(arguments)
    #parse into convenient dictionary
    if parse_output:
        pp_file = "{0}.postprob".format(outroot)
        pp = rw.read_many_fields(pp_file, " ")
        pp = [[j for j in i if j] for i in pp]
        #the outgroup nodes are labelled from the inside out, starting from 1
        pp = {i[1]: i[-4:] for i in pp}
        pp_final = {}
        #map from coordinates in the concatenated alignment to positions in individual CDSs
        for trans in separate_to_concat_mapping:
            pp_final[trans] = {}
            for position in combined_dict[trans]:
                pp_final[trans][position] = pp[tuples_mapping[separate_to_concat_mapping[trans][position]]]
        return(pp_final)
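As an illustration of the final mapping step, here is a toy version of the dictionaries involved (all names and values are hypothetical; only the nesting is taken from the loop above): pp is keyed by alignment column labels, tuples_mapping translates coordinates in the concatenated alignment into those labels, and separate_to_concat_mapping translates per-CDS positions into concatenated coordinates.

#toy inputs with hypothetical values
pp = {"5": ["0.1", "0.2", "0.3", "0.4"]}                #column label -> per-base probabilities
tuples_mapping = {102: "5"}                             #concatenated coordinate -> column label
separate_to_concat_mapping = {"ENST_A": {0: 102}}       #CDS position -> concatenated coordinate
combined_dict = {"ENST_A": [0]}                         #positions of interest per CDS

pp_final = {}
for trans in separate_to_concat_mapping:
    pp_final[trans] = {}
    for position in combined_dict[trans]:
        pp_final[trans][position] = pp[tuples_mapping[separate_to_concat_mapping[trans][position]]]
#pp_final == {"ENST_A": {0: ["0.1", "0.2", "0.3", "0.4"]}}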
Code example #6
def main():

    description = "Record the distribution of peaks for different exons."
    args = hk.parse_arguments(description, ["peaks_file", "gtf", "exon_starts_file", "output_file", "reads_file", "from_end", "intronic", "limit", "nts_before_start", "noncoding", "reads_mode"], flags = [5, 6, 9, 10], ints = [7, 8])
    peaks_file, gtf, exon_starts_file, output_file, reads_file, from_end, intronic, limit, nts_before_start, noncoding, reads_mode = args.peaks_file, args.gtf, args.exon_starts_file, args.output_file, args.reads_file, args.from_end, args.intronic, args.limit, args.nts_before_start, args.noncoding, args.reads_mode

    if noncoding:
        exons = rw.read_gtf(gtf, "exon", gene=False)
    else:
        exons = rw.read_gtf(gtf, "CDS", gene=False)

    # the 3' ss that will be analyzed
    valid_junctions = rw.read_many_fields(exon_starts_file, "\t")
    # pull out the column with transcript IDs
    valid_junctions = [i[3] for i in valid_junctions]

    lengths_dict = co.get_lengths(exons, valid_junctions, intronic=intronic)
    if nts_before_start:
        lengths_dict = {i: lengths_dict[i] + nts_before_start for i in lengths_dict}

    coverage_file_name = "{0}_{1}_coverage.bed".format(exon_starts_file[:-4], reads_file.split("/")[-1][:-4])
    co.get_coverage(exon_starts_file, reads_file, coverage_file_name)

    peak_distances_all, peak_centres = co.peak_pos_in_exon(exon_starts_file, peaks_file, from_end = from_end, reads_mode = reads_mode)

    write_dist_mat(peak_distances_all, limit, output_file, lengths_dict, "{0}_intron_names.txt".format(output_file[:-4]), None)

    write_dist_mat(peak_centres, limit, "{0}_centres.txt".format(output_file[:-4]), lengths_dict,
                   "{0}_centres_intron_names.txt".format(output_file[:-4]), None)
Code example #7
def get_data(file):
    '''
    Read in polymorphism data from an INSIGHT input file.
    '''
    data = rw.read_many_fields(file, "\t")
    data = [i for i in data if i[0] == "site"]
    return (data)
Code example #8
def parse_basinhoppin_pos(file):
    '''
    Parse hit/control positions.
    '''
    #not used in main() in the present script but imported into other scripts
    #this is ugly, this function needs to move
    positions = rw.read_many_fields(file, "\t")
    positions = [[i[0], [int(j) for j in i[1].split(",") if j]] for i in positions]
    positions = list_to_dict(positions, 0, 1)
    return(positions)
Code example #9
def parse_pos(file):
    '''
    Parse a hits/controls positions file.
    '''
    pos_list = rw.read_many_fields(file, "\t")
    pos_dict = list_to_dict(pos_list, 0, 1)
    pos_dict = {
        i: [int(j) for j in pos_dict[i].split(",") if j != ""]
        for i in pos_dict
    }
    return (pos_dict)
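The positions file itself is not shown in these examples; from the parsing above, each tab-separated line appears to hold an identifier followed by a comma-separated list of integer positions. A hypothetical illustration (file name and contents made up):

#hits_positions.txt (hypothetical):
#ENST_A    4,17,203
#ENST_B    9,88
pos_dict = parse_pos("hits_positions.txt")
#pos_dict == {"ENST_A": [4, 17, 203], "ENST_B": [9, 88]}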
Code example #10
def parse_degen(file_name):
    '''
    Parse a degeneracy file into a nice dictionary with transcript IDs as keys.
    '''
    degen = rw.read_many_fields(file_name, "\t")
    degen = list_to_dict(degen, 0, 1)
    degen = {i: degen[i].split(",") for i in degen}
    for trans in degen:
        separate = [i.split(":") for i in degen[trans]]
        separate = [i for i in separate if len(i) == 2]
        degen[trans] = {int(i[0]): i[1].split("|") for i in separate}
    return(degen)
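Judging from the parsing above, each line of the degeneracy file holds a transcript ID and a comma-separated list of position:base|base entries. A hypothetical example:

#degeneracy.txt (hypothetical):
#ENST_A    0:A|G,12:C|T,31:A|C|G|T
degen = parse_degen("degeneracy.txt")
#degen == {"ENST_A": {0: ["A", "G"], 12: ["C", "T"], 31: ["A", "C", "G", "T"]}}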
Code example #11
def get_mean_freq(SFS_file):
    '''
    Use a Mann-Whitney U-test to compare MAFs in hits vs controls.
    '''
    SFS = rw.read_many_fields(SFS_file, " ")
    n = int(SFS[0][0])
    hit_freqs = flatten([[i / n for j in range(int(SFS[1][i]))]
                         for i in range(1, len(SFS[1]))])
    control_freqs = flatten([[i / n for j in range(int(SFS[2][i]))]
                             for i in range(1, len(SFS[2]))])
    mwu = scipy.stats.mannwhitneyu(control_freqs, hit_freqs)
    hit_median = np.median(hit_freqs)
    control_median = np.median(control_freqs)
    print("Median MAF in hits: {0}.".format(hit_median))
    print("Median MAF in controls: {0}.".format(control_median))
    print("MWU p: {0}.".format(mwu[1]))
    print("Difference: {0}.\n".format(hit_median - control_median))
    return ([hit_freqs, control_freqs], hit_median - control_median)
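The indexing above implies the layout of the space-separated SFS file: the first field of the first row is the sample size n, the second row holds hit-site counts per derived-allele count class, and the third row the control counts (class 0 is skipped). A toy example with made-up numbers:

#example_SFS.txt (hypothetical):
#10 500
#0 30 12 5
#0 40 20 9
#with n = 10, the hit MAFs expand to 30 copies of 0.1, 12 of 0.2 and 5 of 0.3,
#and similarly for the controls, before the Mann-Whitney U comparison
freqs, median_diff = get_mean_freq("example_SFS.txt")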
Code example #12
def intersect_bed(bed_file1, bed_file2, use_bedops = False, overlap = False, overlap_rec = False, write_both = False, sort = False, output_file = None,
                             force_strand = False, force_opposite_strand = False, no_name_check = False, no_dups = True, chrom = None, intersect = False, hit_count = False, bed_path = None, intersect_bam=None,
                  write_zero = False, write_bed = False, exclude = False):
    '''Use bedtools/bedops to intersect coordinates from two bed files.
    Return those lines in bed file 1 that overlap with intervals in bed file 2.
    OPTIONS
    output_file: write output to this file
    use_bedops: use bedops rather than bedtools. Certain options are only valid with one of the two, see below.
    overlap: minimum overlap required as a fraction of the intervals in bed file 1 (EX: 0.8 means that the
    overlap has to be at least 80% of the interval in bed file 1).
    overlap_rec: require that the overlap as a fraction of the intervals in file 2 be at least as high as
    the threshold indicated in -f.
    write_both: if True, return not only the interval from bed file 1 but, tagged onto the end, also the
    interval from bed file 2 that it overlaps (only valid when using bedtools).
    exclude: if True, report intervals that DON'T overlap
    sort: sort bed files before taking the intersection
    force_strand: check that the feature and the bed interval are on the same strand (only valid with bedtools)
    force_opposite_strand: if True, check that the feature and the interval are on OPPOSITE strands
    no_name_check: if set to False, checks whether the chromosome names are the same in the two bed files (only valid with bedtools)
    no_dups: if True, only returns each interval once. If set to false, intervals in bed file 1 that overlap several intervals in
    bed file 2 will be returned several times (as many times as there are overlaps with different elements in bed file 2)
    chrom: limit search to a specific chromosome (only valid with bedops, can help in terms of efficiency)
    intersect: rather than returning the entire interval, only return the part of the interval that overlaps an interval in bed file 2.
    hit_count: for each element in bed file 1, return the number of elements it overlaps in bed file 2 (only valid with bedtools)
    intersect_bam: intersect a bam file with a bed file. Requires bam file to be called first
    write_zero: like write_both but also write A intervals that don't overlap with any B intervals.
    write_bed: when intersecting a bam file, write output as bed.'''
    if force_strand and force_opposite_strand:
        raise Exception("force_strand and force_opposite_strand can't both be True")
    hk.make_dir("temp_data/")
    temp_file_name = "temp_data/temp_bed_file{0}.bed".format(random.random())
    #have it write the output to a temporary file
    if use_bedops:
        bedtools_output = run_bedops(bed_file1, bed_file2, force_strand, force_opposite_strand, write_both, chrom, overlap, sort, output_file = temp_file_name, intersect = intersect, hit_number = hit_count, no_dups = no_dups, intersect_bam = intersect_bam, overlap_rec = overlap_rec)
    else:
        bedtools_output = run_bedtools(bed_file1, bed_file2, force_strand, force_opposite_strand, write_both, chrom, overlap, sort, no_name_check, no_dups, output_file = temp_file_name, intersect = intersect, hit_number = hit_count, bed_path = bed_path, intersect_bam = intersect_bam, write_zero = write_zero, overlap_rec = overlap_rec, write_bed = write_bed, exclude = exclude)
    #move it to a permanent location only if you want to keep it
    if output_file:
        hk.run_process(["mv", temp_file_name, output_file])
    else:
        bedtools_output = rw.read_many_fields(temp_file_name, "\t")
    hk.remove_file(temp_file_name)
    return(bedtools_output)
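For orientation, two typical calls and, roughly, the bedtools behaviour they request; the exact command line is assembled inside run_bedtools/run_bedops, which are not shown here, and the file names are placeholders:

#count, for each interval in A.bed, how many intervals in B.bed it overlaps and
#write the result to a file (roughly bedtools intersect -a A.bed -b B.bed -c)
intersect_bed("A.bed", "B.bed", output_file="A_vs_B_counts.bed",
              hit_count=True, no_dups=False)

#return only the overlapping fragments, requiring matching strands,
#as a list of fields rather than writing to a file
overlaps = intersect_bed("A.bed", "B.bed", intersect=True, force_strand=True)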
Code example #13
def main():

    description = "Calculate the combined density of a set of motif sets."
    args = parse_arguments(description, ["motifs_file_name", "summary_file_name", "dataset_name", "correspondances_file_name", "alignment_folder_name", "output_folder_name", "output_file_name", "n_sim", "features_file_name", "genome", "families_file_name", "fasta_name", "ND_column", "output_suffix", "validity_folder_name", "negative_ND", "new_filters", "upper_quarter", "lower_quarter", "full_set", "gene_families", "newer_filters", "baseml"], ints = [7, 12], flags = [15, 16, 17, 18, 19, 20, 21, 22])
    [motifs_file_name, summary_file_name, dataset_name,  correspondances_file_name, alignment_folder_name, output_folder_name, output_file_name, n_sim, features_file_name, genome, families_file_name, fasta_name, ND_column, output_suffix, validity_folder_name, negative_ND, new_filters, upper_quarter, lower_quarter, full_set, gene_families, newer_filters, baseml] = [args.motifs_file_name, args.summary_file_name, args.dataset_name,  args.correspondances_file_name, args.alignment_folder_name, args.output_folder_name, args.output_file_name, args.n_sim, args.features_file_name, args.genome, args.families_file_name, args.fasta_name, args.ND_column, args.output_suffix, args.validity_folder_name, args.negative_ND, args.new_filters, args.upper_quarter, args.lower_quarter, args.full_set, args.gene_families, args.newer_filters, args.baseml]

    #make a dictionary with RBPs as keys and ND/p values as values.
    if summary_file_name != "None":
        summary_data = rw.read_many_fields(summary_file_name, "\t")
        #because some of the files are tab-separated, while others are comma-separated and have a header row
        if len(summary_data[0]) == 1:
            summary_data = rw.read_many_fields(summary_file_name, ",")
            summary_data = summary_data[1:]

        summary_dict = list_to_dict(summary_data, 0, ND_column, floatify = True)
            
    #make a dictionary with RBPs as keys and lists of associated motifs as values        
    motifs = rw.read_motifs(motifs_file_name)

    #if you only want to be using a subset of the motifs
    if not full_set:
        #which RBPs fulfill the necessary information content criteria?
        validity = rw.read_many_fields("{0}/sufficient_information_fraction05.csv".format(validity_folder_name), "\t")
        validity = list_to_dict(validity, 0, 1)
        #motifs with negative ND
        if negative_ND:
            motifs = [motifs[RBP] for RBP in motifs if (summary_dict[RBP] < 0) and (validity[RBP] == "True")]
        #the most significantly enriched motifs
        elif upper_quarter:
            motifs = [motifs[RBP] for RBP in motifs if (summary_dict[RBP] < 0.1) and (validity[RBP] == "True")]
        #the most significantly depleted motifs
        elif lower_quarter:
            motifs = [motifs[RBP] for RBP in motifs if (summary_dict[RBP] > 0.9) and (validity[RBP] == "True")]
        #motifs with positive ND
        else:
            motifs = [motifs[RBP] for RBP in motifs if (summary_dict[RBP] >= 0) and (validity[RBP] == "True")]

    #shove all the remaining motifs into a great big flattened and uniquified bag
    motifs = list(set(flatten(list(motifs.values()))))

    make_dir(output_folder_name)

    #prepare a Feature_Set object (a genome gtf associated to a particular genome and to a set of transcript identifiers)
    if features_file_name != "None":
        fs = Feature_Set(features_file_name, genome)
        fs.set_dataset(dataset_name)
        transcripts = fs.get_transcripts()
        CDS = fs.get_CDS()
        #paralogous families
        families = rw.read_families(families_file_name)
        #the families file might use gene identifiers, whereas the Feature_Set object uses transcript identifiers
        if gene_families:
            families = fs.convert_families_to_ENST(families, transcripts)
        fs.add_families(families)
        #pick a random member from each paralogous family
        picked_trans = fs.pick_random_members()
        names = rw.read_fasta(fasta_name)[0]
        if picked_trans[0] not in names:
            picked = [fs.convert_between_ENST_and_ENSG(i, transcripts, "ENSG") for i in picked_trans]
        else:
            picked = picked_trans
        print(len(picked))
    else:
        picked = None

    if baseml:
        method = "baseml"
    else:
        method = "gy"

    #write the input data for the conservation analysis into a file
    input_dict_file_name = "temp_data/temp_{0}.txt".format(random.random())
    conservation.input_dict_for_dS(correspondances_file_name, alignment_folder_name, fasta_name, input_dict_file_name, picked = picked)
    with open(output_file_name, "w") as file:
        file.write(",".join(["real_dS", "mean_sim_dS", "norm_dS", "p", "motif_number"]))
        file.write("\n")
        #make n_sim simulant sets for the motifs, filtering the simulants based on different sets of criteria
        if new_filters:
            simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, seed = 1)
        elif newer_filters:
            simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, seed = 1, no_duplicates = True, concat = False)               
        else:
            simulants = nc.make_simulants(motifs, n_sim, seed = 100)
        #file where the simulants dS values will be stored
        sim_output_file_name = "{0}/{1}_sim_ds.csv".format(output_folder_name, output_suffix)
        #calculate dS within motifs and simulants
        output_dict = conservation.dS_from_hits(motifs, alignment_folder_name, input_dict_file_name, n_sim = n_sim, simulants = simulants, sim_output_file_name = sim_output_file_name, method = method)
        print(output_dict)
        print("\n")
        #write to output file
        if output_dict != None:
            file.write(",".join([str(output_dict["dS"]), str(output_dict["mean simulated dS"]), str(output_dict["normalized dS"]), str(output_dict["effective p"]), str(len(motifs))]))
        else:
            file.write(",".join([str(None), str(None), str(None), str(None), str(None)]))
    os.remove(input_dict_file_name)
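The parse_arguments call at the top of this example (and the hk.parse_arguments calls elsewhere in this collection) follows one convention: the second argument is the list of argument names, and ints/flags (and, in other scripts, floats/defaults) refer to positions within that list. The helper itself is not included in these snippets, so this is only an approximate sketch of the assumed behaviour for the simple case where flagged arguments are booleans:

import argparse

def parse_arguments_sketch(description, arg_names, ints=None, flags=None):
    #positions listed in flags become boolean --options, positions in ints are
    #parsed as integers, and everything else is a positional string argument
    ints, flags = ints or [], flags or []
    parser = argparse.ArgumentParser(description=description)
    for pos, name in enumerate(arg_names):
        if pos in flags:
            parser.add_argument("--" + name, action="store_true")
        elif pos in ints:
            parser.add_argument(name, type=int)
        else:
            parser.add_argument(name, type=str)
    return parser.parse_args()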
Code example #14
def main():

    description = "Calculate the combined density of a set of motif sets."
    args = parse_arguments(description, ["motifs_file_name", "summary_file_name", "dataset_name", "output_folder_name", "output_file_name", "n_sim", "features_file_name", "genome", "families_file_name", "fasta_name", "ND_column", "seed", "output_suffix", "negative_ND", "new_filters", "upper_quarter", "lower_quarter", "full_set", "newer_filters", "two_seqs"], ints = [5, 10, 11], flags = [13, 14, 15, 16, 17, 18, 19])
    [motifs_file_name, summary_file_name, dataset_name, output_folder_name, output_file_name, n_sim, features_file_name, genome, families_file_name, fasta_name, ND_column, seed, output_suffix, negative_ND, new_filters, upper_quarter, lower_quarter, full_set, newer_filters, two_seqs] = [args.motifs_file_name, args.summary_file_name, args.dataset_name, args.output_folder_name, args.output_file_name, args.n_sim, args.features_file_name, args.genome, args.families_file_name, args.fasta_name, args.ND_column, args.seed, args.output_suffix, args.negative_ND, args.new_filters, args.upper_quarter, args.lower_quarter, args.full_set, args.newer_filters, args.two_seqs]

    #make a dictionary with RBPs as keys and ND/p values as values.
    if summary_file_name != "None":
        summary_data = rw.read_many_fields(summary_file_name, "\t")
        #because some of the files are tab-separated, while others are comma-separated and have a header row
        if len(summary_data[0]) == 1:
            summary_data = rw.read_many_fields(summary_file_name, ",")
            summary_data = summary_data[1:]

        summary_dict = list_to_dict(summary_data, 0, ND_column, floatify = True)

    #make a dictionary with RBPs as keys and lists of associated motifs as values        
    motifs = rw.read_motifs(motifs_file_name)

    #if you only want to be using a subset of the motifs
    if not full_set:
        #motifs with negative ND
        if negative_ND:
            motifs = [motifs[RBP] for RBP in motifs if summary_dict[RBP] < 0]
        #the most significantly enriched motifs
        elif upper_quarter:
            motifs = [motifs[RBP] for RBP in motifs if summary_dict[RBP] < 0.1]
        #the most significantly depleted motifs
        elif lower_quarter:
            motifs = [motifs[RBP] for RBP in motifs if summary_dict[RBP] > 0.9]
        #motifs with positive ND
        else:
            motifs = [motifs[RBP] for RBP in motifs if summary_dict[RBP] >= 0]

    #shove all the remaining motifs into a great big flattened and uniquified bag
    motifs = list(set(flatten(list(motifs.values()))))

    print(len(motifs))
    make_dir(output_folder_name)

    #if you want to average over families
    if features_file_name != "None":
        fs = Feature_Set(features_file_name, genome)
        fs.set_dataset(dataset_name)
        families = rw.read_families(families_file_name)
        fs.add_families(families)
    else:
        fs = None

    #generate 100 random 1000 bp sequences based on the hg38 mononucleotide composition and use those as your sequence fasta
    if fasta_name == "random":
        names = [i for i in range(100)]
        seqs = nc.kmers_from_nc(1000, 100, genome_comp = True)
        fasta_name = "RBP/random_sequences_from_genome_comp.fasta"
        rw.write_to_fasta(names, seqs, fasta_name)

    with open(output_file_name, "w") as output_file:
        #generate n_sim sets of simulant motifs (constraining the space of simulants based on different sets of filters)
        if new_filters:
            simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, seed = seed)
        elif newer_filters:
            simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, seed = seed, concat = False, no_duplicates = True)
        else:
            simulants = nc.make_simulants(motifs, n_sim, seed = seed)
        #calculate the density parameters of the motifs in the sequence fasta
        output_dict = nc.get_sequence_set_density(fasta_name, None, motifs, simulants, n_sim,
                                                   "{0}/overall_density_{1}.csv".format(output_folder_name, output_suffix),
                                                   "{0}/overall_sim_density_{1}.csv".format(output_folder_name, output_suffix),
                                                   "{0}/overall_positions.csv_{1}".format(output_folder_name, output_suffix),
                                                   "{0}/overall_sim_positions_{1}".format(output_folder_name, output_suffix),
                                                   concat = False, positions = False, feature_set = fs, verbose = True, two_seqs = two_seqs)
        record = [str(output_dict["median density"]), str(np.mean(output_dict["simulated densities"])), str(output_dict["median ND"]), str(output_dict["effective p"]), str(output_dict["Z"]), str(output_dict["depletion p"]), str(len(motifs)), str(output_dict["simulant sd"])]
        #write to output file
        output_file.write("\t".join(record))
        print(record)
Code example #15
File: peak_caller.py  Project: rosinaSav/dNETseq_code
def main():

    description = "Call peaks in a BED file of NET-seq reads."
    help_info = [
        "BED file (at least a BED6) with NET-seq reads. Should be single-nucleotide resolution (each BED region is the 3' end of a read).",
        "Ensembl GTF file for the relevant species. Ensure that chromosome names are formatted the same way in both the GTF and the BED file with reads!",
        "BED file with the coordinates of the transcripts to analyze. Only the name field is read, hence the others can hold placeholders. The name field must contain transcript IDs from the GTF file.",
        "Name of the output file (BED file with peak coordinates).",
        "Alpha value for calling a position as having a significantly higher local read density than expected by chance. Default: 0.01.",
        "Merge distance: adjacent peaks will be merged if they are closer than this many nucleotides. Default: 21.",
        "Minimum reads per peak. Default: 10.",
        "The number of times the read position randomization should be performed for each transcript. Higher values make the significance calculation (marginally) more robust, but they also make the programme very slow. Default: 5.",
        "Minimum length of a peak in nucleotides. Default: 5.",
        "Size of the sliding window to use when calculating the local read density. It may be sensible to set this to the same value as the merge distance. Should be an odd integer. Default: 21.",
        "The analysis will be performed this many times, with the output files numbered. Useful for running many negative control simulations at once. Default: 1.",
        "Read positions will be shuffled within each transcript before analysis. This should disrupt any signal and should give a flat peak density profile.",
        "Instead of a sliding window, adjacent non-overlapping windows will be used when calculating the local read density.",
        "When calling peaks in a given exon/intron, do not include that exon/intron in the read position randomization.",
        "When --exclude_focal is set, count an exon and its upstream intron as a single unit (except for the first exon).",
        "Don't filter out likely PCR duplicates (peaks where more than 90%% of the reads come from a single nucleotide position)."
    ]
    defaults = {4: 0.01, 5: 21, 6: 10, 7: 5, 8: 5, 9: 21, 10: 1}
    args = hk.parse_arguments(description, [
        "reads_file", "gtf", "trans_active_file", "output_file",
        "significance_threshold", "merge", "min_reads_per_peak", "iterations",
        "min_peak_length", "window_size", "runs", "neg_control", "no_slide",
        "exclude_focal", "with_ups_intron", "no_PCR_filter"
    ],
                              floats=[4],
                              ints=[5, 6, 7, 8, 9, 10],
                              flags=[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
                              detailed_help=help_info,
                              defaults=defaults)
    reads_file, gtf, trans_active_file, output_file, significance_threshold, merge, min_reads_per_peak, iterations, min_peak_length, window_size, runs, neg_control, no_slide, exclude_focal, with_ups_intron, no_PCR_filter = args.reads_file, args.gtf, args.trans_active_file, args.output_file, args.significance_threshold, args.merge, args.min_reads_per_peak, args.iterations, args.min_peak_length, args.window_size, args.runs, args.neg_control, args.no_slide, args.exclude_focal, args.with_ups_intron, args.no_PCR_filter

    print("Merge distance: {0}".format(merge))
    print("Minimum number of reads per peak: {0}".format(min_reads_per_peak))
    print("Minimum peak length: {0}".format(min_peak_length))
    print("Window size: {0}".format(window_size))
    print("Significance level: {0}".format(significance_threshold))
    print("Randomization iterations to perform: {0}".format(iterations))
    print("Runs: {0}".format(runs))

    neg_str = ""
    if neg_control:
        neg_str = "_neg_control"

    slide_str = ""
    if no_slide:
        slide_str = "_no_slide"
    intron_str = ""
    if with_ups_intron:
        intron_str = "w_ups_intr"

    # 0. make a BED file with the coordinates of transcripts

    transcripts_file = "{0}_transcripts.bed".format(gtf[:-4])
    co.get_transcripts(gtf, transcripts_file, add_chr=True)
    exons = rw.read_gtf(gtf, "exon")

    # 1. intersect the two files, loop over the result and make a
    # dictionary of reads per position for each transcript that has reads

    reads_per_pos = get_reads_per_pos(reads_file, transcripts_file)
    # only leave transcriptionally active genes (one isoform per gene)
    trans_active_genes = rw.read_many_fields(trans_active_file, "\t")[1:]
    # pull out the column with transcript IDs
    trans_active_genes = [i[3] for i in trans_active_genes]
    reads_per_pos = {
        i: reads_per_pos[i]
        for i in reads_per_pos if i.split(".")[-1] in trans_active_genes
    }

    for sim in range(runs):

        print("**********{0}**********".format(sim))

        # 2. for each transcript, randomly reshuffle the reads and calculate the
        # nth percentile depending on what the significance threshold is
        # keep positions that are higher than that threshold and write to BED file

        raw_peak_bed = "{0}_{1}_raw_peaks{2}_{3}_{4}{5}{6}{7}_{8}_sim.bed".format(
            reads_file[:-4],
            gtf.split("/")[-1][:-4], iterations, min_reads_per_peak,
            window_size, neg_str, intron_str, slide_str, sim)
        read_count_file = "{0}_{1}_read_counts{2}_{3}{4}{5}_{6}_sim.txt".format(
            reads_file[:-4],
            gtf.split("/")[-1][:-4], iterations, window_size, neg_str,
            intron_str, sim)
        new_reads_file = write_raw_peaks(reads_per_pos,
                                         raw_peak_bed,
                                         read_count_file,
                                         exons,
                                         iterations=iterations,
                                         min_read_count=min_reads_per_peak,
                                         window_size=window_size,
                                         neg_control=neg_control,
                                         no_slide=no_slide,
                                         exclude_focal=exclude_focal,
                                         with_ups_intron=with_ups_intron)
        if neg_control:
            reads_file = new_reads_file

        # 3. merge peaks

        merged_peak_bed = "{0}_{1}_merged_peaks{2}_{3}_{4}{5}{6}{7}_{8}_sim.bed".format(
            reads_file[:-4],
            gtf.split("/")[-1][:-4], iterations, window_size, merge, neg_str,
            slide_str, intron_str, sim)
        co.merge_bed(raw_peak_bed, merged_peak_bed, merge)
        print("Before filtering, there are {0} peaks.".format(
            hk.line_count(merged_peak_bed)))

        # 4. filter out peaks that don't have enough reads or are too short.
        # Write final results to file and also write a stats file with the size,
        # read count and overlapping transcript of the peaks

        stats_file = "{0}_stats_{1}_sim.txt".format(output_file[:-4], sim)
        filter_peaks(merged_peak_bed,
                     reads_file,
                     read_count_file,
                     "{0}_{1}_sim.bed".format(output_file[:-4], sim),
                     min_reads_per_peak,
                     min_peak_length,
                     stats_file,
                     no_PCR_filter=no_PCR_filter)
Code example #16
def main():
    parser = argparse.ArgumentParser(description="Calculate the conservation level of a series of RBP motifs.")
    parser.add_argument("features_file_name", type = str, help = "name of GTF file with genome features")
    parser.add_argument("dataset_name", type = str, help = "dataset name")
    parser.add_argument("genome", type = str, help = "genome assembly name")
    parser.add_argument("RBP_file_name", type = str, help = "name of file with RBP motifs")
    parser.add_argument("correspondances_file_name", type = str, help = "name of file with correspondances between genes in dataset and orthologs")
    parser.add_argument("fasta_file_name", type = str, help = "name of fasta file with the sequences")
    parser.add_argument("families_file_name", type = str, help = "name of file that contains families")
    parser.add_argument("output_file_name", type = str, help = "file for output data")
    parser.add_argument("output_folder_name", type = str, help = "folder that will contain simulated dS scores")
    parser.add_argument("alignment_folder_name", type = str, help = "name of folder that contains alignments")
    parser.add_argument("n_sim", type = int, help = "number of simulants")
    parser.add_argument("--valid_file", nargs = "?", const = "False")
    parser.add_argument("--gene_families", action = "store_true", help = "does the families file use gene identifiers?")
    parser.add_argument("--markov", dest = "markov", action = "store_true", help = "Should simulants be generated using a Markov model?")
    parser.add_argument("--new_filters", dest = "new_filters", action = "store_true", help = "Should simulants be generated using the old method but capping mononucleotide runs and removing existing motifs?")
    parser.add_argument("--newer_filters", dest = "newer_filters", action = "store_true", help = "Like new_filters but without concatenation and without allowing duplicates within simulant sets.")
    parser.add_argument("--goldman_yang", dest = "goldman_yang", action = "store_true", help = "Should Goldman & Yang's method be used for calculating dS?")
    parser.add_argument("--baseml", dest = "baseml", action = "store_true", help = "Should baseml be used instead of codeml?")
    args = parser.parse_args()
    [features_file_name, dataset_name, genome, RBP_file_name, correspondances_file_name, output_folder_name, fasta_file_name, families_file_name, output_file_name, output_folder_name, alignment_folder_name, n_sim, valid_file, gene_families, markov, new_filters, newer_filters, goldman_yang, baseml] = [args.features_file_name, args.dataset_name, args.genome, args.RBP_file_name, args.correspondances_file_name, args.output_folder_name, args.fasta_file_name, args.families_file_name, args.output_file_name, args.output_folder_name, args.alignment_folder_name, args.n_sim, args.valid_file, args.gene_families, args.markov, args.new_filters, args.newer_filters, args.goldman_yang, args.baseml]   

    #pick a random member from each paralogous family
    if features_file_name != "None":
        fs = Feature_Set(features_file_name, genome)
        fs.set_dataset(dataset_name)
        #transcripts are needed both for converting gene families and for ENST/ENSG conversion below
        transcripts = fs.get_transcripts()
        families = rw.read_families(families_file_name)
        #if the families file uses gene identifiers rather than transcript identifiers
        if gene_families:
            families = fs.convert_families_to_ENST(families, transcripts)
        fs.add_families(families)
        picked_trans = fs.pick_random_members()
        #if the fasta uses gene identifiers but the feature set uses transcript identifiers
        names = rw.read_fasta(fasta_file_name)[0]
        if picked_trans[0] not in names:
            picked = []
            for i in picked_trans:
                picked.append(fs.convert_between_ENST_and_ENSG(i, transcripts, "ENSG"))
        else:
            picked = picked_trans
        print(len(picked))
    else:
        picked = None

    motif_dict = rw.read_motifs(RBP_file_name)

    #valid_file says which proteins pass information content criteria. Only analyze the ones that do.
    if not valid_file:
        validity = rw.read_many_fields("{0}/sufficient_information_fraction05.csv".format(output_folder_name), "\t")
        validity = list_to_dict(validity, 0, 1)
    elif valid_file == "None":
        validity = {i: "True" for i in motif_dict}
    else:
        validity = rw.read_many_fields(valid_file, "\t")        
        validity = list_to_dict(validity, 0, 1)
    protein_names = sorted([name for name in list(motif_dict.keys()) if validity[name] == "True"])

    #whether to use PAML codeml or yn00.
    if baseml:
        method = "baseml"
    elif goldman_yang:
        method = "gy"
    else:
        method = "yn"

    #write the input data for the conservation analysis to file
    input_dict_file_name = "temp_data/temp_{0}.txt".format(random.random())
    conservation.input_dict_for_dS(correspondances_file_name, alignment_folder_name, fasta_file_name, input_dict_file_name, picked = picked)
    with open(output_file_name, "w") as file:
        file.write(",".join(["protein_name", "real_dS", "mean_sim_dS", "norm_dS", "p", "motif_number"]))
        file.write("\n")
        for protein in protein_names:
            print(protein)
            motifs = motif_dict[protein]
            #use one of several different methods to generate simulant motifs
            if markov:
                simulants = nc.make_simulants_markov(motifs, n_sim)
            elif new_filters:
                simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True)
            elif newer_filters:
                simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, no_duplicates = True, concat = False, seed = 1)               
            else:
                simulants = nc.make_simulants(motifs, n_sim)
            sim_output_file_name = "{0}/{1}_sim_ds.csv".format(output_folder_name, protein)
            #determine the conservation parameters of the current protein
            output_dict = conservation.dS_from_hits(motifs, alignment_folder_name, input_dict_file_name, n_sim = n_sim, simulants = simulants, sim_output_file_name = sim_output_file_name, method = method)
            print(output_dict)
            print("\n")
            if output_dict != None:
                file.write(",".join([protein, str(output_dict["dS"]), str(output_dict["mean simulated dS"]), str(output_dict["normalized dS"]), str(output_dict["effective p"]), str(len(motifs))]))
            else:
                file.write(",".join([protein, str(None), str(None), str(None), str(None), str(None)]))
            file.write("\n")
    os.remove(input_dict_file_name)
Code example #17
def main():

    description = "Calculate the conservation of k-mers that are a single point mutation away from being part of a set of motifs."
    args = parse_arguments(description, ["motifs_file_name", "summary_file_name", "output_folder_name", "p_column", "alignment_folder_name", "correspondances_file_name", "output_file_name", "dataset_name", "features_file_name", "n_sim", "output_suffix", "sequences_file_name", "families_file_name", "genome", "by_RBP"], ints = [3, 9], flags = [14])
    [motifs_file_name, summary_file_name, output_folder_name, p_column, alignment_folder_name, correspondances_file_name, output_file_name,  dataset_name, features_file_name, n_sim, output_suffix, sequences_file_name, families_file_name, genome, by_RBP] = [args.motifs_file_name, args.summary_file_name, args.output_folder_name, args.p_column, args.alignment_folder_name, args.correspondances_file_name, args.output_file_name, args.dataset_name, args.features_file_name, args.n_sim, args.output_suffix, args.sequences_file_name, args.families_file_name, args.genome, args.by_RBP]

    RBPs = rw.read_motifs(motifs_file_name)

    #only leave those RBPs that pass information content criteria
    validity = rw.read_many_fields("{0}/sufficient_information_fraction05.csv".format(output_folder_name), "\t")
    validity = list_to_dict(validity, 0, 1)
    RBPs = {i: RBPs[i] for i in RBPs if validity[i] == "True"}

    #if you're not doing this by RBP, pool motifs from the most significantly depleted sets
    if not by_RBP:
        summary_data = rw.read_many_fields(summary_file_name, "\t")
        if len(summary_data[0]) == 1:
            summary_data = rw.read_many_fields(summary_file_name, ",")    
        summary_dict = list_to_dict(summary_data, 0, p_column, floatify = True)            
        RBPs = {i: RBPs[i] for i in RBPs if summary_dict[i] > 0.9}
        motifs = list(set(flatten(list(RBPs.values()))))
        RBPs = {"all": motifs}

    #randomly pick one gene from each paralogous family
    fs = Feature_Set(features_file_name, genome)
    fs.set_dataset(dataset_name)
    transcripts = fs.get_transcripts()
    families = rw.read_families(families_file_name)
    families = fs.convert_families_to_ENST(families, transcripts)
    fs.add_families(families)
    picked_from_families = fs.pick_random_members()
    gene_name_dict = fs.get_gene_name_dict(transcripts)
    picked = [fs.convert_between_ENST_and_ENSG(i, gene_name_dict, "ENSG") for i in picked_from_families]

    names, CDS = rw.read_fasta(sequences_file_name)

    #make a dictionary where the keys are genes from the focal species and the values are orthologs from another species
    correspondances = rw.read_many_fields(correspondances_file_name, ",")
    correspondance_dict = {}
    for i in correspondances:
        correspondance_dict[i[0]] = i[1]

    output_dict = {}

    #loop over the RBPs
    for protein in sorted(RBPs):

        #fetch the current motifs
        print(protein)
        motifs = RBPs[protein]
        print("There are {0} motifs.".format(len(motifs)))
        #generate all unique motifs that are a single base substitution away from one of the motifs but are not actually in the set
        neighbours = nc.get_neighbours(motifs)
        print("There are {0} neighbours.".format(len(neighbours)))            

        #make simulants for the motifs. don't allow simulants to be part of the set of neighbours.
        simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, exclude = neighbours, no_duplicates = True, concat = False)

        neighbour_lengths = [len(i) for i in neighbours]        
        neighbours = nc.motif_to_regex(neighbours)

        #determine the true frequency at which fourfold degenerate sites that are a single substitution away from a motif in human actually contain the base that
        #would give rise to the motif in the orthologous species
        site_number = 0
        mutation_score = 0
        motifs = [list(i) for i in motifs]
        true_result = run_in_parallel(picked, ["foo", correspondance_dict, alignment_folder_name, CDS, names, motifs, neighbours, neighbour_lengths], get_mutation_to_motif) 
        for i in true_result:
            current = i.get()
            site_number = site_number + current[0]
            mutation_score = mutation_score + current[1]
        if site_number > 0:
            real_fraction = mutation_score/site_number
        else:
            real_fraction = None
        print("Real fraction:")
        print(real_fraction)

        neighbours = ""      
        sim_site_numbers = np.zeros((n_sim))
        sim_mutation_scores = np.zeros((n_sim))

        #obtain this estimate also for each simulant set
        #I'm doing this in this awkward manner because I don't have enough RAM to hold all the simulated neighbours in memory at once
        for sim in range(n_sim):
            if sim%10 == 0:
                print(sim)
            current_simulants = simulants[sim]
            current_neighbours = nc.get_neighbours(current_simulants)
            current_neighbour_lengths = [len(i) for i in current_neighbours]        
            current_neighbours = nc.motif_to_regex(current_neighbours)
            current_simulants = [list(i) for i in current_simulants]
            current_result = run_in_parallel(picked, ["foo", correspondance_dict, alignment_folder_name, CDS, names, current_simulants, current_neighbours, current_neighbour_lengths], get_mutation_to_motif)
            for i in current_result:
                current = i.get()
                sim_site_numbers[sim] = sim_site_numbers[sim] + current[0]
                sim_mutation_scores[sim] = sim_mutation_scores[sim] + current[1]

        #normalize the real fraction, calculate p
        sim_fractions = np.divide(sim_mutation_scores, sim_site_numbers)
        sim_fractions = [i for i in sim_fractions if i != np.inf]
        p = ms.calc_eff_p(real_fraction, sim_fractions, greater = False)
        norm_fraction = ms.normalize(real_fraction, sim_fractions) 

        output_dict[protein] = [protein, mutation_score, site_number, real_fraction, np.mean(sim_fractions), p, norm_fraction]
        print(output_dict[protein])
        
    with open(output_file_name, "w") as output_file:
        #write header to output file
        output_file.write("protein\tmutation score\tsite number\treal fraction\tmean sim fraction\tp\tnormalized fraction\n")
        #write the rest of the output data
        for protein in sorted(list(output_dict.keys())):
            to_write = output_dict[protein]
            to_write = [str(i) for i in to_write]
            output_file.write("\t".join(to_write))
            output_file.write("\n")
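ms.calc_eff_p and ms.normalize are not defined in these snippets. Purely as an assumption about what they might compute, an empirical p-value with a pseudocount and a simulant-relative normalization could look like this:

import numpy as np

def calc_eff_p_sketch(real, sims, greater=True):
    #assumed behaviour: empirical p-value with a +1 pseudocount;
    #greater controls the direction of the comparison
    if greater:
        more_extreme = len([s for s in sims if s >= real])
    else:
        more_extreme = len([s for s in sims if s <= real])
    return (more_extreme + 1) / (len(sims) + 1)

def normalize_sketch(real, sims):
    #assumed behaviour: express the real value relative to the simulant mean
    #(a z-score against the simulant distribution would be an alternative)
    return (real - np.mean(sims)) / np.mean(sims)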
Code example #18
def main():
        description = "Pick roughly nucleotide-matched control sites for a set of motif hits."
        args = parse_arguments(description, ["fasta", "genome", "features_file", "families_file", "dataset", "motifs_file", "run_number", "hit_file", "niter", "stepsize", "control_file", "error_file", "MSA_file_name_prefix", "anc_CG_file_name", "high_CG_file_name", "exclude_file", "brute_mapping", "verbose", "old_motif_format", "nonsyn_hits", "top_set_only", "remove_GT", "leave_CG", "remove_ancestral_CpG", "replacement_control", "macaque_anc", "remove_macaque_CpG", "big_tree", "pseudoCG", "comprehensive", "context", "prone_sites", "CG_gene_filter", "match_size", "raw", "regions"], ints = [6, 8, 9], flags = [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35])
        fasta, genome, features_file, families_file, dataset, motifs_file, run_number, hit_file, niter, stepsize, control_file, error_file, MSA_file_name_prefix, anc_CG_file_name, high_CG_file_name, exclude_file, brute_mapping, verbose, old_motif_format, nonsyn_hits, top_set_only, remove_GT, leave_CG, remove_ancestral_CpG, replacement_control, macaque_anc, remove_macaque_CpG, big_tree, pseudoCG, comprehensive, context, prone_sites, CG_gene_filter, match_size, raw, regions = args.fasta, args.genome, args.features_file, args.families_file, args.dataset, args.motifs_file, args.run_number, args.hit_file, args.niter, args.stepsize, args.control_file, args.error_file, args.MSA_file_name_prefix, args.anc_CG_file_name, args.high_CG_file_name, args.exclude_file, args.brute_mapping, args.verbose, args.old_motif_format, args.nonsyn_hits, args.top_set_only, args.remove_GT, args.leave_CG, args.remove_ancestral_CpG, args.replacement_control, args.macaque_anc, args.remove_macaque_CpG, args.big_tree, args.pseudoCG, args.comprehensive, args.context, args.prone_sites, args.CG_gene_filter, args.match_size, args.raw, args.regions

        #argparse can't do booleans
        if anc_CG_file_name == "None":
            anc_CG_file_name = None

        #I store motif data in one of two formats
        if old_motif_format:
            motifs = rw.read_names(motifs_file)[1:]
        else:
            motifs = rw.read_motifs(motifs_file)
            #if you're doing RBP motifs and only want motifs that were found to be enriched in Savisaar and Hurst 2017
            if top_set_only:
                summary_data = rw.read_many_fields("RBP/RBP_hg38_introncontaining_new.txt", "\t")

                summary_dict = list_to_dict(summary_data, 0, 4, floatify = True)

                motifs = {RBP: motifs[RBP] for RBP in motifs if (summary_dict[RBP] < 0.1)}
            motifs = list(set(flatten(list(motifs.values()))))

        #create an instance of a Feature_Set object and associate a structure of paralogous families to it, unless you've said to ignore that (used when analyzing exon flanks/cores)
        fs = Feature_Set(features_file, genome)
        fs.set_dataset(dataset)
        if families_file == "None":
            conservation.find_families(fasta, "general/{0}".format(dataset))
            families_file = "general/{0}_families.txt".format(dataset)

        if families_file != "ignore":
            families = rw.read_families(families_file)
            fs.add_families(families)

        general_folder = "DFE/for_everybody"
        make_dir(general_folder)
        #if you've already retrieved MSAs from ensembl
        if MSA_file_name_prefix == "None":
            MSA_file_name_prefix = "{0}/{1}_MSA".format(general_folder, dataset)

        #admin
        transcripts = fs.get_transcripts()
        CDSs = fs.get_CDS()
        lengths = fs.get_lengths(CDSs, CDS = True)
        #only consider genes that are not on the sex chromosomes
        sex_chromosomes = ["X", "Y"]
        chrom_dict = {i: transcripts[i][0] for i in transcripts if transcripts[i][0] not in sex_chromosomes}
        chroms = list(set(list(chrom_dict.values())))

        #U2S is a dinucleotide-based substitution model, JC69 is mononucleotide-based
        if context:
            subst_model = "U2S"
        else:
            subst_model = "JC69"

        #names used in the MSA (there's a character restriction in the phylip files so you can't use the full name)
        clean_names = ["h**o", "pan", "pongo", "macaca"]
        phylip_data = {"homo_sapiens": [], "pongo_abelii": [], "macaca_mulatta": [], "pan_troglodytes": []}
        if big_tree:
            clean_names = ["calli", "chloro", "gorilla", "h**o", "macaca", "pan", "papio", "pongo"]
            phylip_data = {"gorilla_gorilla": [], "callithrix_jacchus": [], "papio_anubis": [], "chlorocebus_sabaeus": [], "homo_sapiens": [], "pongo_abelii": [], "macaca_mulatta": [], "pan_troglodytes": []}

        if remove_ancestral_CpG or remove_macaque_CpG or CG_gene_filter:
            anc_CG_dict, macaque_CG_dict = get_CpG_dicts(CDSs, chroms, MSA_file_name_prefix, lengths, clean_names, phylip_data, fasta, anc_CG_file_name, high_CG_file_name, fs, macaque_anc = macaque_anc, pseudoCG = pseudoCG, comprehensive = comprehensive, subst_model = subst_model, regions = regions)
                                
        else:
            anc_CG_dict = None
            macaque_CG_dict = None
        
        if replacement_control:
            nc.fit_control_pos_to_hits_replacement(fasta, motifs, run_number, hit_file, control_file, anc_CG_dict, macaque_CG_dict, family_seed = 5, CG_gene_filter = CG_gene_filter, niter = niter, verbose = verbose, brute_mapping = brute_mapping, stepsize = stepsize, write_errors = error_file, fs = fs, nonsyn_hits = nonsyn_hits, leave_CG = leave_CG, remove_ancestral_CpG = remove_ancestral_CpG, remove_macaque_CpG = remove_macaque_CpG, pseudoCG = pseudoCG, prone_sites = prone_sites, match_size = match_size, raw = raw, exclude_file = exclude_file)
        else:
            nc.fit_control_pos_to_hits_wrapper(fasta, motifs, run_number, hit_file, control_file, anc_CG_dict, macaque_CG_dict, family_seed = 5, CG_gene_filter = CG_gene_filter, niter = niter, verbose = verbose, brute_mapping = brute_mapping, stepsize = stepsize, write_errors = error_file, fs = fs, nonsyn_hits = nonsyn_hits, leave_CG = leave_CG, remove_ancestral_CpG = remove_ancestral_CpG, remove_macaque_CpG = remove_macaque_CpG, pseudoCG = pseudoCG, prone_sites = prone_sites, match_size = match_size)
Code example #19
def main():
    description = "Calculate the normalized dS of a dataset."
    args = parse_arguments(description, [
        "dataset", "feature_set", "genome", "families_file", "fasta",
        "hit_file_prefix", "motifs_file", "correspondances", "alignments",
        "suffix", "trials", "trial_file", "old_trial_file", "region_fasta",
        "old_motif_format", "nonsense", "no_families", "newest_only",
        "top_set_only", "calc_p", "reverse_site_numbers", "matched", "degen",
        "regions"
    ],
                           ints=[10],
                           flags=[14, 15, 16, 17, 18, 19, 20, 21, 22, 23])
    dataset, feature_set, genome, families_file, fasta, hit_file_prefix, motifs_file, correspondances, alignments, suffix, trials, trial_file, old_trial_file, region_fasta, old_motif_format, nonsense, no_families, newest_only, top_set_only, calc_p, reverse_site_numbers, matched, degen, regions = args.dataset, args.feature_set, args.genome, args.families_file, args.fasta, args.hit_file_prefix, args.motifs_file, args.correspondances, args.alignments, args.suffix, args.trials, args.trial_file, args.old_trial_file, args.region_fasta, args.old_motif_format, args.nonsense, args.no_families, args.newest_only, args.top_set_only, args.calc_p, args.reverse_site_numbers, args.matched, args.degen, args.regions

    n_sim = 1000

    print(suffix)

    #set up feature set and families
    fs = Feature_Set(feature_set, genome)
    fs.set_dataset(dataset)
    if no_families:
        picked = fs.names
    else:
        families = rw.read_families(families_file)
        fs.add_families(families)
        picked = fs.pick_random_members()

    hit_phylip = "temp_data/temp_{0}.phy".format(random.random())
    control_phylip = "temp_data/temp_control_{0}.phy".format(random.random())

    if not nonsense:
        if old_motif_format:
            motifs = rw.read_names(motifs_file)[1:]
        else:
            motifs = rw.read_motifs(motifs_file)
            if top_set_only:
                summary_data = rw.read_many_fields(
                    "RBP/RBP_hg38_introncontaining_new.txt", "\t")
                summary_dict = list_to_dict(summary_data, 0, 4, floatify=True)
                motifs = {
                    RBP: motifs[RBP]
                    for RBP in motifs if (summary_dict[RBP] < 0.1)
                }
            motifs = list(set(flatten(motifs.values())))

    if reverse_site_numbers:
        site_number_suffix = "_reversed_site_numbers_"
    else:
        site_number_suffix = ""

    if matched:
        matched_suff = "_matched"
    else:
        matched_suff = ""

    if degen:
        degen_suff = "_degen.txt"
    else:
        degen_suff = ""

    with open(trial_file, "w") as trial_out:

        trial_out.write(
            "trial\tA\tT\tC\tG\told\told_no_hum_CG\tnew_no_human_CG\tnew_no_hum_no_anc_CG\tnew_w_CG\tnew_no_anc_CG\tnew_no_anc_CG_macaque\tnewer_no_human_CG\tnewer_no_hum_no_anc_CG\tnewer_w_CG\tnewer_no_anc_CG\n"
        )
        if old_trial_file != "None":
            old_trials = rw.read_many_fields(old_trial_file, "\t")
            old_trials = old_trials[1:]
            old_trials = [i[1:5] for i in old_trials]
            seed_kmers = 1
        else:
            seed_kmers = None

        #you can do this for loads of trials
        #useful as a negative control if you're generating a new set of nonsense motifs
        #each time
        for trial in range(trials):

            print(trial)

            trial_output = [trial]

            #if you're meant to generate a load of nonsense motifs rather than using real motifs
            if nonsense:
                if old_trial_file != "None":
                    #read in the intended nucleotide composition of the nonsense
                    #motifs from file
                    scaled_comp = [float(i) for i in old_trials[trial]]
                else:
                    #pick nonsense motifs nucleotide composition by chance
                    comp = [random.random() for i in range(4)]
                    scaled_comp = [i / np.sum(comp) for i in comp]
                comp_dict = {
                    i: scaled_comp[pos]
                    for pos, i in enumerate(nc._canon_bases_)
                }
                motifs, obtained_dict = nc.kmers_from_nc(6,
                                                         50,
                                                         comp_dict=comp_dict,
                                                         return_freqs=True,
                                                         seed=seed_kmers)
                motifs = ["motifs"] + motifs
                trial_output = trial_output + [
                    obtained_dict[i] for i in nc._canon_bases_
                ]
                temp_motifs_file = "temp_data/temp_motifs.txt"
                rw.write_names(motifs, temp_motifs_file)

            print(
                "===NEW METHOD WITH NO ANCESTRAL CpG (MACAQUE, BIG TREE, CONTEXT), REPLACEMENT CONTROL==="
            )
            hit_file = "{0}_hits_no_anc_CG_only_macaque_big_context{1}_replace.txt{2}".format(
                hit_file_prefix, matched_suff, degen_suff)
            control_file = "{0}_controls_no_anc_CG_only_macaque_big_context{1}_replace.txt{2}".format(
                hit_file_prefix, matched_suff, degen_suff)
            if nonsense:
                hit_file = "temp_data/temp_hits{0}.txt".format(random.random())
                control_file = "temp_data/temp_controls{0}.txt".format(
                    random.random())
                error_file = "temp_data/temp_error{0}.txt".format(
                    random.random())
                get_control_sites(
                    fasta, genome, feature_set, families_file, dataset,
                    temp_motifs_file, hit_file, control_file, error_file,
                    "DFE/for_everybody/filtered_hg38_85_pc_multiexon_anc_CG_big_context_threshold05.txt",
                    [
                        "--leave_CG", "--context", "--remove_ancestral_CpG",
                        "--macaque_anc", "--big_tree", "--replacement_control"
                    ])
            get_density(fasta, motifs, fs)
            norm_ds = get_new_method_results(hit_file,
                                             control_file,
                                             hit_phylip,
                                             control_phylip,
                                             correspondances,
                                             alignments,
                                             fasta,
                                             regions=regions,
                                             global_fasta=region_fasta,
                                             fs=fs)
            trial_output.append(norm_ds)
            if calc_p:
                p, low_CI, high_CI, sd, Z = get_sim_p(
                    norm_ds,
                    hit_file,
                    control_file,
                    correspondances,
                    alignments,
                    fasta,
                    n_sim,
                    reverse_site_numbers=reverse_site_numbers,
                    sim_ds_file=
                    "{0}{1}_sim_norm_ds_no_anc_CG_only_macaque_big_context{2}_replace.txt{3}"
                    .format(hit_file_prefix, site_number_suffix, matched_suff,
                            degen_suff))

            trial_output = "\t".join([str(i) for i in trial_output])
            trial_out.write(trial_output)
            trial_out.write("\n")

            remove_file(hit_phylip)
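The call to get_sim_p above returns a p-value, confidence bounds, a standard deviation and a Z-score, but the helper itself is not part of this excerpt. A minimal sketch of how such values could be derived from simulated normalized dS scores might look as follows (the function name, the direction of the test and the use of a percentile interval are assumptions, not the repository's actual implementation):

import numpy as np

def empirical_sim_p(observed, sim_values):
    #hypothetical sketch: compare an observed normalized dS against simulant values
    sim_values = np.array(sim_values, dtype=float)
    sd = np.std(sim_values, ddof=1)
    #Z-score of the observation relative to the simulant distribution
    Z = (observed - np.mean(sim_values)) / sd
    #empirical p-value: fraction of simulants at least as low as the observation
    #(+1 in numerator and denominator so the p-value is never exactly 0)
    p = (np.sum(sim_values <= observed) + 1) / (len(sim_values) + 1)
    #95% interval of the simulant distribution
    low_CI, high_CI = np.percentile(sim_values, [2.5, 97.5])
    return p, low_CI, high_CI, sd, Z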
Code Example #20
def main():

    description = "Construct a site frequency spectrum that only considers motif-disrupting SNPs."
    args = parse_arguments(description, ["fasta", "output_file", "motif_file", "anc_file", "control_file", "SNPs_file", "N", "old_motif_format", "human", "ancestral"], ints = [6], flags = [7, 8, 9])
    fasta, output_file, motif_file, anc_file, control_file, SNPs_file, N, old_motif_format, human, ancestral = args.fasta, args.output_file, args.motif_file, args.anc_file, args.control_file, args.SNPs_file, args.N, args.old_motif_format, args.human, args.ancestral

    names, seqs = rw.read_fasta(fasta)

    #I use two different formats for storing sequence motifs,
    #so we need to know which one it is
    if old_motif_format:
        motifs = rw.read_names(motif_file)[1:]
        print(len(motifs))
    else:
        motifs = rw.read_motifs(motif_file)
        motifs = sorted(list(set(flatten(list(motifs.values())))))

    #get the lengths of the motifs and compile lookahead regexes
    #that recognize the whole motif but only store the position of the first base
    #these will be needed when searching for the motifs
    motif_lengths = [len(i) for i in motifs]
    motif_regex = nc.motif_to_regex(motifs)

    #I'm going to treat CG and GC as two 2-bp motifs, using the same code as when searching for, say,
    #ESE motifs
    CG_2mers = ["CG", "GC"]
    CG_lengths = [2, 2]
    CG_regex = nc.motif_to_regex(CG_2mers)

    motifs = [list(i) for i in motifs]

    if ancestral:
        anc_pos = rw.read_pos(anc_file)

    #read in hit and control positions
    controls = rw.read_pos(control_file)
    hit_file = re.sub("controls", "hits", control_file)
    hits = rw.read_pos(hit_file)

    #read in SNP data
    SNPs = rw.read_many_fields(SNPs_file, "\t")
    #the third column of the SNPs file (index 2) contains positions that need to be discarded from analysis because they contain unanalyzable SNP data
    to_remove = list_to_dict(SNPs, 0, 2)
    to_remove = {i: to_remove[i].split(",") for i in to_remove}
    to_remove = {i: [int(j) for j in to_remove[i] if j not in ["error", ""]] for i in to_remove}
    SNPs = list_to_dict(SNPs, 0, 1)

    #all the SNPs associated to a transcript
    full_SNPs = {}
    #disruptive SNPs only
    clean_SNPs = {}
    minor_alleles = {}

    #the number of hit positions where, say, a T could theoretically substitute to an A (i.e. all T positions)
    transitions_total = {i: {j: 0 for j in nc._canon_bases_} for i in nc._canon_bases_}
    #the same as above but only counting those substitutions that would turn a motif into a non-motif
    transitions_disr = {i: {j: 0 for j in nc._canon_bases_} for i in nc._canon_bases_}

    #this block of code filters the true SNPs to only leave those that are disruptive
    #and also calculates the probability of being disruptive for all potential SNPs
    with open("{0}_degen.txt".format(hit_file), "w") as hit_degen_file:
        counter = 0
        for trans in names:
            counter = update_counter(counter, 1000)
            if trans in controls:
                if trans in SNPs:
                    trans_SNPs = SNPs[trans]
                else:
                    trans_SNPs = []
                trans_SNPs, clean_SNPs, full_SNPs, minor_alleles = parse_SNPs(trans_SNPs, clean_SNPs, full_SNPs, minor_alleles, trans)
                current_seq = seqs[names.index(trans)]
                fourfold_pos = nc.get_4fold_deg(current_seq)
                #CpG filtering
                if human:
                    CG_pos = nc.get_motif_set_density(CG_regex, CG_lengths, current_seq, concat = True)["positions"]
                    fourfold_pos = [i for i in fourfold_pos if i not in CG_pos]
                if ancestral:
                    fourfold_pos = [i for i in fourfold_pos if i not in anc_pos[trans]]
                all_sites, clean_SNPs, transitions_total, transitions_disr, hit_degen_file = check_disruption(motif_regex, current_seq, motifs, motif_lengths, fourfold_pos, full_SNPs, clean_SNPs, minor_alleles, trans, transitions_total, transitions_disr, hit_degen_file, to_remove)
                hit_degen_file.write("\n")

    to_remove = {i: [j for j in to_remove[i] if j not in full_SNPs[i]] for i in to_remove if i in controls}

    hit_SFS = get_SFS(hits, clean_SNPs, to_remove, N)

    transitions = get_transitions(transitions_disr, transitions_total)
    print(transitions)

    #this block randomly assigns certain SNPs at simulant positions to be disruptive,
    #with the probability of that happening proportional to the frequency with which potential substitutions
    #of that nucleotide composition would be disruptive for true (motif) sites
    with open("{0}_degen.txt".format(control_file), "w") as control_degen_file:
        control_SNPs = {}
        counter = 0
        for trans in controls:
            control_degen_file.write("{0}\t".format(trans))
            counter = update_counter(counter, 1000)
            control_SNPs[trans] = {}
            trans_SNPs = full_SNPs[trans]
            current_seq = seqs[names.index(trans)]
            for site in controls[trans]:
                if trans not in to_remove or site not in to_remove[trans]:
                    ref_allele = current_seq[site]
                    disrupt_bases = get_disrupt_bases(ref_allele, transitions)
                    control_degen_file.write("{0}:{1},".format(site, "|".join(disrupt_bases)))
                    if site in trans_SNPs:
                        minor_allele = minor_alleles[trans][site]
                        if minor_allele in disrupt_bases:
                            control_SNPs[trans][site] = trans_SNPs[site]
            control_degen_file.write("\n")

    control_SFS = get_SFS(controls, control_SNPs, to_remove, N)

    with open(output_file, "w") as file:
        file.write("{0}\n".format(N))
        file.write(" ".join([str(i) for i in hit_SFS]))
        file.write("\n")
        file.write(" ".join([str(i) for i in control_SFS]))
        file.write("\n")    
Code Example #21
def get_ancestral_CG(outroot, subst_model, phy_files, model_file, tuples_mapping_dict, anc_CG_file_name, high_CG = None, min_inf = None, macaque = False, comprehensive = False, from_model = False):
    '''
    Get a dictionary that says for each transcript which positions were ancestrally CpG/GpC.
    '''
    #if a file name hasn't been supplied or if the file with the supplied name doesn't exist, determine
    #CpG positions again, otherwise just read them in from the file
    if not anc_CG_file_name or anc_CG_file_name == "None" or not os.path.exists(anc_CG_file_name):
        #you need several in case you have a high_CG dictionary
        pps = []
        for phy_file in phy_files:
            if subst_model == "JC69" or from_model:
                #use an existing substitution model
                arguments = ["phyloFit", "--init-model", model_file, "--out-root", outroot, "--subst-mod", subst_model,
                                       "--msa-format", "PHYLIP", "--post-probs", "--scale-only", phy_file]
            else:
                #estimate a new model
                arguments = ["phyloFit", "--out-root", outroot, "--subst-mod", subst_model,
                                       "--msa-format", "PHYLIP", "--tree", "DFE/full_tree.tree", "--post-probs", phy_file]
                
            if subst_model == "JC69":
                block_size = 4
                tuple_pos_lim = 2
                shift_in_tuple = 0
            else:
                #for dinucleotide models
                block_size = 16
                tuple_pos_lim = 3
                shift_in_tuple = 9

            #turn off when testing                        
            if min_inf:
                arguments.extend(["-I", min_inf])
            results = run_process(arguments)
            #read in posterior probabilities of having various nucleotides ancestrally
            pp_file = "{0}.postprob".format(outroot)
            pp = rw.read_many_fields(pp_file, " ")
            pp = [[j for j in i if j] for i in pp]
            pp = pp[2:]
            #the posterior probability that you had a CpG at a position has to be greater
            #than threshold for a position to be counted as ancestrally CpG
            threshold = 0.5
            #will be over-written if you're doing big tree
            human_pos = 0
            #the outgroup nodes are labelled from the outside in, starting from 1
            if macaque:
                #this is to determine whether we're doing the big tree or the little tree
                if len(pp[0]) == 14:
                    #little tree, mononucleotide
                    pp = {"_".join(i[1:tuple_pos_lim]): [i[len(i) - (3 * block_size): len(i) - (2 * block_size)]] for i in pp}
                elif len(pp[0]) > 14:
                    #big tree/dinucleotide (i.e. it'll give you nonsense if you're trying to do context with the little tree)
                    #the shift_in_tuple is to do with the fact that if you're doing U2S, you want the second tuple and not the first
                    human_pos = 3 + shift_in_tuple
                    if comprehensive:
                        #you want to get all nodes except for node 0, which is the outgroup-ingroup ancestor
                        pp = {"_".join(i[1:tuple_pos_lim]): [i[len(i) - (j * block_size): len(i) - ((j - 1) * block_size)] for j in range(1, 7)] for i in pp}
                    else:
                        pp = {"_".join(i[1:tuple_pos_lim]): [i[len(i) - (6 * block_size): len(i) - (5 * block_size)]] for i in pp}
                else:
                    #for tests etc. where you might only have, say, two species
                    pp = {"_".join(i[1:tuple_pos_lim]): [i[-block_size:]] for i in pp}
            else:
                pp = {"_".join(i[1:tuple_pos_lim]): [i[-block_size:]] for i in pp}
            pps.append(pp)
        anc_CG = {}
        #just to get the length
        example_pp = pps[0][list(pps[0].keys())[0]]
        for trans in tuples_mapping_dict:
            #tuples_mapping_dict has the alignment tuple corresponding to each position
            #because the phyloFit output is organized by tuples, not by positions
            anc_CG[trans] = []
            for node_pos in range(len(example_pp)):
                #if you're using dinucleotides
                if subst_model != "JC69":
                    for pos in sorted(tuples_mapping_dict[trans].keys())[1:]:
                        try:
                            pp_number = 0
                            #if you're gonna produce different output dictionaries for high and low GC regions
                            if high_CG:
                                if pos in high_CG[trans]:
                                    pp_number = 1
                            current_tuple = tuples_mapping_dict[trans][pos]
                            #don't consider positions where there is an alignment gap for human
                            if current_tuple[human_pos] != "*":
##                                print(current_tuple)
##                                print(pps[pp_number])
##                                print("\n")
                                if current_tuple in pps[pp_number]:
                                    current_pp = pps[pp_number][current_tuple][node_pos]
                                else:
                                    current_pp = pps[abs(pp_number - 1)][current_tuple][node_pos]
                                #because it can be either GC or CG, hence 6 or 9
                                if float(current_pp[6]) > threshold or float(current_pp[9]) > threshold:
                                    #you're always testing the second member in the dinucleotide
                                    anc_CG[trans].append(pos - 1)
                                    anc_CG[trans].append(pos)
                        except KeyError:
                            if pos % 100 == 0:
                                pass
                            else:
                                raise KeyError
                else:
                    #if you're using mononucleotides, you have to keep track of what the previous nucleotide was
                    C_prev = False
                    G_prev = False
                    for pos in sorted(tuples_mapping_dict[trans].keys()):
                        pp_number = 0
                        if high_CG:
                            if pos in high_CG[trans]:
                                pp_number = 1
                        current_C = False
                        current_G = False
                        current_tuple = tuples_mapping_dict[trans][pos]
                        if current_tuple[human_pos] != "*":
                            current_pp = pps[pp_number][current_tuple][node_pos]
                            #if current is C and previous was G
                            if float(current_pp[1]) > threshold:
                                if G_prev:
                                    anc_CG[trans].append(G_pos)
                                    anc_CG[trans].append(pos)
                                current_C = True
                            #if current is G and previous was C
                            if float(current_pp[2]) > threshold:
                                if C_prev:
                                    anc_CG[trans].append(C_pos)
                                    anc_CG[trans].append(pos)
                                current_G = True
                            C_prev = False
                            G_prev = False
                            if current_C:
                                C_prev = True
                                #you need to specify the position explicitly because it's not necessarily
                                #the last one if there were dashes
                                C_pos = pos
                            if current_G:
                                G_prev = True
                                G_pos = pos
            anc_CG[trans] = sorted(list(set(anc_CG[trans])))
        remove_file(pp_file)
        if anc_CG_file_name and anc_CG_file_name != "None":
            with open(anc_CG_file_name, "w") as file:
                for trans in anc_CG:
                    to_write = "\t".join([trans, ",".join([str(i) for i in anc_CG[trans]])])
                    file.write(to_write)
                    file.write("\n")
    else:
        #parse
        anc_CG = rw.read_many_fields(anc_CG_file_name, "\t")
        anc_CG = [i for i in anc_CG if len(i) == 2]
        anc_CG = list_to_dict(anc_CG, 0, 1)
        anc_CG = {trans: [int(pos) for pos in anc_CG[trans].split(",") if pos != ""] for trans in anc_CG}
    return(anc_CG)
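The "6 or 9" indexing inside get_ancestral_CG assumes that the 16 dinucleotide states in the posterior-probability blocks are enumerated alphabetically (this ordering is an assumption about the phyloFit output, not something demonstrated in this excerpt). A quick self-contained check of that arithmetic:

from itertools import product

#enumerate the 16 dinucleotides in alphabetical order (assumed ordering)
dinucleotides = ["".join(pair) for pair in product("ACGT", repeat=2)]
#CG and GC sit at positions 6 and 9, matching the indices tested above
assert dinucleotides.index("CG") == 6
assert dinucleotides.index("GC") == 9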
Code Example #22
File: RBP_motifs.py Project: rosinaSav/RBP_motifs
def main():
    '''
    Read in a series of input files on the sequence specificities of RBPs,
    filter the data and write a set of motifs for each RBP.
    Arguments (see Methods for further details on the input data files):
    upper_threshold, lower_threshold: the longest and shortest a motif is allowed to be, respectively
    RBPDB_experiments: path to RBPDB experiments file
    RBPDB_proteins: path to RBPDB proteins file
    RBPDB_PWMs: path to file containing RBPDB PWM identifier to RBP mapping
    pwm_dir: path to directory containing RBPDB PWMs
    RBPmap_PSSMs: path to directory containing RBPmap PSSMs
    SFmap_proteins: path to file containing motifs from SFmap
    RNAcompete_information: path to summary file from CIS-BP RNA
    RNAcompete_PWMs: path to directory containing CIS-BP RNA PWMs
    final_motifs_file_name: name for output file
    plot_name: file for plot displaying the distribution of motif set sizes
    species: the species for which motifs are required
    '''

    description = "Compile a set of motifs putatively recognized by RNA-binding proteins."
    args = parse_arguments(description, ["upper_threshold", "lower_threshold", "RBPDB_experiments", "RBPDB_proteins", "RBPDB_PWMs", "pwm_dir", "RBPmap_PSSMs", "SFmap_proteins", "RNAcompete_information", "RNAcompete_PWMs", "final_motifs_file_name", "plot_name", "species"], ints = [0, 1])
    [upper_threshold, lower_threshold, RBPDB_experiments, RBPDB_proteins, RBPDB_PWMs, pwm_dir, RBPmap_PSSMs, SFmap_proteins, RNAcompete_information, RNAcompete_PWMs, final_motifs_file_name, plot_name, species] = [args.upper_threshold, args.lower_threshold, args.RBPDB_experiments, args.RBPDB_proteins, args.RBPDB_PWMs, args.pwm_dir, args.RBPmap_PSSMs, args.SFmap_proteins, args.RNAcompete_information, args.RNAcompete_PWMs, args.final_motifs_file_name, args.plot_name, args.species]

    db_fields = rw.read_many_fields(RBPDB_experiments, ",")
    db_fields = db_fields[1:]
    print("There are {0} RBPDB experiments.".format(len(db_fields)))
    db_proteins = rw.read_many_fields(RBPDB_proteins, ",")
    #species is "H**o sapiens" or "Mus musculus"
    db_proteins = [i for i in db_proteins if i[6] == species]
    protein_names = sorted(list(set([i[4] for i in db_proteins])))
    db_fields = [i for i in db_fields if i[3] in protein_names]
    protein_number_before = (len(list(set([i[3] for i in db_fields]))))
    print("{0} were performed in {1}.\n".format(len(db_fields), species))
    db_fields = [i for i in db_fields if i[2] != ""]
    protein_number_after = (len(list(set([i[3] for i in db_fields]))))
    db_fields = [[i[3], "RBPDB", i[0], i[1], i[2]] for i in db_fields]
    print("After removing experiments with no reported motif, {0} proteins remain of the initial {1}.\n".format(protein_number_after, protein_number_before))

    bases = np.array(["A", "C", "G", "U"])
    db_pwm_list = rw.read_many_fields(RBPDB_PWMs, "\t")

    for i in db_pwm_list:
        if i[1] in protein_names:
            current_file_name = "{0}/{1}.pwm".format(pwm_dir, i[0])
            current_PWM = rw.read_many_fields(current_file_name, delimiter = " ")
            for j in range(len(current_PWM)):
                current_PWM[j] = [float(k) for k in current_PWM[j] if k != ""]
            consensus = nc.consensus_from_PWM(current_PWM, bases, 0)
            PMID = i[0].split("_")
            PMID = PMID[1]
            new_record = [i[1], "RBPDB_PWM", PMID, "SELEX", consensus]
            db_fields.append(new_record)

    protein_number_after = (len(list(set([i[0] for i in db_fields]))))
    print("After adding additional sequences from SELEX PWMs (RBPDB), there are {0} proteins.\n".format(protein_number_after))

    if species == "Mus musculus":
        RBPmap_proteins = rw.read_many_fields("RBP/RBPmap_proteins.csv", ",")
        RBPmap_proteins = list_to_dict(RBPmap_proteins, 0, 1)
        RNAc_source = [i for i in RBPmap_proteins if "23846655" in RBPmap_proteins[i]]
    else:
        RNAc_source = []

    for file_name in os.listdir(RBPmap_PSSMs):
        #RBPmap and SFmap don't distinguish between human and mouse motifs
        if "human" in file_name:
            file_name_split = file_name.split("_")
            protein_name = file_name_split[0]
            if protein_name not in RNAc_source:
                initial_pssm = rw.read_many_fields(os.path.join(RBPmap_PSSMs, file_name), delimiter = "\t")
                current_pssm = initial_pssm[1:]
                current_pssm = [i[1:] for i in current_pssm]
                for i in range(len(current_pssm)):
                    current_pssm[i] = [float(j) for j in current_pssm[i]]
                consensus = nc.consensus_from_PWM(current_pssm, bases, 0.25, transform = True)
                protein_name = list(protein_name)
                if protein_name[:4] == ["S", "R", "S", "F"]:
                    protein_name[:4] = ["S", "F", "R", "S"]
                protein_name = "".join(protein_name)
                new_record = [protein_name, "RBPmap_PWM", "NULL", "various", consensus]
                db_fields.append(new_record)

    protein_number_after = (len(list(set([i[0] for i in db_fields]))))
    print("After adding additional sequences from RBPmap PSSMs, there are {0} proteins.\n".format(protein_number_after))

    SFmap_data = rw.read_many_fields(SFmap_proteins, delimiter = ",")

    for i in SFmap_data:
        if "," in i[1]:
            temp_split = i[1].split(", ")
            temp_split = [j.upper() for j in temp_split]
            i[1] = ";".join(temp_split)
        else:
            i[1] = i[1].upper()
        new_record = [i[0], "SFmap", "NULL", "various", i[1]]
        db_fields.append(new_record)

    protein_number_after = (len(list(set([i[0] for i in db_fields]))))
    print("After adding motifs from SFmap, there are {0} proteins.\n".format(protein_number_after))

    RNAc = rw.read_many_fields(RNAcompete_information, delimiter = "\t")
    RNAc = [i for i in RNAc[1:] if i]
    if species == "H**o sapiens":
        RNAc = [i for i in RNAc if i[3] != "." and i[8] == "D"]
    if species == "Mus musculus":
        RNAc = [i for i in RNAc if i[3] != "."]

    PSSM_folder = RNAcompete_PWMs
    for record in RNAc:
        motif_name = record[3]
        initial_pssm = rw.read_many_fields(os.path.join(PSSM_folder, "{0}.txt".format(motif_name)), delimiter = "\t")
        if initial_pssm == []:
            if record[19] == "21036867":#RBPDB paper
                pass
            else:
                print(record)
        else:    
            current_pssm = initial_pssm[1:]
            current_pssm = [i[1:] for i in current_pssm]
            for i in range(len(current_pssm)):
                current_pssm[i] = [float(j) for j in current_pssm[i]]
            consensus = nc.consensus_from_PWM(current_pssm, bases, 0.25, transform = True)
            protein_name = record[6]
            new_record = [protein_name, "CIS-BP_RNA_PWM", record[19], record[14], consensus] 
            db_fields.append(new_record)

    protein_number_after = (len(list(set([i[0] for i in db_fields]))))
    print("After adding motifs from CIS-BP RNA, there are {0} proteins.\n".format(protein_number_after))

    to_delete = []
    for pos, i in enumerate(db_fields):
        if ";" in i[4]:
            if "; " in i[4]:
                temp_split = i[4].split("; ")
            else:
                temp_split = i[4].split(";")
            temp_split = [((j.upper()).lstrip("N")).rstrip("N") for j in temp_split]
            temp_split = [j for j in temp_split if len(j) <= upper_threshold and len(j) >= lower_threshold and "(" not in j]
            if temp_split:
                db_fields[pos][4] = temp_split[0]
                for j in temp_split[1:]:
                    db_fields.append([i[0], i[1], i[2], i[3], j])
            else:
                to_delete.append(pos)
        else:
            i[4] = (((i[4]).upper()).rstrip("N")).lstrip("N")
            if len(i[4]) > upper_threshold or len(i[4]) < lower_threshold or "(" in i[4]:
                to_delete.append(pos)
            else:
                db_fields[pos][4] = i[4]

    db_fields = [i for pos, i in enumerate(db_fields) if pos not in to_delete]

    protein_number_after = (len(list(set([i[0] for i in db_fields]))))
    print("After only keeping motifs of length {0}-{1} bp, {2} proteins remain.\n".format(lower_threshold, upper_threshold, protein_number_after))

    protein_names = list(set([i[0] for i in db_fields]))

    if species == "Mus musculus":
        protein_names_file = "RBP/RBP_names_for_checking.txt"
        with open(protein_names_file, "w") as file:
            for name in protein_names:
                file.write("{0}\n".format(name))
        MGI_file = "RBP/MGI_correspondances.txt"
        MGI = rw.read_many_fields(MGI_file, "\t")
        MGI_names_all = [i[0] for i in MGI[1:]]
        found = [i[0] for i in MGI if i[0] == i[3]]
        MGI = {i[0]: i[3] for i in MGI[1:] if i[0] not in found}

    to_delete = []
    for pos, i in enumerate(db_fields):
        if species == "Mus musculus":
            db_fields[pos][0] = "".join([db_fields[pos][0][0].upper(), db_fields[pos][0][1:].lower()])
            #will get rid of Hnrnpcl1, which didn't return anything in the MGI search.
            if db_fields[pos][0] not in MGI_names_all:
                to_delete.append(pos)
            else:
                if db_fields[pos][0] not in found:
                    db_fields[pos][0] = MGI[db_fields[pos][0]]
        elif species == "H**o sapiens":
            if i[0] == "A2BP1" or i[0] == "FOX1":
                db_fields[pos][0] = "RBFOX1"
            elif i[0] == "SFRS13A":
                db_fields[pos][0] = "SRSF10"
            elif i[0][:6] == "BRUNOL":
                db_fields[pos][0] = "CELF{0}".format(i[0][-1])
            elif i[0] == "CUGBP":
                db_fields[pos][0] = "CELF1"
            elif i[0] == "Fusip1":
                db_fields[pos][0] = "SRSF10"
            elif i[0][:4] == "SFRS":
                db_fields[pos][0] = "SRSF{0}".format(i[0][4:])
            elif i[0] == "HuR":
                db_fields[pos][0] = "ELAVL1"
            elif i[0] == "MBNL":
                db_fields[pos][0] = "MBNL1"
            elif i[0] == "PTB":
                db_fields[pos][0] = "PTBP1"
            elif i[0] == "QK1":
                db_fields[pos][0] = "QKI"
            elif i[0] == "RBM9":
                db_fields[pos][0] = "RBFOX2"
            elif i[0] == "STAR-PAP":
                db_fields[pos][0] = "TUT1"
            elif i[0] == "YB-1":
                db_fields[pos][0] = "YBX1"
            elif i[0] == "hnRNPK":
                db_fields[pos][0] = "HNRNPK"
            elif i[0] == "hnRNPLL" or i[0] == "HNRPLL":
                db_fields[pos][0] = "HNRNPLL"

    db_fields = [i for pos, i in enumerate(db_fields) if pos not in to_delete]

    protein_names = list(set([i[0] for i in db_fields]))

    protein_number_after = (len(list(set([i[0] for i in db_fields]))))
    print("After cleaning up protein IDs, {0} proteins remain.\n".format(protein_number_after))
            
    protein_dict = {}
    for i in db_fields:
        if i[0] not in protein_dict.keys():
            protein_dict[i[0]] = [i]
        else:
            protein_dict[i[0]].append(i)

    if species == "H**o sapeins":
        del protein_dict["PPIE"]
        del protein_dict["MIR1236"]
        del protein_dict["PABPC4"]
        print("After removing PPIE, PABPC4 and MIR1236, {0} proteins remain.\n".format(len(protein_dict)))
    elif species == "Mus musculus":
        del protein_dict["Pabpc4"]
        print("After removing Pabpc4, {0} proteins remain.\n".format(len(protein_dict)))

    for i in protein_dict:
        if i == "ELAVL1":
            protein_dict[i].append(['ELAVL1', 'synthetic', 'synthetic', 'synthetic', 'UUWGDUU'])
        elif i == "ELAVL2":
            protein_dict[i].append(['ELAVL2', 'synthetic', 'synthetic', 'synthetic', 'RWUUYAUUUWR'])
        protein_dict[i] = sorted(protein_dict[i], key = lambda x:x[4])
        current_motifs = [j[4] for j in protein_dict[i]]
        to_delete = []
        for j in range(1, len(current_motifs)):
            if current_motifs[j] == current_motifs[j-1]:
                for k in range(1, 4):
                    protein_dict[i][j][k] = ",".join([protein_dict[i][j][k], protein_dict[i][j - 1][k]])
                to_delete.append(j - 1)
        protein_dict[i] = [protein_dict[i][j] for j in range(len(protein_dict[i])) if j not in to_delete]

    for i in protein_dict:
        protein_dict[i] = [[j[0], j[4], j[1], j[2], j[3]] for j in protein_dict[i]] 

    print("\n")
    print("Writing motifs to {0}.\n".format(final_motifs_file_name))

    motif_numbers = []
    with open(final_motifs_file_name, "w") as final_motifs_file:
        for i in sorted(list(protein_dict.keys())):
            final_motifs_file.write(">{0}\n".format(i))
            current_motifs = [j[1] for j in protein_dict[i]]
            DNA_motifs = [nc.DNA_RNA_conversion(j) for j in current_motifs]
            unravelled_motifs = [nc.unravel_consensus(j) for j in DNA_motifs]
            unravelled_motifs = flatten(unravelled_motifs)
            unravelled_motifs = list(set(unravelled_motifs))
            print("Writing {0} motifs for {1}.".format(len(unravelled_motifs), i))
            motif_numbers.append(len(unravelled_motifs))
            unravelled_motifs = "|".join(unravelled_motifs)
            final_motifs_file.write("{0}\n".format(unravelled_motifs))

    plt.figure(1)
    plotting.histogram(motif_numbers, 50, x_lab = "Motif number", y_lab = "Frequency", title = None)
    plotting.save_and_show([10, 10], 100, plot_name)
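nc.consensus_from_PWM is called repeatedly above but is defined elsewhere in the repository. A minimal sketch of the general idea, keeping every base whose frequency clears the threshold at each position and collapsing the kept set into an IUPAC code (the function name, the IUPAC collapse and the example PWM are illustrative assumptions):

import numpy as np

#IUPAC degeneracy codes for RNA bases (assumed convention for this sketch)
IUPAC = {frozenset("A"): "A", frozenset("C"): "C", frozenset("G"): "G", frozenset("U"): "U",
         frozenset("AG"): "R", frozenset("CU"): "Y", frozenset("GU"): "K", frozenset("AC"): "M",
         frozenset("CG"): "S", frozenset("AU"): "W", frozenset("CGU"): "B", frozenset("AGU"): "D",
         frozenset("ACU"): "H", frozenset("ACG"): "V", frozenset("ACGU"): "N"}

def sketch_consensus_from_PWM(PWM, bases, threshold):
    #hypothetical sketch: at each PWM position, keep the bases whose frequency
    #exceeds the threshold and collapse the kept set into a single IUPAC symbol
    #(positions where no base clears the threshold are not handled here)
    consensus = []
    for row in PWM:
        kept = frozenset(bases[np.array(row) > threshold])
        consensus.append(IUPAC[kept])
    return "".join(consensus)

#e.g. only A passes at the first position, A and G both pass at the second
print(sketch_consensus_from_PWM([[0.9, 0.05, 0.05, 0.0], [0.5, 0.0, 0.5, 0.0]],
                                 np.array(["A", "C", "G", "U"]), 0.25))
#prints AR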
Code Example #23
File: mnase_bias.py Project: rosinaSav/dNETseq_code
def main():
    description = "Generate a NET-seq control set that would have the same distribution of -2:2 nucleotides" \
                  "as the true set."
    args = hk.parse_arguments(description, [
        "active_genes_file", "gtf", "PolII_file", "fasta", "outfile",
        "chrom_sizes"
    ])
    active_genes_file, gtf, PolII_file, fasta, outfile, chrom_sizes = args.active_genes_file, args.gtf, args.PolII_file, args.fasta, args.outfile, args.chrom_sizes

    chrom_sizes = rw.read_many_fields(chrom_sizes, delimiter="\t")
    chrom_sizes = hk.list_to_dict(chrom_sizes, 0, 1, intify=True)

    # get transcriptionally active genes and make a BED file with their coordinates
    print("Getting the coordinates of transcriptionally active genes...")
    trans_active_genes = rw.read_many_fields(active_genes_file, "\t")[1:]
    trans_active_genes = [i[3] for i in trans_active_genes]
    transcripts_file = "{0}_transcripts_all.bed".format(gtf[:-4])
    co.get_transcripts(gtf, transcripts_file, add_chr=True)

    transcripts_dict = {}
    # this will be used for getting the k-mers in the transcripts
    filtered_transcripts_file_plus2 = "{0}_trans_act_only_plus3.bed".format(
        transcripts_file[:-4])
    # this will be used for filtering the reads
    filtered_transcripts_file = "{0}_trans_act_only.bed".format(
        transcripts_file[:-4])
    with open(filtered_transcripts_file,
              "w") as ft_file, open(transcripts_file) as t_file, open(
                  filtered_transcripts_file_plus2, "w") as ft_file2:
        reader = csv.reader(t_file, delimiter="\t")
        writer = csv.writer(ft_file, delimiter="\t")
        writer2 = csv.writer(ft_file2, delimiter="\t")
        for line in reader:
            if line[3] in trans_active_genes:
                # if line[0][0] not in ["G", "K"]:
                #     line[0] = "chr{0}".format(line[0])
                writer.writerow(line)
                # this is because if a read falls at the first position, you will need to know the
                # preceding three bases (hence the coordinates are extended by 3). Same if it falls at the last position.
                line[1] = str((int(line[1])) - 3)
                line[2] = str((int(line[2])) + 3)
                writer2.writerow(line)
                transcripts_dict[line[3]] = line

    print("Filtering reads to the transcripts...")
    # filter reads to only ones that overlap these transcripts
    transcripts_PolII = "{0}_transcripts.bed".format(PolII_file[:-4])
    co.intersect_bed(PolII_file,
                     filtered_transcripts_file,
                     force_strand=True,
                     output_file=transcripts_PolII)

    print("Extracting FASTA from the transcript coordinates...")
    # the genome FASTA is formatted as N rather than chrN
    filtered_transcripts_file_no_chr = "{0}_trans_act_only_plus3_no_chr.bed".format(
        transcripts_file[:-4])
    hk.run_process(["sed", "s/^chr//", filtered_transcripts_file_plus2],
                   file_for_output=filtered_transcripts_file_no_chr)
    filtered_transcripts_fasta_no_chr = "{0}_trans_act_only_plus3.fasta".format(
        transcripts_file[:-4])
    hk.run_process([
        "bedtools", "getfasta", "-fi", fasta, "-bed",
        filtered_transcripts_file_no_chr, "-fo",
        filtered_transcripts_fasta_no_chr, "-s", "-name"
    ])

    print("Mapping kmers to transcript positions...")
    kmer_dict = map_kmers_to_positions(filtered_transcripts_fasta_no_chr,
                                       k=6,
                                       focal_pos=3)

    print("Extracting the starting dinucleotide for each read...")
    starting_dints_PolII = "{0}_transcripts_starting_6mers.bed".format(
        PolII_file[:-4])
    starting_dints_PolII_fasta = "{0}_transcripts_starting_6mers.fasta".format(
        PolII_file[:-4])
    co.extend_intervals(transcripts_PolII,
                        starting_dints_PolII,
                        3,
                        3,
                        remove_chr=True)
    hk.run_process([
        "bedtools", "getfasta", "-fi", fasta, "-bed", starting_dints_PolII,
        "-fo", starting_dints_PolII_fasta, "-s"
    ])

    print("Picking random control positions...")
    pick_random_positions(transcripts_PolII,
                          starting_dints_PolII_fasta,
                          outfile,
                          kmer_dict,
                          transcripts_dict,
                          chrom_sizes=chrom_sizes)

    print("Making single nucleotide resolution file...")
    snr_file = "{0}_snr.bed".format(outfile[:-4])
    co.snr_bed(outfile, snr_file)

    print(
        "Removing reads that overlap potential splice intermediate positions..."
    )
    no_si_snr_file = "{0}_snr_no_si.bed".format(outfile[:-4])
    co.intersect_bed(snr_file,
                     "data/Genomes/GTFs/dm6/dmel-all-r6.18_exon_ends_chr.gtf",
                     force_strand=True,
                     exclude=True,
                     no_dups=False,
                     output_file=no_si_snr_file)
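map_kmers_to_positions is imported from elsewhere in the repository; roughly, it needs to index every k-mer in the transcript sequences by the coordinate of its focal base so that control positions with a matching k-mer can be drawn later. A simplified sketch operating on already-parsed names and sequences (the real function takes a FASTA path; the names here are assumptions):

def sketch_map_kmers_to_positions(names, seqs, k=6, focal_pos=3):
    #hypothetical sketch: for each k-mer, record every (sequence name, position)
    #pair where that k-mer occurs, keyed by the coordinate of the focal base
    kmer_dict = {}
    for name, seq in zip(names, seqs):
        for start in range(len(seq) - k + 1):
            kmer = seq[start: start + k]
            kmer_dict.setdefault(kmer, []).append((name, start + focal_pos))
    return kmer_dict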
Code Example #24
def main():
    description = "Prepare input file for running MultiDFEest."
    args = parse_arguments(description, [
        "hit_file", "control_file", "SNPs_file_prefix", "N", "output_file",
        "per_chrom_files", "shuffle"
    ],
                           ints=[3],
                           flags=[5, 6])
    hit_file, control_file, SNPs_file_prefix, N, output_file, per_chrom_files, shuffle = args.hit_file, args.control_file, args.SNPs_file_prefix, args.N, args.output_file, args.per_chrom_files, args.shuffle

    hits = parse_pos(hit_file)
    controls = parse_pos(control_file)

    if shuffle:
        hits, controls = shuffle_dictionaries(hits, controls)

    SNPs = {}
    to_remove_all = {}
    #if the data is stored chromosome by chromosome, rather than all combined
    if per_chrom_files:
        for chrom in range(1, 23):
            try:
                SNPs_file = "{0}{1}.bed".format(SNPs_file_prefix, str(chrom))
                current_SNPs = rw.read_many_fields(SNPs_file, "\t")
                to_remove = list_to_dict(current_SNPs, 0, 2)
                to_remove = {i: to_remove[i].split(",") for i in to_remove}
                current_SNPs = list_to_dict(current_SNPs, 0, 1)
                for trans in current_SNPs:
                    if trans in controls:
                        SNPs[trans] = {}
                        trans_SNPs = current_SNPs[trans]
                        if trans_SNPs:
                            trans_SNPs = [
                                i.split(",") for i in trans_SNPs.split("|")
                            ]
                            #this is where you get the allele count
                            trans_SNPs = list_to_dict(trans_SNPs, 0, 3)
                            trans_SNPs = {
                                int(i): int(trans_SNPs[i])
                                for i in trans_SNPs
                            }
                            SNPs[trans] = trans_SNPs
                        to_remove_all[trans] = [
                            int(i) for i in to_remove[trans]
                            if i not in ["error", ""]
                        ]
            except FileNotFoundError:
                pass
    else:
        SNPs_file = SNPs_file_prefix
        current_SNPs = rw.read_many_fields(SNPs_file, "\t")
        to_remove = list_to_dict(current_SNPs, 0, 2)
        to_remove = {i: to_remove[i].split(",") for i in to_remove}
        current_SNPs = list_to_dict(current_SNPs, 0, 1)
        counter = 0
        for trans in current_SNPs:
            if trans in controls:
                SNPs[trans] = {}
                trans_SNPs = current_SNPs[trans]
                if trans_SNPs:
                    trans_SNPs = [i.split(",") for i in trans_SNPs.split("|")]
                    #this is where you get the allele count
                    trans_SNPs = list_to_dict(trans_SNPs, 0, 3)
                    trans_SNPs = {
                        int(i): int(trans_SNPs[i])
                        for i in trans_SNPs
                    }
                    SNPs[trans] = trans_SNPs
                to_remove_all[trans] = [
                    int(i) for i in to_remove[trans] if i not in ["error", ""]
                ]

    hit_SFS = get_SFS(hits, SNPs, to_remove_all, N)
    control_SFS = get_SFS(controls, SNPs, to_remove_all, N)

    with open(output_file, "w") as file:
        file.write("{0}\n".format(N))
        file.write(" ".join([str(i) for i in hit_SFS]))
        file.write("\n")
        file.write(" ".join([str(i) for i in control_SFS]))
        file.write("\n")
Code Example #25
def main():
    description = "Aggregate various statistics on the splicing events you're studying."
    args = hk.parse_arguments(description, [
        "gtf", "polII_bed", "exon_start_coords", "truncated_exons_file",
        "genome_file", "output_file"
    ])
    gtf, polII_bed, exon_start_coords, truncated_exons_file, genome_file, output_file = args.gtf, args.polII_bed, args.exon_start_coords, args.truncated_exons_file, args.genome_file, args.output_file

    CDSs = rw.read_gtf(gtf, "CDS", gene=False)
    exons = rw.read_gtf(gtf, "exon", gene=False)
    exon_starts = rw.read_many_fields(exon_start_coords,
                                      skip_header=False,
                                      delimiter="\t")
    exon_starts = {i[3]: i for i in exon_starts}
    out_array = np.array(sorted(exon_starts.keys()), dtype="str")
    out_array.shape = (len(exon_starts.keys()), 1)
    out_array = np.vstack((["junction"], out_array))

    #1. exon size
    curr_dict = co.get_lengths(CDSs, exon_starts.keys())
    out_array = add_to_array(out_array, curr_dict, "exon_size")
    print("Exon size done.")

    #2. exon number
    curr_dict = co.get_exon_number(exons, exon_starts.keys())
    out_array = add_to_array(out_array, curr_dict, "exon_number")
    print("Exon number done.")

    #3. exon rank (from start and end)
    exon_rank_start, exon_rank_end = co.get_exon_rank(exons, exon_starts)
    out_array = add_to_array(out_array, exon_rank_start,
                             "exon_rank_from_start")
    out_array = add_to_array(out_array, exon_rank_end, "exon_rank_from_end")
    print("Exon rank done.")

    #4. upstream intron size
    curr_dict = co.get_upstream_intron_size(exons, exon_rank_start)
    out_array = add_to_array(out_array, curr_dict, "upstream_intron_size")
    curr_dict = co.get_upstream_intron_size(exons,
                                            exon_rank_start,
                                            downstream=True)
    out_array = add_to_array(out_array, curr_dict, "downstream_intron_size")
    print("Intron size done.")

    if truncated_exons_file != "None":

        #5. Pol II density per transcript
        dens_per_trans_file = "{0}_dens_per_trans.txt".format(polII_bed[:-4])
        dens_per_trans_junctions = get_dens_per_trans(truncated_exons_file,
                                                      polII_bed,
                                                      dens_per_trans_file,
                                                      out_array[1:, 0])
        out_array = add_to_array(out_array, dens_per_trans_junctions,
                                 "polII_dens_per_trans")
        print("Pol II density done.")

    #6. exon GC4 and GC content
    genome = Fasta(genome_file)
    curr_dict = get_exon_GC4(CDSs, exons, exon_rank_start, genome)
    out_array = add_to_array(out_array, curr_dict, "exon_GC4")
    curr_dict = get_exon_GC(exons, exon_rank_start, genome)
    out_array = add_to_array(out_array, curr_dict, "exon_GC")
    print("Exon GC done.")

    #7. upstream intron GC content
    curr_dict = get_upstream_intron_GC(exons, exon_rank_start, genome)
    out_array = add_to_array(out_array, curr_dict, "upstream_intron_GC")
    print("Intron GC done.")

    #8. splice site strength
    curr_dict = nc.get_ss_strength(exons,
                                   genome_file,
                                   upstream=True,
                                   five=True,
                                   exonic=3,
                                   intronic=6)
    out_array = add_to_array(out_array, curr_dict, "upstream_5ss_strength")
    curr_dict = nc.get_ss_strength(exons,
                                   genome_file,
                                   upstream=True,
                                   five=False,
                                   exonic=3,
                                   intronic=20)
    out_array = add_to_array(out_array, curr_dict, "upstream_3ss_strength")
    curr_dict = nc.get_ss_strength(exons,
                                   genome_file,
                                   upstream=False,
                                   five=True,
                                   exonic=3,
                                   intronic=6)
    out_array = add_to_array(out_array, curr_dict, "downstream_5ss_strength")
    print("Splice site strength done.")

    with open(output_file, "w") as file:
        for line in range(0, out_array.shape[0]):
            line = out_array[line, :]
            line = "\t".join([str(i) for i in line])
            file.write(line)
            file.write("\n")
Code Example #26
def main():  
    parser = argparse.ArgumentParser(description="Prepare a clean dataset of protein-coding genes.")
    parser.add_argument("features_file_name", type = str, help = "name of GTF file with genome features")
    parser.add_argument("ortholog_features_file_name", type = str, help = "name of GTF file with genome features for the orthologous genome")
    parser.add_argument("genome", type = str, help = "genome assembly name")
    parser.add_argument("ortholog_genome", type = str, help = "ortholog genome assembly name")
    parser.add_argument("dataset_name", type = str, help = "dataset name")
    parser.add_argument("ortholog_dataset_name", type = str, help = "ortholog dataset name")
    parser.add_argument("orthologs_file_name", type = str, help = "csv with orthologous pairs")
    parser.add_argument("dS_threshold", type = float, help = "csv with orthologus pair")
    parser.add_argument("alignment_folder", type = str, help = "folder where phy alignment files will be stored")
    parser.add_argument("raw_orth_seq_file", type = str, help = "file with the raw ortholog CDS sequences (downloaded via ensembl biomart)")

    args = parser.parse_args()
    [features_file_name, ortholog_features_file_name, genome, ortholog_genome, dataset_name, ortholog_dataset_name, orthologs_file_name, dS_threshold, alignment_folder, raw_orth_seq_file] = [args.features_file_name,
                                                                                                                                                          args.ortholog_features_file_name, args.genome, args.ortholog_genome,
                                                                                                                                                                                               args.dataset_name,
                                                                                                                                                                                               args.ortholog_dataset_name,
                                                                                                                                                                                               args.orthologs_file_name,
                                                                                                                                                                                               args.dS_threshold,
                                                                                                                                                                                               args.alignment_folder,
                                                                                                                                                                                               args.raw_orth_seq_file]
    make_dir(alignment_folder)
    trans_id_pattern = re.compile(r"ENS\w*T\d*")
    ids_to_keep = []
    #loop over an ensembl GTF file
    with open(features_file_name) as features_file:
        #skip the metadata
        for i in range(5):
            features_file.readline()
            
        for i in features_file:
            #only consider features that have been localized to chromosomes and that are from protein-coding genes
            if "PATCH" not in i and "gene_biotype \"protein_coding\"" in i and i[0] in "123456789XY" and i[1] in "0123456789XY\t":
                trans_id_obj = re.search(trans_id_pattern, i)
                if trans_id_obj:
                    trans_id = trans_id_obj.group(0)
                    #store the transcript ID
                    ids_to_keep.append(trans_id)

    #make a list of the unique transcript IDs you got in the previous step
    ids_to_keep = list(set(ids_to_keep))

    #create a feature set object from the transcript IDs,
    #that is to say, make a file that has all the associated gene feature annotations
    fs = Feature_Set(features_file_name, genome)
    #the dataset only needs to be created if it didn't exist previously
##    fs.create_dataset(dataset_name, input_list = ids_to_keep)
    fs.set_dataset(dataset_name)
    print("Created dataset with {0} transcripts.".format(len(fs.names)))
    #this file will have the mappings between genes from the focal species and genes from the orthologous species
    final_pairs_file_name = "general/{0}_{1}_pc_pairs.csv".format(genome, ortholog_genome)

    CDS = fs.get_CDS()
    CDS = {i: CDS[i] for i in CDS if CDS[i]}
    #write the full ORF sequences of the genes to FASTA, filtering based on reading frame integrity. Also check that
    #there are no premature termination codons.
    fs.write_full_CDS(CDS, check_ORF = True, bare_name = True, PTC_check = True)

    ids_to_keep = rw.read_fasta("{0}_{1}_full_CDS.fasta".format(fs.features_file_name[:-4], fs.dataset))[0]

    print("{0} transcripts pass the check for ORF integrity.".format(len(ids_to_keep)))

    transcripts = fs.get_transcripts()
    transcripts = {i: transcripts[i] for i in ids_to_keep}

    #for genes with several associated transcript IDs, only keep the longest.   
    gene_name_dict = fs.get_gene_name_dict(transcripts)
    ids_to_keep = []
    for gene in gene_name_dict:
        current_CDS = [CDS[j] for j in gene_name_dict[gene]]
        current_lengths = [sum([j[0][3] - j[0][2] + 1 for j in k]) for k in current_CDS]
        id_to_keep = gene_name_dict[gene][current_lengths.index(max(current_lengths))]
        ids_to_keep.append(id_to_keep)

    print("After only keeping one transcript per gene (the longest), {0} transcripts remain.".format(len(ids_to_keep)))

    #this is a file that has the orthologs of your genes from Ensembl biomart
    orth_data = rw.read_many_fields(orthologs_file_name, ",")
    #make a dictionary for the gene-to-ortholog mapping
    pairs_dict = {}

    for line in orth_data:
        if line[1] not in pairs_dict:
            pairs_dict[line[1]] = []
        pairs_dict[line[1]].append(line[2])

    #only keep genes for which there is an ortholog in the comparator species
    #transcript identifiers
    ids_to_keep = [i for i in ids_to_keep if i in pairs_dict]

    #gene identifiers
    orth_ids_to_keep = list(pairs_dict.values())
    orth_ids_to_keep = list(set(flatten(orth_ids_to_keep)))

    #create a feature set for the other species based on the genes that are orthologous to the genes in your focal set
    orth_fs = Feature_Set(ortholog_features_file_name, ortholog_genome)
##    orth_fs.create_dataset(ortholog_dataset_name, input_list = orth_ids_to_keep, input_type = "gene")
    orth_fs.set_dataset(ortholog_dataset_name)
    orth_CDS = orth_fs.get_CDS()
    orth_CDS = {i: orth_CDS[i] for i in orth_CDS if orth_CDS[i]}
    #write the ortholog ORFs to FASTA. Filter based on reading frame integrity and PTC content.
    orth_fs.write_full_CDS(orth_CDS, check_ORF = True, bare_name = True, PTC_check = True)
    orth_full_CDS_file = "{0}_{1}_full_CDS.fasta".format(ortholog_features_file_name[:-4], ortholog_dataset_name)

    #in some cases, if the genome assembly for the ortholog is not very good, it can take forever to get the sequences using faidx.
    #In that case, you can get the sequences via biomart. Uncomment the code below!
##    rw.write_names(list(orth_CDS.keys()), "general/{0}_trans_IDs.txt".format(ortholog_dataset_name))
##    with open(raw_orth_seq_file) as file:
##        raw_orth_seq = "".join(file)
##    raw_orth_seq = re.sub("([A-Z])\n([A-Z])", "\\1\\2", raw_orth_seq)
##    raw_orth_seq = raw_orth_seq.split("\n")
##    raw_orth_seq = [i for i in raw_orth_seq if len(i) > 0]
##    raw_orth_names = [i for i in raw_orth_seq if i[0] == ">"]
##    raw_orth_seq = [i for i in raw_orth_seq if i[0] != ">"]

##    with open(orth_full_CDS_file, "w") as file:
##        for pos, seq in enumerate(raw_orth_seq):
##            ORF_check = check_ORF_integrity(seq, PTC_check = True)
##            if ORF_check[0]:
##                file.write("{0}\n".format(raw_orth_names[pos]))
##                file.write("{0}\n".format(seq))
##            else:
##                print(pos)
##                print(ORF_check[1])
##                print(raw_orth_names[pos])
##                print(seq)
##                print("\n")            

    #read in the full ORF sequences from both species
    CDS_names, CDS_seq = rw.read_fasta("{0}_{1}_full_CDS.fasta".format(fs.features_file_name[:-4], fs.dataset))
    orth_CDS_names, orth_CDS_seq = rw.read_fasta(orth_full_CDS_file)

    orth_transcripts = orth_fs.get_transcripts()

    orth_gene_name_dict = orth_fs.get_gene_name_dict(orth_transcripts)

    final_pairs = {}
    counter = 0
    #loop over the remaining genes
    for i in ids_to_keep:
        if counter%1000 == 0:
            print(counter)
        counter = counter + 1
        #get the IDs of the orthologous genes in the ortholog species
        orth_ids = pairs_dict[i]
        #get all the associated transcript identifiers
        orth_ids_trans = [[orth_gene_name_dict[j][k] for k in range(len(orth_gene_name_dict[j]))] for j in orth_ids if j in orth_gene_name_dict]
        orth_ids_trans = flatten(orth_ids_trans)
        CDS = CDS_seq[CDS_names.index(i)]
        orth_CDS = []
        ids_to_remove = []
        #get all the ortholog ORF sequences
        for j in orth_ids_trans:
            try:
                current_CDS = orth_CDS_seq[orth_CDS_names.index(j)]
                orth_CDS.append(current_CDS)
            #this is because some of the transcripts produced from the gene might be non-coding or have a wonky ORF and therefore not appear in the CDS fasta
            except ValueError:
                ids_to_remove.append(j)
        orth_ids_trans = [j for j in orth_ids_trans if j not in ids_to_remove]
        #check that the sequence from the focal species aligns to an ortholog with dN/dS below 0.5 and dS below the specified threshold
        if orth_ids_trans:
            conservation_check = keep_conserved_pc(i, orth_ids_trans, CDS, orth_CDS, dS_threshold, alignment_folder)
            if conservation_check[0]:
                #also store which ortholog transcript gave the lowest dS in the alignment
                final_pairs[i] = conservation_check[1]
            
    print("After filtering by conservation, {0} transcripts remain.".format(len(list(final_pairs.values()))))
    #write the final retained ortholog gene pairs to file
    with open(final_pairs_file_name, "w") as file:
        output_writer = csv.writer(file, delimiter = ",")
        for i in final_pairs:
            output_writer.writerow([i, final_pairs[i]])

    print("Wrote ortholog pairs to {0}.".format(final_pairs_file_name))

    #write the remaining ORF sequences to fasta
    CDS_seq = [i for pos, i in enumerate(CDS_seq) if CDS_names[pos] in final_pairs]
    CDS_names = [i for i in CDS_names if i in final_pairs]
    rw.write_to_fasta(CDS_names, CDS_seq, "general/filtered_{0}_wo_low_omega.fasta".format(dataset_name))

    #create a feature set with the remaining genes
    filtered_fs = Feature_Set(features_file_name, genome)
    filtered_fs.create_dataset("filtered_{0}".format(dataset_name), input_list = list(final_pairs.keys()))
    print("All done.")
Code example #27
0
def main():
    description = "Record splicing distance."
    args = hk.parse_arguments(description, ["input_file", "gtf", "output_folder", "trans_active_file", "window_size", "intron_window_size", "outsuffix", "leave_terminal"], ints = [4, 5], flags = [7])
    input_file, gtf, output_folder, trans_active_file, window_size, intron_window_size, outsuffix, leave_terminal = args.input_file, args.gtf, args.output_folder, args.trans_active_file, args.window_size, args.intron_window_size, args.outsuffix, args.leave_terminal

    if outsuffix == "None":
        outsuffix = ""

    bare_input_path = input_file.split("/")[-1]
    bed = "{0}.bed".format(input_file[:-4])
    # hk.convert2bed(input_file, bed)
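    # (uncomment the line above if the input file has not yet been converted to BED format)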

    # get descriptive stats of the reads
    length_file = "{0}/{1}_read_lengths.txt".format(output_folder, bare_input_path[:-4])
    write_read_lengths(bed, length_file)

    # read in CDS coordinates
    exons = rw.read_gtf(gtf, "CDS", gene=False)
    # only leave transcriptionally active genes (one isoform per gene)
    trans_active_genes = rw.read_many_fields(trans_active_file, "\t")[1:]
    # pull out the column with transcript IDs
    trans_active_genes = [i[3] for i in trans_active_genes]
    exons = {i: exons[i] for i in exons if i in trans_active_genes}
    terminal_suff = "_with_terminal"
    if not leave_terminal:
        # remove last exons
        exons = {i: exons[i][:-1] for i in exons}
        terminal_suff = ""
    # prepare exon-exon junctions
    exon_junctions_file = "{0}_exon_junctions{1}{2}.bed".format(gtf[:-4], outsuffix, terminal_suff)
    all_junctions = co.extract_3ss(exons, exon_junctions_file)

    out_bed = "{0}/{1}_first_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed, exons, window_size, add_chr=True)
    out_bed_end = "{0}/{1}_last_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed_end, exons, window_size, add_chr=True, from_end=True)
    intron_bed = "{0}/{1}_first_{2}_intronic_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], intron_window_size, outsuffix, terminal_suff)
    write_intron_starts(all_junctions, intron_bed, exons, intron_window_size, add_chr=True)
    out_bed = "{0}/{1}_first_centred_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed, exons, window_size, add_chr=True, centre=True)
    out_bed_end = "{0}/{1}_last_centred_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed_end, exons, window_size, add_chr=True, from_end=True, centre=True)
    out_bed_si = "{0}/{1}_si_pos{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)
    write_si_pos(all_junctions, out_bed_si, exons, add_chr=True)
    out_bed_si_current = "{0}/{1}_si_pos_current{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)
    write_si_pos(all_junctions, out_bed_si_current, exons, add_chr=True, curr_exon=True)
    # check which junctions are associated with a splicing intermediate read
    snr_bed = "{0}_snr.bed".format(bed[:-4])
    co.snr_bed(bed, snr_bed)
    si_counts_bed = "{0}/{1}_si_counts{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)
    co.intersect_bed(out_bed_si, snr_bed, force_strand=True, hit_count=True, no_dups=False, output_file=si_counts_bed)
    si_counts_current_bed = "{0}/{1}_si_counts_current{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)
    co.intersect_bed(out_bed_si_current, snr_bed, force_strand=True, hit_count=True, no_dups=False, output_file=si_counts_current_bed)

    # filter out reads that don't overlap exon-exon junctions
    exon_junction_bed = "{0}_exon_junctions{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff)
    co.intersect_bed(bed, exon_junctions_file, write_both=True,
                     output_file=exon_junction_bed,
                     force_strand=True, no_dups=False)

    spliced_bed = "{0}_spliced{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff)
    unspliced_bed = "{0}_unspliced{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff)
    sr_distances = {}
    ur_distances = {}
    found_count = 0
    file_size = hk.line_count(exon_junction_bed)

    # will store all the intron names for which there are
    # either spliced or unspliced reads
    valid_junctions = []
    with open(exon_junction_bed) as file, open(spliced_bed, "w") as sfile, open(unspliced_bed, "w") as ufile:
        for pos, line in enumerate(file):

            if pos % 100000 == 0:
                print("{0}/{1}".format(pos, file_size))
                print("Found {0} spliced reads.".format(found_count))
                print("\n")

            line = line.split("\t")

            # reads that end at the last nucleotide of an exon
            intermediate_read = NGS.check_intermediate_read(line, exons)
            intron_name = line[20]

            if not intermediate_read:

                # check that it ends within the exon just downstream of
                # the 3' ss that is being analyzed

                in_dwns_exon = NGS.check_position_in_exon(line, exons)

                if in_dwns_exon:

                    # classify the read as spliced ("S"), unspliced, or None (= can't be analyzed)
                    read_type = NGS.analyze_cigar(line, overhang = 5)

                    if read_type:
                        if intron_name not in valid_junctions:
                            valid_junctions.append(intron_name)
                        splice_dist = NGS.get_splice_dist(line)
                        if read_type == "S":
                            sfile.write("\t".join([str(i) for i in line]))
                            found_count = found_count + 1
                            sr_distances = update_dist_dict(intron_name, sr_distances, splice_dist)
                        else:
                            ufile.write("\t".join([str(i) for i in line]))
                            ur_distances = update_dist_dict(intron_name, ur_distances, splice_dist)

    print("Proportion of spliced reads: {0}.".format(found_count/(pos + 1)))

    # for each valid junction, calculate the length of the downstream exonic sequence,
    # so that intronic sequence is not included in the distance matrix
    lengths_dict = co.get_lengths(exons, valid_junctions)

    write_dist_mat(sr_distances, window_size,
                   "{0}/{1}_spliced_read_distances_{2}{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff),
                   lengths_dict,
                   "{0}/{1}_spliced_read_{2}_intron_names{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff),
                   "{0}/{1}_spliced_read_first_spliced{2}{3}.txt".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff))

    write_dist_mat(ur_distances, window_size,
                   "{0}/{1}_unspliced_read_distances_{2}{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff),
                   lengths_dict,
                   "{0}/{1}_unspliced_read_{2}_intron_names{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff),
                   "{0}/{1}_unspliced_read_first_unspliced{2}{3}.txt".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff))
Code example #28
0
def mDFEest(model, input_file, n_spikes = None, repetitions = None, fold_SFS = True, pop_change = False, seed = None):
    '''
    Wraps call to multiDFEest.
    '''
    flags = []

    if fold_SFS:
        fold_SFS = 1
    else:
        fold_SFS = 0
    #this inversion looks odd but is intentional: the value is passed as -conpop to MultiDFE,
    #where 1 means a constant population size (i.e. no population size change)
    if pop_change:
        pop_change = 0
    else:
        pop_change = 1
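    #e.g. pop_change = False -> conpop = 1 (constant population size); pop_change = True -> conpop = 0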

    #convert the English distribution names into multiDFEest model codes
    if model == "lognormal":
        model_code = 4
        #parameter number for calculating AIC
        par_number = 2
    elif model == "gamma":
        model_code = 2
        par_number = 2
    elif model == "beta":
        model_code = 3
        par_number = 2
    elif model == "spikes":
        model_code = 0
        if not n_spikes:
            print("To be able to use a spikes model, you need to specify the number of spikes.")
            raise Exception
        par_number = (2 * n_spikes) - 1
        flags = ["-ranrep", repetitions, "-nspikes", n_spikes]
    elif model == "steps":
        model_code = 1
        if not n_spikes:
            print("To be able to use a steps model, you need to specify the number of steps.")
            raise Exception
        par_number = (2 * n_spikes) - 1
        flags = ["-ranrep", repetitions, "-nspikes", n_spikes]
    elif model == "six_spikes":
        model_code = 5
        par_number = 5
        flags = ["-ranrep", repetitions]
    else:
        print("{0} is not a valid model name!".format(model))
        raise Exception

    input_file_short = input_file.split("/")[-1]

    #do the analysis in the directory where multiDFEest is stored
    if not os.path.exists("../multidfe/{0}".format(input_file_short)):
        run_process(["cp", input_file, "../multidfe"])
    MDE_output = "{0}.MAXL.out".format(input_file_short)
    current_dir = os.getcwd()
    os.chdir("../multidfe")
    arguments = ["./MultiDFE", "-N1", 100, "-conpop", pop_change, "-sfsfold", fold_SFS, "-selmode", model_code, "-file", input_file_short]
    if seed:
        seed_string = "GSL_RNG_SEED={0}".format(seed)
        arguments = [seed_string] + arguments
    arguments.extend(flags)
    print(" ".join([str(i) for i in arguments]))
    #run multiDFEest
    run_process(arguments)
    #parse output
    output = rw.read_many_fields(MDE_output, "\t")[0]
    output = [i.split(":") for i in output if ":" in i]
    output = {i[0]: float(i[1]) for i in output}
    #get the log likelihood and calculate AIC
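    #AIC = 2k - 2*ln(L), where k is the number of free parameters (par_number) and ln(L) the maximum log likelihood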
    ll = output["L"]
    print("\n")
    print(par_number)
    print(ll)
    AIC = (2 * par_number) - (2 * ll)
    output["AIC"] = AIC
    if n_spikes:
        output["model"] = "{0}_{1}".format(model, n_spikes)
    else:
        output["model"] = model
    remove_file(MDE_output)
    os.chdir(current_dir)
    return(output)
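
#illustrative call (the SFS file path is a placeholder; multiDFEest is assumed to be installed in ../multidfe, as required above):
#result = mDFEest("lognormal", "DFE/example_SFS.txt", pop_change = True, seed = 42)
#print(result["model"], result["AIC"])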
Code example #29
0
def get_CpG_dicts(CDSs, chroms, MSA_file_name_prefix, lengths, clean_names, phylip_data, fasta, anc_CG_file_name, high_CG_file_name, fs, macaque_anc = False, pseudoCG = False, comprehensive = False, subst_model = None, return_tuples = False, regions = False):
    '''
    Get two dictionaries: one that says, for each transcript, which positions are CpG/GpC in macaque,
    and one that says which positions were likely CpG/GpC in the human-macaque ancestor.
    '''
    names, seqs = rw.read_fasta(fasta)
    #the first two branches determine ancestral CpG positions from scratch rather than reading them in from an existing file
    #if no file name is given, a temporary file name is generated automatically
    if (not anc_CG_file_name) or (anc_CG_file_name == "None"):
        new_CG = True
        phy_file = "temp_data/temp_anc_CG{0}.txt".format(random.random())
    #if you want to give the file a name yourself
    elif not os.path.exists(anc_CG_file_name):
        new_CG = True
    else:
        new_CG = False

    if new_CG:
        print("Will get new CpG data...")
        if len(phylip_data) < 8 and comprehensive:
            print("Comprehensive CpG filtering only in big tree mode!")
            raise Exception
        #if you want to pretend that some other dinucleotide is CpG
        #(the hyphens allow for the two nucleotides being separated by an indel in the alignment)
        if pseudoCG:
            CG_kmers = ["C[\-]*T", "A[\-]*G"]
        else:
            CG_kmers = ["C[\-]*G", "G[\-]*C"]
        CG_kmers = [re.compile(i) for i in CG_kmers]
        macaque_CG_dict = {}

        anc_CG_concat_full = [[[""]], [[""]]]
        tuples_mapping_dict_full = {}

        for chrom in chroms:

            print(chrom)

            #only leave those CDSs that are on the current chromosome
            current_CDSs = {i: CDSs[i] for i in CDSs if CDSs[i][0][0][0] == chrom}
            coords_file = "temp_data/coords_file{0}.txt".format(random.random())

            #check if the MSA is already at the specified location, otherwise retrieve it
            MSA_file = "{0}_{1}.txt".format(MSA_file_name_prefix, chrom)
            if not os.path.isfile(MSA_file):
                print("Obtaining MSA...")
                eo.get_MSA_gene_list(current_CDSs, coords_file, "EPO", "primates", 85, "homo_sapiens", MSA_file)
                os.remove(coords_file)
                eo.flush_tables("localhost", "mysql", "fackel")
            MSA_raw = eo.parse_MSA_output(MSA_file)
            if high_CG_file_name != "None":
                high_CG = rw.read_many_fields(high_CG_file_name, "\t")
                high_CG = {i[0]: [int(j) for j in i[1:]] for i in high_CG}
            else:
                high_CG = None
            #get concatenated sequences (for determining ancestral CpG positions) and macaque CpG information for this chromosome
            anc_CG_concat, macaque_CG_dict, tuples_mapping_dict = get_CpG_dicts_core(MSA_raw, lengths, phylip_data, CG_kmers, macaque_anc, macaque_CG_dict, high_CG, comprehensive = comprehensive, subst_model = subst_model)
            remove_file(coords_file)
            #add that information to the global dictionaries
            anc_CG_concat_full, tuples_mapping_dict_full = update_anc_CG(anc_CG_concat_full, anc_CG_concat, tuples_mapping_dict_full, tuples_mapping_dict)
            
        phy_files = write_anc_CG(anc_CG_concat_full, anc_CG_file_name, clean_names, macaque_CG_dict)
        pp_file = anc_CG_file_name

    else:
        print("Will read in existing CpG data...")
        pp_file = None
        phy_files = "None"
        high_CG = None
        tuples_mapping_dict_full = None
        macaque_CG_file_name = "{0}_macaque.txt".format(anc_CG_file_name[:-4])
        macaque_CG_dict = rw.read_many_fields(macaque_CG_file_name, "\t")
        macaque_CG_dict = [i for i in macaque_CG_dict if len(i) == 2]
        macaque_CG_dict = list_to_dict(macaque_CG_dict, 0, 1)
        macaque_CG_dict = {i: [int(j) for j in macaque_CG_dict[i].split(",") if j != ""] for i in macaque_CG_dict}
    anc_CG_dict = get_ancestral_CG(pp_file, subst_model, phy_files, "DFE/UCSC_model.mod", tuples_mapping_dict_full, anc_CG_file_name, high_CG = high_CG, macaque = macaque_anc, comprehensive = comprehensive)
    #phy_files is the string "None" when existing CpG data was read in, so only remove temporary files when new ones were written
    if phy_files != "None":
        [remove_file(i) for i in phy_files]
    #if you're looking at exon cores/flanks rather than full CDSs
    if regions:
        #you need to have matching bed/fasta files for this to work (with the records in the same order)
        bed = fasta.replace("fasta", "bed")
        transcripts = fs.get_transcripts()
        #for each flank/core, figure out what positions it covers in the full CDS
        mapping_dict = conservation.map_regions_to_CDS(fasta, bed, fs, transcripts, CDSs, trans_ids = True)
        anc_CG_dict = region_CpG(mapping_dict, anc_CG_dict)
    if return_tuples:
        return(anc_CG_dict, macaque_CG_dict, tuples_mapping_dict_full)
    else:
        return(anc_CG_dict, macaque_CG_dict)
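
#illustrative call (all arguments are placeholders for objects prepared earlier in the pipeline):
#anc_CG_dict, macaque_CG_dict = get_CpG_dicts(CDSs, chroms, "MSA/primates_EPO", lengths, clean_names,
#                                             phylip_data, "general/filtered_CDS.fasta", "None", "None",
#                                             fs, macaque_anc = True)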