Code example #1
def main():

    description = "Record the distribution of peaks for different exons."
    args = hk.parse_arguments(description, ["peaks_file", "gtf", "exon_starts_file", "output_file", "reads_file", "from_end", "intronic", "limit", "nts_before_start", "noncoding", "reads_mode"], flags = [5, 6, 9, 10], ints = [7, 8])
    peaks_file, gtf, exon_starts_file, output_file, reads_file, from_end, intronic, limit, nts_before_start, noncoding, reads_mode = args.peaks_file, args.gtf, args.exon_starts_file, args.output_file, args.reads_file, args.from_end, args.intronic, args.limit, args.nts_before_start, args.noncoding, args.reads_mode

    if noncoding:
        exons = rw.read_gtf(gtf, "exon", gene=False)
    else:
        exons = rw.read_gtf(gtf, "CDS", gene=False)

    # the 3' ss that will be analyzed
    valid_junctions = rw.read_many_fields(exon_starts_file, "\t")
    # pull out the column with transcript IDs
    valid_junctions = [i[3] for i in valid_junctions]

    lengths_dict = co.get_lengths(exons, valid_junctions, intronic=intronic)
    if nts_before_start:
        lengths_dict = {i: lengths_dict[i] + nts_before_start for i in lengths_dict}

    coverage_file_name = "{0}_{1}_coverage.bed".format(exon_starts_file[:-4], reads_file.split("/")[-1][:-4])
    co.get_coverage(exon_starts_file, reads_file, coverage_file_name)

    peak_distances_all, peak_centres = co.peak_pos_in_exon(exon_starts_file, peaks_file, from_end = from_end, reads_mode = reads_mode)

    write_dist_mat(peak_distances_all, limit, output_file, lengths_dict, "{0}_intron_names.txt".format(output_file[:-4]), None)

    write_dist_mat(peak_centres, limit, "{0}_centres.txt".format(output_file[:-4]), lengths_dict,
                   "{0}_centres_intron_names.txt".format(output_file[:-4]), None)
Code example #2
def main():
    description = "Take a hits file and a control file and shuffle which elements are in which."
    args = parse_arguments(description, [
        "input_hits", "input_controls", "output_hits", "output_controls",
        "hit_reduce", "control_reduce"
    ],
                           floats=[4, 5])
    input_hits, input_controls, output_hits, output_controls, hit_reduce, control_reduce = args.input_hits, args.input_controls, args.output_hits, args.output_controls, args.hit_reduce, args.control_reduce

    hits = rw.read_pos(input_hits)
    controls = rw.read_pos(input_controls)

    #if you need to reduce the hit and control position dictionary sizes by a specified proportion
    if hit_reduce > 0:
        hits = reduce_dict(hits, hit_reduce)
        controls = reduce_dict(controls, control_reduce)
        rw.write_pos(hits, output_hits)
        rw.write_pos(controls, output_controls)
    else:
        with open(output_hits, "w") as hits_o, open(output_controls,
                                                    "w") as controls_o:
            for gene in hits:
                hit_length = len(hits[gene])
                combined = hits[gene] + controls[gene]
                current_hits_o = sorted(
                    np.random.choice(combined, size=hit_length, replace=False))
                current_controls_o = sorted(
                    [i for i in combined if i not in current_hits_o])
                hits_o.write("{0}\t{1}\n".format(
                    gene, ",".join([str(i) for i in current_hits_o])))
                controls_o.write("{0}\t{1}\n".format(
                    gene, ",".join([str(i) for i in current_controls_o])))
Code example #3
def main():
    description = "Pick out the multi-exon genes from a dataset and generate families."
    args = parse_arguments(description, ["features_file", "genome", "dataset", "fasta"])
    [features_file, genome, dataset, fasta] = [args.features_file, args.genome, args.dataset, args.fasta]

    #set up global feature set and get relevant sequence features from it
    fs = Feature_Set(features_file, genome)
    fs.set_dataset(dataset)
    exons = fs.get_exons()
    exon_numbers = fs.get_exon_numbers(exons)

    output_fasta_name = "{0}_multiexon.fasta".format(fasta[:-6])

    #get multi-exon genes
    multi_exon = [i for i in exon_numbers if exon_numbers[i] > 1]

    #create a new feature set for multi-exon genes only
    fs_new = Feature_Set(features_file, genome)
    fs_new.create_dataset("{0}_multiexon".format(dataset), input_list = multi_exon)
    fs_new.set_dataset("{0}_multiexon".format(dataset))

    #also write a fasta with the ORF sequences
    names, seqs = rw.read_fasta(fasta)
    seqs = [seqs[pos] for pos, i in enumerate(names) if i in multi_exon]
    names = [i for i in names if i in multi_exon]
    rw.write_to_fasta(names, seqs, output_fasta_name)

    #find paralogous families
    transcripts = fs_new.get_transcripts()
    gene_name_dict = fs_new.get_gene_name_dict(transcripts)

    conservation.find_families(output_fasta_name, "general/{0}_multiexon".format(dataset))
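`rw.read_fasta` and `rw.write_to_fasta` are used throughout these examples as simple readers/writers over parallel lists of names and sequences. A minimal sketch of that assumed behaviour:

def read_fasta(path):
    # return parallel lists of record names and sequences
    names, seqs = [], []
    current = []
    with open(path) as f:
        for line in f:
            line = line.rstrip()
            if line.startswith(">"):
                if current:
                    seqs.append("".join(current))
                    current = []
                names.append(line[1:])
            else:
                current.append(line)
        if current:
            seqs.append("".join(current))
    return names, seqs

def write_to_fasta(names, seqs, path):
    with open(path, "w") as f:
        for name, seq in zip(names, seqs):
            f.write(">{0}\n{1}\n".format(name, seq))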
Code example #4
def main():
    description = "Run mDFEest with shuffled input to check the false positive rate."
    args = parse_arguments(description, [
        "hits_file", "controls_file", "output_file", "n_sim", "SNP_file",
        "SNP_number", "hit_reduce", "control_reduce", "const_pop"
    ],
                           ints=[3, 5],
                           floats=[6, 7],
                           flags=[8])
    hits_file, controls_file, output_file, n_sim, SNP_file, SNP_number, hit_reduce, control_reduce, const_pop = args.hits_file, args.controls_file, args.output_file, args.n_sim, args.SNP_file, args.SNP_number, args.hit_reduce, args.control_reduce, args.const_pop

    with open(output_file, "w") as file:
        for sim in range(n_sim):
            print(sim)

            temp_hits_file = "temp_data/hits_file{0}.txt".format(
                random.random())
            temp_controls_file = "temp_data/controls_file{0}.txt".format(
                random.random())
            temp_input_file = "temp_data/input_file{0}.txt".format(
                random.random())

            #shuffle hits and controls for negative control
            run_process([
                "python3", "shuffle_hits_and_controls.py", hits_file,
                controls_file, temp_hits_file, temp_controls_file, hit_reduce,
                control_reduce
            ])

            #generate multiDFEest input file
            run_process([
                "python3", "mDFEest_input.py", temp_hits_file,
                temp_controls_file, SNP_file, SNP_number, temp_input_file
            ])

            output = mDFEest("beta", temp_input_file, pop_change=True)

            print(output)
            print(output["Nes_0.0_0.1"])
            print(output["Nes_0.1_1.0"])

            file.write("{0}\t{1}\t{2}".format(sim, output["Nes_0.0_0.1"],
                                              output["Nes_0.1_1.0"]))

            #if you also want to run with fixed population size
            if const_pop:
                output = mDFEest("beta", temp_input_file, pop_change=False)

                file.write("{0}\t{1}\t{2}".format(sim, output["Nes_0.0_0.1"],
                                                  output["Nes_0.1_1.0"]))

            file.write("\n")

            remove_file(temp_hits_file)
            remove_file(temp_controls_file)
            remove_file(temp_input_file)
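Note that `hit_reduce` and `control_reduce` are floats when they are passed to `run_process`, so the wrapper presumably converts its arguments to strings before invoking the command. A sketch of such a wrapper (an assumption, not the project's housekeeping code):

import subprocess

def run_process(arguments, file_for_output=None):
    # stringify all arguments and run the command; optionally redirect stdout to a file
    arguments = [str(i) for i in arguments]
    if file_for_output:
        with open(file_for_output, "w") as out:
            subprocess.run(arguments, stdout=out, check=True)
        return None
    result = subprocess.run(arguments, stdout=subprocess.PIPE, check=True)
    return result.stdout.decode()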
Code example #5
def main():

    description = "Filter an osc file to only contain the samples that you want and format it as a bed file so that you could lift over the coordinates."
    arguments = ["input_file_name", "output_file_name", "filter_samples"]
    args = parse_arguments(description, arguments, flags = [2])
    input_file_name, output_file_name, filter_samples = [args.input_file_name, args.output_file_name, args.filter_samples]

    #these are all the pooled samples except for the brain subregions, which I removed because otherwise
    #roughly 11/40 samples would have been brain tissues. I left in the retina though.
    ones_I_want = ['of adipose tissue, adult, pool1', 'of adrenal gland, adult, pool1', 'of aorta, adult, pool1', 'of bladder, adult, pool1', 'of blood, adult, pool1', 'of brain, adult, pool1', 'of cervix, adult, pool1', 'of colon, adult, pool1', 'of esophagus, adult, pool1', 'of heart, adult, pool1', 'of kidney, adult, pool1', 'of liver, adult, pool1', 'of lung, adult, pool1', 'of ovary, adult, pool1', 'of placenta, adult, pool1', 'of prostate, adult, pool1', 'of retina, adult, pool1', 'of salivary gland, adult, pool1', 'of skeletal muscle, adult, pool1', 'of small intestine, adult, pool1', 'of smooth muscle, adult, pool1', 'of spleen, adult, pool1', 'of testis, adult, pool1', 'of thymus, adult, pool1', 'of thyroid, adult, pool1', 'of tonsil, adult, pool1', 'of trachea, adult, pool1', 'of uterus, adult, pool1']

    IDs = []
    indices = []
    full_IDs = []

    counter = 0
    
    with open(input_file_name) as file, open(output_file_name, "w") as output_file:
        for line in file:
            counter = counter + 1
            if counter % 1000 == 0:
                print(counter)
            if line[0] == "#":
                if filter_samples:
                    if "adult, pool1" in line:
                        for search in ones_I_want:
                            if search in line:
                                ID = re.findall(r"CNhs[\d\.\-\w]*", line)[0]
                                IDs.append(ID)
            elif line[:6] == "00Anno":
                if filter_samples:
                    line = line.split("\t")
                    for pos, elem in enumerate(line):
                        for ID in IDs:
                            if ID in elem:
                                indices.append(pos)
                                full_IDs.append(elem)
            elif line[:3] == "chr":
                #I'm going to pretend that the actual data bit is just
                #the name of the bed record so it would survive the CrossMapping
                line = line.split("\t")
                coords = line[0]
                line[-1] = line[-1].rstrip("\n")
                if filter_samples:
                    line = [line[i] for i in indices]
                else:
                    line = line[1:]
                coords = coords.split("..")
                chrom = coords[0].split(":")[0]
                start = coords[0].split(":")[1]
                end = coords[1].split(",")[0]
                strand = coords[1].split(",")[1]
                name = "|".join(line)
                output_line = [chrom, start, end, name, ".", strand]
                output_file.write("\t".join(output_line))
                output_file.write("\n")
Code example #6
File: dinucl_cons.py    Project: rosinaSav/RBP_motifs
def main():
    description = "Calculate the conservation of a set of motifs separately for each dinucleotide."
    args = parse_arguments(description, ["features_file_name", "dataset_name", "genome", "RBP_file_name", "correspondances_file_name", "fasta_file_name", "families_file_name", "output_file_name", "alignment_folder_name", "flanks"], flags = [9])
    [features_file_name, dataset_name, genome, RBP_file_name, correspondances_file_name, fasta_file_name, families_file_name, output_file_name, alignment_folder_name, flanks] = [args.features_file_name, args.dataset_name, args.genome, args.RBP_file_name, args.correspondances_file_name, args.fasta_file_name, args.families_file_name, args.output_file_name, args.alignment_folder_name, args.flanks]

    #prepare an object for storing the genome annotations associated to the sequences in the sequence file
    fs = Feature_Set(features_file_name, genome)
    fs.set_dataset(dataset_name)
    #make a dictionary with RBPs as keys and lists of associated motifs as values
    motif_dict = rw.read_motifs(RBP_file_name)
    transcripts = fs.get_transcripts()
    gene_name_dict = fs.get_gene_name_dict(transcripts)

    #if working with full CDSs
    if not flanks:
        #pick a random member from each paralogous family
        families = rw.read_families(families_file_name)
        families = fs.convert_families_to_ENST(families, transcripts)
        fs.add_families(families)
        picked_trans = fs.pick_random_members()
        picked = []
        for i in picked_trans:
            for j in gene_name_dict:
                if gene_name_dict[j][0][4] == i:
                    picked.append(j)
        print(len(picked))
        map_from_regions = None
    #if working with exon subregions
    #my exon subregions file already has regions from only one transcript per paralogous family
    else:
        picked = None
        CDS = fs.get_CDS()
        bed_file_name = "{}.bed".format(fasta_file_name)
        fasta_file_name = "{0}.fasta".format(fasta_file_name)
        map_from_regions = conservation.map_regions_to_CDS(fasta_file_name, bed_file_name, fs, gene_name_dict, CDS)       

    #generate all possible DNA dinucleotides
    dinucl = nc.generate_all_kmers(2)

    motifs = flatten(list(motif_dict.values()))

    with open(output_file_name, "w") as file:
        file.write("dinucleotide\tmotif rate\tmotif frequency\tnonmotif rate\tnonmotif frequency\n")
        #calculate the rate of evolution within vs outside of motifs separately for each dinucleotide
        freqs_dict = conservation.cons_by_dinucl(fasta_file_name, motifs, correspondances_file_name, alignment_folder_name, dinucl, picked = picked, map_from_regions = map_from_regions)
        for dint in sorted(list(freqs_dict.keys())):
            if (freqs_dict[dint]["subst. in motifs"] != None) and (freqs_dict[dint]["subst. in non-motifs"] != None):
                to_write = [dint, freqs_dict[dint]["subst. in motifs"], freqs_dict[dint]["frequency in motifs"], freqs_dict[dint]["subst. in non-motifs"], freqs_dict[dint]["frequency in non-motifs"]]
                to_write = "\t".join([str(i) for i in to_write])
                file.write(to_write)
                file.write("\n")
        #get an overall estimate by taking a weighted average (weighted by dinucleotide frequency) of the estimates for all the different dinucleotides
        output_dict = conservation.weight_cons_by_dinucl(freqs_dict, dinucl)
        print(output_dict)
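`nc.generate_all_kmers(2)` produces every DNA dinucleotide; a one-liner with the same effect (an assumption about its behaviour) would be:

from itertools import product

def generate_all_kmers(k, bases="ACGT"):
    # all k-length strings over the DNA alphabet, e.g. 16 dinucleotides for k = 2
    return ["".join(p) for p in product(bases, repeat=k)]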
Code example #7
def main():
    description = "Given a BED file of reads, filter out reads whose " \
                  "3' end maps to the last nucleotide of an intron or" \
                  "the last nucleotide of an exon."
    args = hk.parse_arguments(description, ["reads_file", "gtf", "outfile"])
    reads_file, gtf, outfile = args.reads_file, args.gtf, args.outfile

    print("Getting intron lariat positions...")

    # read in exon coordinates
    exons = rw.read_gtf(gtf, element="exon", gene=False)
    # make a BED file with the last positions of introns
    intron_lariat_bed = "{0}_intron_lariat_pos_all_exons.bed".format(reads_file[:-4])
    co.write_intron_lariat_pos_from_exons(exons, intron_lariat_bed, add_chr = True)

    # intersect the reads with intron lariat positions
    intron_lariat_intersect_file_name = "{0}_intersect_with_intron_lariat_pos_all_exons.bed".format(reads_file[:-4])
    co.intersect_bed(reads_file, intron_lariat_bed, force_strand=True, write_both=True, no_dups=False, output_file=intron_lariat_intersect_file_name)
    hk.remove_file(intron_lariat_bed)
    intron_lariat_reads_file = "{0}_intron_lariat_reads_all_exons.bed".format(reads_file[:-4])
    # check that the reads end exactly at intron lariat positions
    check_3prime_match(intron_lariat_intersect_file_name, intron_lariat_reads_file)
    hk.remove_file(intron_lariat_intersect_file_name)

    # write BED with the last positions of exons
    splice_intermediate_bed = "{0}_splice_intermediate_pos_all_exons.bed".format(reads_file[:-4])
    co.write_si_pos_from_exons(exons, splice_intermediate_bed, add_chr = True)

    print("Getting splice intermediate positions.")

    # intersect the reads with splice intermediate positions
    splice_intermediate_intersect_file_name = "{0}_intersect_with_SI_pos_all_exons.bed".format(reads_file[:-4])
    co.intersect_bed(reads_file, splice_intermediate_bed, force_strand=True, write_both=True, no_dups=False, output_file=splice_intermediate_intersect_file_name)
    hk.remove_file(splice_intermediate_bed)
    SI_reads_file = "{0}_SI_reads_all_exons.bed".format(reads_file[:-4])
    # check that the reads end exactly at the end of the exon
    check_3prime_match(splice_intermediate_intersect_file_name, SI_reads_file)
    hk.remove_file(splice_intermediate_intersect_file_name)

    print("Concatenating the two files.")

    # concatenate the IL and SI read files so you could exclude both in one go
    combined_file = "{0}_SI_and_IL_reads_all_exons.bed".format(reads_file[:-4])
    hk.run_process(["cat", SI_reads_file, intron_lariat_reads_file], file_for_output=combined_file)

    hk.remove_file(SI_reads_file)
    hk.remove_file(intron_lariat_reads_file)

    # do an exclusive intersect, requiring 1.0 overlap for both A and B, to remove the
    # putative intron lariat reads from the main reads file
    co.intersect_bed(reads_file, combined_file, overlap=1, overlap_rec=1, force_strand=True, no_dups=False, exclude=True, output_file=outfile)

    hk.remove_file(combined_file)
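`check_3prime_match` isn't shown; it presumably scans the intersect output (read fields followed by annotation fields) and keeps only reads whose 3' end falls exactly on the annotated single-nucleotide position. A hedged sketch, with the field layout as an assumption:

import csv

def check_3prime_match(intersect_file, out_file):
    # keep reads whose 3' end (interval end on +, interval start on -) coincides
    # with the single-nucleotide annotation they were intersected with
    with open(intersect_file) as f, open(out_file, "w") as out:
        reader = csv.reader(f, delimiter="\t")
        writer = csv.writer(out, delimiter="\t")
        for line in reader:
            r_start, r_end, strand = int(line[1]), int(line[2]), line[5]
            a_start, a_end = int(line[7]), int(line[8])  # assumes the annotation occupies fields 6-11
            if strand == "+":
                match = (r_end == a_end)
            else:
                match = (r_start == a_start)
            if match:
                writer.writerow(line[:6])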
Code example #8
def main():

    # Get arguments.
    description = "Check if nucleotide composition at the 5' ends of NET-seq reads is biased."
    args = hk.parse_arguments(
        description,
        ["input_file", "output_file", "genome_fasta", "gtf", "three_prime"],
        flags=[4])
    input_file, output_file, genome_fasta, gtf, three_prime = args.input_file, args.output_file, args.genome_fasta, args.gtf, args.three_prime

    # Convert to .bed, if not already .bed
    if input_file[-3:] != "bed":
        print("Converting input file to .bed...")
        input_file_new_name = "{0}bed".format(input_file[:-3])
        hk.convert2bed(input_file, input_file_new_name)
        input_file = input_file_new_name

    # Make an extended version of each read that extends 5 nt 5prime and 35 nt 3prime
    print("Extending the reads...")
    suffix = ""
    if three_prime:
        suffix = "_three_prime"
    temp_bed = "{0}_extended_for_bias{1}.bed".format(input_file[:-4], suffix)
    co.extend_intervals(input_file,
                        temp_bed,
                        5,
                        35,
                        remove_chr=True,
                        add_chr=False,
                        three_prime=three_prime)

    # Make a FASTA file from the BED file.
    print("Extracting sequences...")
    fasta_name = "{0}fasta".format(temp_bed[:-3])
    hk.run_process([
        "fastaFromBed", "-bed", temp_bed, "-fi", genome_fasta, "-fo",
        fasta_name, "-s"
    ])
    print("Number of lines in FASTA:")
    print(hk.run_process(["wc", "-l", fasta_name]))

    # Store the sequences at -5:+5 and 30:40 in a 2D array
    print("Storing sequences in arrays...")
    occ_mat_true, occ_mat_control = extract_true_and_control_string(
        fasta_name, (0, 10), (30, 40))

    # Make a PPM (position probability matrix) for each of the two matrices
    bases = ["A", "T", "C", "G"]
    print("Making PPMs...\n")
    print("TRUE:")
    PPM_wrapper(occ_mat_true, bases, "{0}.true".format(output_file))
    print("CONTROL:")
    PPM_wrapper(occ_mat_control, bases, "{0}.control".format(output_file))
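`PPM_wrapper` presumably converts a 2D array of bases into a position probability matrix (per-column base frequencies). A minimal illustration of that computation:

import numpy as np

def make_ppm(occ_mat, bases=("A", "T", "C", "G")):
    # occ_mat: 2D array of single characters, rows = reads, columns = positions
    occ_mat = np.asarray(occ_mat)
    ppm = np.zeros((len(bases), occ_mat.shape[1]))
    for row, base in enumerate(bases):
        ppm[row] = (occ_mat == base).sum(axis=0) / occ_mat.shape[0]
    return ppm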
Code example #9
def main():

    description = "Write the median motif lengths of a series of motif sets to file."
    args = parse_arguments(description, ["input_file", "output_file"])
    [input_file, output_file] = [args.input_file, args.output_file]
    
    #parse motifs from FASTA
    names, motifs = rw.read_fasta(input_file)
    motifs = [i.split("|") for i in motifs]
    motif_lengths = [[len(j) for j in i] for i in motifs]
    #write out and print the median motif length for each set
    with open(output_file, "w") as file:
        for pos, lengths_list in enumerate(motif_lengths):
            file.write("{0}\t{1}\n".format(names[pos], np.median(lengths_list)))
            print(np.median(lengths_list))
Code example #10
def main():
    description = "Make a file with intron lariat read counts per exon."
    args = hk.parse_arguments(description,
                              ["intron_lariat_file", "regions_file"])
    intron_lariat_file, regions_file = args.intron_lariat_file, args.regions_file

    # the intron_lariat_file contains only those reads whose
    # 3' ends map to the last position of an intron
    snr_name = "{0}_snr.bed".format(intron_lariat_file[:-4])
    co.snr_bed(intron_lariat_file, snr_name)

    co.intersect_bed(regions_file,
                     snr_name,
                     force_strand=True,
                     hit_count=True,
                     no_dups=False,
                     output_file="{0}_il_counts.bed".format(regions_file[:-4]))
Code example #11
def main():
    description = "Take a positions file with hits within exonic subregions and convert them to full CDS indices."
    args = parse_arguments(description, [
        "positions_file", "bed_file", "genome", "features_file", "dataset",
        "output_file"
    ])
    positions_file, bed_file, genome, features_file, dataset, output_file = args.positions_file, args.bed_file, args.genome, args.features_file, args.dataset, args.output_file

    #set up data
    fs = setup(features_file, genome, dataset)
    CDSs = fs.get_CDS()

    pos_dict = rw.read_pos(positions_file)

    #do actual work
    converted_pos = convert_region_to_CDS_func(pos_dict, bed_file, CDSs)

    #write output to file
    rw.write_pos(converted_pos, output_file)
Code example #12
def main():

    description = "Extract exon end/core regions from feature set."
    args = parse_arguments(
        description,
        ["genome", "features_file", "families_file", "dataset", "start_only"],
        flags=[4])
    genome, features_file, families_file, dataset, start_only = args.genome, args.features_file, args.families_file, args.dataset, args.start_only

    genome = "hg38"
    features_file = "general/Homo_sapiens.GRCh38.85.gtf"
    families_file = "general/filtered_hg38_85_pc_multiexon_families.txt"
    dataset = "filtered_hg38_85_pc_multiexon"

    #prepare feature set, get necessary genomic features
    fs = setup(features_file, genome, dataset, families_file=families_file)
    exons = fs.get_exons()
    CDS = fs.get_CDS()

    #pick a random member from each family
    picked = fs.pick_random_members()
    exons = {i: exons[i] for i in picked}
    CDS = {i: CDS[i] for i in picked}

    if start_only:
        #only the 5' end
        fs.get_exon_beginnings(exons,
                               CDS,
                               file_prefix="general/{0}".format(dataset),
                               write_to_fasta=True)
    else:
        #both flanks and the core
        fs.get_exon_cores_and_flanks(exons,
                                     CDS,
                                     file_prefix="general/{0}".format(dataset),
                                     write_to_fasta=True)
Code example #13
def main():

    description = "Construct a site frequency spectrum that only considers motif-disrupting SNPs."
    args = parse_arguments(description, ["fasta", "output_file", "motif_file", "anc_file", "control_file", "SNPs_file", "N", "old_motif_format", "human", "ancestral"], ints = [6], flags = [7, 8, 9])
    fasta, output_file, motif_file, anc_file, control_file, SNPs_file, N, old_motif_format, human, ancestral = args.fasta, args.output_file, args.motif_file, args.anc_file, args.control_file, args.SNPs_file, args.N, args.old_motif_format, args.human, args.ancestral

    names, seqs = rw.read_fasta(fasta)

    #I use two different formats for storing sequence motifs,
    #so we need to know which one it is
    if old_motif_format:
        motifs = rw.read_names(motif_file)[1:]
        print(len(motifs))
    else:
        motifs = rw.read_motifs(motif_file)
        motifs = sorted(list(set(flatten(list(motifs.values())))))

    #get the lengths of the motifs and compile lookahead regexes
    #that recognize the whole motif but only store the position of the first bases
    #these will be needed when searching for the motifs
    motif_lengths = [len(i) for i in motifs]
    motif_regex = nc.motif_to_regex(motifs)

    #I'm going to treat CG and GC as two 2-bp motifs and use the same code as when searching for, say,
    #ESE motifs
    CG_2mers = ["CG", "GC"]
    CG_lengths = [2, 2]
    CG_regex = nc.motif_to_regex(CG_2mers)

    motifs = [list(i) for i in motifs]

    if ancestral:
        anc_pos = rw.read_pos(anc_file)

    #read in hit and control positions
    controls = rw.read_pos(control_file)
    hit_file = re.sub("controls", "hits", control_file)
    hits = rw.read_pos(hit_file)

    #read in SNP data
    SNPs = rw.read_many_fields(SNPs_file, "\t")
    #the second column in the SNPs file contains positions that need to be discarded from analysis because they contain unanalyzable SNP data
    to_remove = list_to_dict(SNPs, 0, 2)
    to_remove = {i: to_remove[i].split(",") for i in to_remove}
    to_remove = {i: [int(j) for j in to_remove[i] if j not in ["error", ""]] for i in to_remove}
    SNPs = list_to_dict(SNPs, 0, 1)

    #all the SNPs associated to a transcript
    full_SNPs = {}
    #disruptive SNPs only
    clean_SNPs = {}
    minor_alleles = {}

    #the number of hit positions where, say, a T could theoretically substitute to an A (i.e. all T positions)
    transitions_total = {i: {j: 0 for j in nc._canon_bases_} for i in nc._canon_bases_}
    #the same as above but only counting those substitutions that would turn a motif into a non-motif
    transitions_disr = {i: {j: 0 for j in nc._canon_bases_} for i in nc._canon_bases_}

    #this block of code filters the true SNPs to only leave those that are disruptive
    #and also calculates the probability of being disruptive for all potential SNPs
    with open("{0}_degen.txt".format(hit_file), "w") as hit_degen_file:
        counter = 0
        for trans in names:
            counter = update_counter(counter, 1000)
            if trans in controls:
                if trans in SNPs:
                    trans_SNPs = SNPs[trans]
                else:
                    trans_SNPs = []
                trans_SNPs, clean_SNPs, full_SNPs, minor_alleles = parse_SNPs(trans_SNPs, clean_SNPs, full_SNPs, minor_alleles, trans)
                current_seq = seqs[names.index(trans)]
                fourfold_pos = nc.get_4fold_deg(current_seq)
                #CpG filtering
                if human:
                    CG_pos = nc.get_motif_set_density(CG_regex, CG_lengths, current_seq, concat = True)["positions"]
                    fourfold_pos = [i for i in fourfold_pos if i not in CG_pos]
                if ancestral:
                    fourfold_pos = [i for i in fourfold_pos if i not in anc_pos[trans]]
                all_sites, clean_SNPs, transitions_total, transitions_disr, hit_degen_file = check_disruption(motif_regex, current_seq, motifs, motif_lengths, fourfold_pos, full_SNPs, clean_SNPs, minor_alleles, trans, transitions_total, transitions_disr, hit_degen_file, to_remove)
                hit_degen_file.write("\n")

    to_remove = {i: [j for j in to_remove[i] if j not in full_SNPs[i]] for i in to_remove if i in controls}

    hit_SFS = get_SFS(hits, clean_SNPs, to_remove, N)

    transitions = get_transitions(transitions_disr, transitions_total)
    print(transitions)

    #this block randomly assigns certain SNPs at simulant positions to be disruptive,
    #with the probability of that happening proportional to the frequency with which potential substitutions
    #of that nucleotide composition would be disruptive for true (motif) sites
    with open("{0}_degen.txt".format(control_file), "w") as control_degen_file:
        control_SNPs = {}
        counter = 0
        for trans in controls:
            control_degen_file.write("{0}\t".format(trans))
            counter = update_counter(counter, 1000)
            control_SNPs[trans] = {}
            trans_SNPs = full_SNPs[trans]
            current_seq = seqs[names.index(trans)]
            for site in controls[trans]:
                if trans not in to_remove or site not in to_remove[trans]:
                    ref_allele = current_seq[site]
                    disrupt_bases = get_disrupt_bases(ref_allele, transitions)
                    control_degen_file.write("{0}:{1},".format(site, "|".join(disrupt_bases)))
                    if site in trans_SNPs:
                        minor_allele = minor_alleles[trans][site]
                        if minor_allele in disrupt_bases:
                            control_SNPs[trans][site] = trans_SNPs[site]
            control_degen_file.write("\n")

    control_SFS = get_SFS(controls, control_SNPs, to_remove, N)

    with open(output_file, "w") as file:
        file.write("{0}\n".format(N))
        file.write(" ".join([str(i) for i in hit_SFS]))
        file.write("\n")
        file.write(" ".join([str(i) for i in control_SFS]))
        file.write("\n")    
Code example #14
File: mnase_bias.py    Project: rosinaSav/dNETseq_code
def main():
    description = "Generate a NET-seq control set that would have the same distribution of -2:2 nucleotides" \
                  "as the true set."
    args = hk.parse_arguments(description, [
        "active_genes_file", "gtf", "PolII_file", "fasta", "outfile",
        "chrom_sizes"
    ])
    active_genes_file, gtf, PolII_file, fasta, outfile, chrom_sizes = args.active_genes_file, args.gtf, args.PolII_file, args.fasta, args.outfile, args.chrom_sizes

    chrom_sizes = rw.read_many_fields(chrom_sizes, delimiter="\t")
    chrom_sizes = hk.list_to_dict(chrom_sizes, 0, 1, intify=True)

    # get transcriptionally active genes and make a BED file with their coordinates
    print("Getting the coordinates of transcriptionally active genes...")
    trans_active_genes = rw.read_many_fields(active_genes_file, "\t")[1:]
    trans_active_genes = [i[3] for i in trans_active_genes]
    transcripts_file = "{0}_transcripts_all.bed".format(gtf[:-4])
    co.get_transcripts(gtf, transcripts_file, add_chr=True)

    transcripts_dict = {}
    # this will be used for getting the k-mers in the transcripts
    filtered_transcripts_file_plus2 = "{0}_trans_act_only_plus3.bed".format(
        transcripts_file[:-4])
    # this will be used for filtering the reads
    filtered_transcripts_file = "{0}_trans_act_only.bed".format(
        transcripts_file[:-4])
    with open(filtered_transcripts_file,
              "w") as ft_file, open(transcripts_file) as t_file, open(
                  filtered_transcripts_file_plus2, "w") as ft_file2:
        reader = csv.reader(t_file, delimiter="\t")
        writer = csv.writer(ft_file, delimiter="\t")
        writer2 = csv.writer(ft_file2, delimiter="\t")
        for line in reader:
            if line[3] in trans_active_genes:
                # if line[0][0] not in ["G", "K"]:
                #     line[0] = "chr{0}".format(line[0])
                writer.writerow(line)
                # this is because if a read falls at the first position, you will need to know the
                # preceding two bases. Same if it falls at the last position.
                line[1] = str((int(line[1])) - 3)
                line[2] = str((int(line[2])) + 3)
                writer2.writerow(line)
                transcripts_dict[line[3]] = line

    print("Filtering reads to the transcripts...")
    # filter reads to only ones that overlap these transcripts
    transcripts_PolII = "{0}_transcripts.bed".format(PolII_file[:-4])
    co.intersect_bed(PolII_file,
                     filtered_transcripts_file,
                     force_strand=True,
                     output_file=transcripts_PolII)

    print("Extracting FASTA from the transcript coordinates...")
    # the genome FASTA is formatted as N rather than chrN
    filtered_transcripts_file_no_chr = "{0}_trans_act_only_plus3_no_chr.bed".format(
        transcripts_file[:-4])
    hk.run_process(["sed", "s/^chr//", filtered_transcripts_file_plus2],
                   file_for_output=filtered_transcripts_file_no_chr)
    filtered_transcripts_fasta_no_chr = "{0}_trans_act_only_plus3.fasta".format(
        transcripts_file[:-4])
    hk.run_process([
        "bedtools", "getfasta", "-fi", fasta, "-bed",
        filtered_transcripts_file_no_chr, "-fo",
        filtered_transcripts_fasta_no_chr, "-s", "-name"
    ])

    print("Mapping kmers to transcript positions...")
    kmer_dict = map_kmers_to_positions(filtered_transcripts_fasta_no_chr,
                                       k=6,
                                       focal_pos=3)

    print("Extracting the starting dinucleotide for each read...")
    starting_dints_PolII = "{0}_transcripts_starting_6mers.bed".format(
        PolII_file[:-4])
    starting_dints_PolII_fasta = "{0}_transcripts_starting_6mers.fasta".format(
        PolII_file[:-4])
    co.extend_intervals(transcripts_PolII,
                        starting_dints_PolII,
                        3,
                        3,
                        remove_chr=True)
    hk.run_process([
        "bedtools", "getfasta", "-fi", fasta, "-bed", starting_dints_PolII,
        "-fo", starting_dints_PolII_fasta, "-s"
    ])

    print("Picking random control positions...")
    pick_random_positions(transcripts_PolII,
                          starting_dints_PolII_fasta,
                          outfile,
                          kmer_dict,
                          transcripts_dict,
                          chrom_sizes=chrom_sizes)

    print("Making single nucleotide resolution file...")
    snr_file = "{0}_snr.bed".format(outfile[:-4])
    co.snr_bed(outfile, snr_file)

    print(
        "Removing reads that overlap potential splice intermediate positions..."
    )
    no_si_snr_file = "{0}_snr_no_si.bed".format(outfile[:-4])
    co.intersect_bed(snr_file,
                     "data/Genomes/GTFs/dm6/dmel-all-r6.18_exon_ends_chr.gtf",
                     force_strand=True,
                     exclude=True,
                     no_dups=False,
                     output_file=no_si_snr_file)
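`hk.list_to_dict` (also used as plain `list_to_dict` in other examples) appears to map one column of a table onto another. A plausible sketch:

def list_to_dict(rows, key_index, value_index, intify=False, floatify=False):
    # map one column of a table to another, optionally converting the values
    out = {}
    for row in rows:
        value = row[value_index]
        if intify:
            value = int(value)
        elif floatify:
            value = float(value)
        out[row[key_index]] = value
    return out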
Code example #15
def main():
    description = "Prepare input file for running MultiDFEest."
    args = parse_arguments(description, [
        "hit_file", "control_file", "SNPs_file_prefix", "N", "output_file",
        "per_chrom_files", "shuffle"
    ],
                           ints=[3],
                           flags=[5, 6])
    hit_file, control_file, SNPs_file_prefix, N, output_file, per_chrom_files, shuffle = args.hit_file, args.control_file, args.SNPs_file_prefix, args.N, args.output_file, args.per_chrom_files, args.shuffle

    hits = parse_pos(hit_file)
    controls = parse_pos(control_file)

    if shuffle:
        hits, controls = shuffle_dictionaries(hits, controls)

    SNPs = {}
    to_remove_all = {}
    #if the data is stored chromosome by chromosome, rather than all combined
    if per_chrom_files:
        for chrom in range(1, 23):
            try:
                SNPs_file = "{0}{1}.bed".format(SNPs_file_prefix, str(chrom))
                current_SNPs = rw.read_many_fields(SNPs_file, "\t")
                to_remove = list_to_dict(current_SNPs, 0, 2)
                to_remove = {i: to_remove[i].split(",") for i in to_remove}
                current_SNPs = list_to_dict(current_SNPs, 0, 1)
                for trans in current_SNPs:
                    if trans in controls:
                        SNPs[trans] = {}
                        trans_SNPs = current_SNPs[trans]
                        if trans_SNPs:
                            trans_SNPs = [
                                i.split(",") for i in trans_SNPs.split("|")
                            ]
                            #this is where you get the allele count
                            trans_SNPs = list_to_dict(trans_SNPs, 0, 3)
                            trans_SNPs = {
                                int(i): int(trans_SNPs[i])
                                for i in trans_SNPs
                            }
                            SNPs[trans] = trans_SNPs
                        to_remove_all[trans] = [
                            int(i) for i in to_remove[trans]
                            if i not in ["error", ""]
                        ]
            except FileNotFoundError:
                pass
    else:
        SNPs_file = SNPs_file_prefix
        current_SNPs = rw.read_many_fields(SNPs_file, "\t")
        to_remove = list_to_dict(current_SNPs, 0, 2)
        to_remove = {i: to_remove[i].split(",") for i in to_remove}
        current_SNPs = list_to_dict(current_SNPs, 0, 1)
        counter = 0
        for trans in current_SNPs:
            if trans in controls:
                SNPs[trans] = {}
                trans_SNPs = current_SNPs[trans]
                if trans_SNPs:
                    trans_SNPs = [i.split(",") for i in trans_SNPs.split("|")]
                    #this is where you get the allele count
                    trans_SNPs = list_to_dict(trans_SNPs, 0, 3)
                    trans_SNPs = {
                        int(i): int(trans_SNPs[i])
                        for i in trans_SNPs
                    }
                    SNPs[trans] = trans_SNPs
                to_remove_all[trans] = [
                    int(i) for i in to_remove[trans] if i not in ["error", ""]
                ]

    hit_SFS = get_SFS(hits, SNPs, to_remove_all, N)
    control_SFS = get_SFS(controls, SNPs, to_remove_all, N)

    with open(output_file, "w") as file:
        file.write("{0}\n".format(N))
        file.write(" ".join([str(i) for i in hit_SFS]))
        file.write("\n")
        file.write(" ".join([str(i) for i in control_SFS]))
        file.write("\n")
Code example #16
def main():

    description = "Calculate the combined density of a set of motif sets."
    args = parse_arguments(description, ["motifs_file_name", "summary_file_name", "dataset_name", "correspondances_file_name", "alignment_folder_name", "output_folder_name", "output_file_name", "n_sim", "features_file_name", "genome", "families_file_name", "fasta_name", "ND_column", "output_suffix", "validity_folder_name", "negative_ND", "new_filters", "upper_quarter", "lower_quarter", "full_set", "gene_families", "newer_filters", "baseml"], ints = [7, 12], flags = [15, 16, 17, 18, 19, 20, 21, 22])
    [motifs_file_name, summary_file_name, dataset_name,  correspondances_file_name, alignment_folder_name, output_folder_name, output_file_name, n_sim, features_file_name, genome, families_file_name, fasta_name, ND_column, output_suffix, validity_folder_name, negative_ND, new_filters, upper_quarter, lower_quarter, full_set, gene_families, newer_filters, baseml] = [args.motifs_file_name, args.summary_file_name, args.dataset_name,  args.correspondances_file_name, args.alignment_folder_name, args.output_folder_name, args.output_file_name, args.n_sim, args.features_file_name, args.genome, args.families_file_name, args.fasta_name, args.ND_column, args.output_suffix, args.validity_folder_name, args.negative_ND, args.new_filters, args.upper_quarter, args.lower_quarter, args.full_set, args.gene_families, args.newer_filters, args.baseml]

    #make a dictionary with RBPs as keys and ND/p values as values.
    if summary_file_name != "None":
        summary_data = rw.read_many_fields(summary_file_name, "\t")
        #because some of the files are tab-separated, while others are comma-separated and have a header row
        if len(summary_data[0]) == 1:
            summary_data = rw.read_many_fields(summary_file_name, ",")
            summary_data = summary_data[1:]

        summary_dict = list_to_dict(summary_data, 0, ND_column, floatify = True)
            
    #make a dictionary with RBPs as keys and lists of associated motifs as values        
    motifs = rw.read_motifs(motifs_file_name)

    #if you only want to be using a subset of the motifs
    if not full_set:
        #which RBPs fulfill the necessary information content criteria?
        validity = rw.read_many_fields("{0}/sufficient_information_fraction05.csv".format(validity_folder_name), "\t")
        validity = list_to_dict(validity, 0, 1)
        #motifs with negative ND
        if negative_ND:
            motifs = [motifs[RBP] for RBP in motifs if (summary_dict[RBP] < 0) and (validity[RBP] == "True")]
        #the most significantly enriched motifs
        elif upper_quarter:
            motifs = [motifs[RBP] for RBP in motifs if (summary_dict[RBP] < 0.1) and (validity[RBP] == "True")]
        #the most significantly depleted motifs
        elif lower_quarter:
            motifs = [motifs[RBP] for RBP in motifs if (summary_dict[RBP] > 0.9) and (validity[RBP] == "True")]
        #motifs with positive ND
        else:
            motifs = [motifs[RBP] for RBP in motifs if (summary_dict[RBP] >= 0) and (validity[RBP] == "True")]

    #shove all the remaining motifs into a great big flattened and uniquified bag
    motifs = list(set(flatten(list(motifs.values()))))

    make_dir(output_folder_name)

    #prepare a Feature_Set object (a genome gtf associated to a particular genome and to a set of transcript identifiers)
    if features_file_name != "None":
        fs = Feature_Set(features_file_name, genome)
        fs.set_dataset(dataset_name)
        transcripts = fs.get_transcripts()
        CDS = fs.get_CDS()
        #paralogous families
        families = rw.read_families(families_file_name)
        #the families file might use gene identifiers, whereas the Feature_Set object uses transcript identifiers
        if gene_families:
            families = fs.convert_families_to_ENST(families, transcripts)
        fs.add_families(families)
        #pick a random member from each paralogous family
        picked_trans = fs.pick_random_members()
        names = rw.read_fasta(fasta_name)[0]
        if picked_trans[0] not in names:
            picked = [fs.convert_between_ENST_and_ENSG(i, transcripts, "ENSG") for i in picked_trans]
        else:
            picked = picked_trans
        print(len(picked))
    else:
        picked = None

    if baseml:
        method = "baseml"
    else:
        method = "gy"

    #write the input data for the conservation analysis into a file
    input_dict_file_name = "temp_data/temp_{0}.txt".format(random.random())
    conservation.input_dict_for_dS(correspondances_file_name, alignment_folder_name, fasta_name, input_dict_file_name, picked = picked)
    with open(output_file_name, "w") as file:
        file.write(",".join(["real_dS", "mean_sim_dS", "norm_dS", "p", "motif_number"]))
        file.write("\n")
        #make n_sim simulant sets for the motifs, filtering the simulants based on different sets of criteria
        if new_filters:
            simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, seed = 1)
        elif newer_filters:
            simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, seed = 1, no_duplicates = True, concat = False)               
        else:
            simulants = nc.make_simulants(motifs, n_sim, seed = 100)
        #file where the simulants dS values will be stored
        sim_output_file_name = "{0}/{1}_sim_ds.csv".format(output_folder_name, output_suffix)
        #calculate dS within motifs and simulants
        output_dict = conservation.dS_from_hits(motifs, alignment_folder_name, input_dict_file_name, n_sim = n_sim, simulants = simulants, sim_output_file_name = sim_output_file_name, method = method)
        print(output_dict)
        print("\n")
        #write to output file
        if output_dict != None:
            file.write(",".join([str(output_dict["dS"]), str(output_dict["mean simulated dS"]), str(output_dict["normalized dS"]), str(output_dict["effective p"]), str(len(motifs))]))
        else:
            file.write(",".join([str(None), str(None), str(None), str(None), str(None)]))
    os.remove(input_dict_file_name)
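The output combines an empirical ("effective") p-value and a normalized statistic computed against the simulant distribution. A sketch of how such statistics are commonly defined (the project's `ms` module may differ in detail):

import numpy as np

def calc_eff_p(real_value, sim_values, greater=True):
    # empirical p-value with the +1 correction for the observed value itself
    sim_values = np.array(sim_values)
    if greater:
        more_extreme = np.sum(sim_values >= real_value)
    else:
        more_extreme = np.sum(sim_values <= real_value)
    return (more_extreme + 1) / (len(sim_values) + 1)

def normalize(real_value, sim_values):
    # express the real value relative to the simulant mean
    sim_mean = np.mean(sim_values)
    return (real_value - sim_mean) / sim_mean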
Code example #17
def main():

    description = "Calculate the conservation of k-mers that are a single point mutation away from being part of a set of motifs."
    args = parse_arguments(description, ["motifs_file_name", "summary_file_name", "output_folder_name", "p_column", "alignment_folder_name", "correspondances_file_name", "output_file_name", "dataset_name", "features_file_name", "n_sim", "output_suffix", "sequences_file_name", "families_file_name", "genome", "by_RBP"], ints = [3, 9], flags = [14])
    [motifs_file_name, summary_file_name, output_folder_name, p_column, alignment_folder_name, correspondances_file_name, output_file_name,  dataset_name, features_file_name, n_sim, output_suffix, sequences_file_name, families_file_name, genome, by_RBP] = [args.motifs_file_name, args.summary_file_name, args.output_folder_name, args.p_column, args.alignment_folder_name, args.correspondances_file_name, args.output_file_name, args.dataset_name, args.features_file_name, args.n_sim, args.output_suffix, args.sequences_file_name, args.families_file_name, args.genome, args.by_RBP]

    RBPs = rw.read_motifs(motifs_file_name)

    #only leave those RBPs that pass the information content criteria
    validity = rw.read_many_fields("{0}/sufficient_information_fraction05.csv".format(output_folder_name), "\t")
    validity = list_to_dict(validity, 0, 1)
    RBPs = {i: RBPs[i] for i in RBPs if validity[i] == "True"}

    #if you're not doing this by RBP, pool motifs from the most significantly depleted sets
    if not by_RBP:
        summary_data = rw.read_many_fields(summary_file_name, "\t")
        if len(summary_data[0]) == 1:
            summary_data = rw.read_many_fields(summary_file_name, ",")    
        summary_dict = list_to_dict(summary_data, 0, p_column, floatify = True)            
        RBPs = {i: RBPs[i] for i in RBPs if summary_dict[i] > 0.9}
        motifs = list(set(flatten(list(RBPs.values()))))
        RBPs = {"all": motifs}

    #randomly pick one gene from each paralogous family
    fs = Feature_Set(features_file_name, genome)
    fs.set_dataset(dataset_name)
    transcripts = fs.get_transcripts()
    families = rw.read_families(families_file_name)
    families = fs.convert_families_to_ENST(families, transcripts)
    fs.add_families(families)
    picked_from_families = fs.pick_random_members()
    gene_name_dict = fs.get_gene_name_dict(transcripts)
    picked = [fs.convert_between_ENST_and_ENSG(i, gene_name_dict, "ENSG") for i in picked_from_families]

    names, CDS = rw.read_fasta(sequences_file_name)

    #make a dictionary where the keys are genes from the focal species and the values are orthologs from another species
    correspondances = rw.read_many_fields(correspondances_file_name, ",")
    correspondance_dict = {}
    for i in correspondances:
        correspondance_dict[i[0]] = i[1]

    output_dict = {}

    #loop over the RBPs
    for protein in sorted(RBPs):

        #fetch the current motifs
        print(protein)
        motifs = RBPs[protein]
        print("There are {0} motifs.".format(len(motifs)))
        #generate all unique motifs that are a single base substitution away from one of the motifs but are not actually in the set
        neighbours = nc.get_neighbours(motifs)
        print("There are {0} neighbours.".format(len(neighbours)))            

        #make simulants for the motifs. don't allow simulants to be part of the set of neighbours.
        simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, exclude = neighbours, no_duplicates = True, concat = False)

        neighbour_lengths = [len(i) for i in neighbours]        
        neighbours = nc.motif_to_regex(neighbours)

        #determine the true frequency at which fourfold degenerate sites that are a single substitution away from a motif in human actually contain the base that
        #would give rise to the motif in the orthologous species
        site_number = 0
        mutation_score = 0
        motifs = [list(i) for i in motifs]
        true_result = run_in_parallel(picked, ["foo", correspondance_dict, alignment_folder_name, CDS, names, motifs, neighbours, neighbour_lengths], get_mutation_to_motif) 
        for i in true_result:
            current = i.get()
            site_number = site_number + current[0]
            mutation_score = mutation_score + current[1]
        if site_number > 0:
            real_fraction = mutation_score/site_number
        else:
            real_fraction = None
        print("Real fraction:")
        print(real_fraction)

        neighbours = ""      
        sim_site_numbers = np.zeros((n_sim))
        sim_mutation_scores = np.zeros((n_sim))

        #obtain this estimate also for each simulant set
        #I'm doing this in this awkward manner because I don't have enough RAM to hold all the simulated neighbours in memory at once
        for sim in range(n_sim):
            if sim%10 == 0:
                print(sim)
            current_simulants = simulants[sim]
            current_neighbours = nc.get_neighbours(current_simulants)
            current_neighbour_lengths = [len(i) for i in current_neighbours]        
            current_neighbours = nc.motif_to_regex(current_neighbours)
            current_simulants = [list(i) for i in current_simulants]
            current_result = run_in_parallel(picked, ["foo", correspondance_dict, alignment_folder_name, CDS, names, current_simulants, current_neighbours, current_neighbour_lengths], get_mutation_to_motif)
            for i in current_result:
                current = i.get()
                sim_site_numbers[sim] = sim_site_numbers[sim] + current[0]
                sim_mutation_scores[sim] = sim_mutation_scores[sim] + current[1]

        #normalize the real fraction, calculate p
        sim_fractions = np.divide(sim_mutation_scores, sim_site_numbers)
        sim_fractions = [i for i in sim_fractions if i != np.inf]
        p = ms.calc_eff_p(real_fraction, sim_fractions, greater = False)
        norm_fraction = ms.normalize(real_fraction, sim_fractions) 

        output_dict[protein] = [protein, mutation_score, site_number, real_fraction, np.mean(sim_fractions), p, norm_fraction]
        print(output_dict[protein])
        
    with open(output_file_name, "w") as output_file:
        #write header to output file
        output_file.write("protein\tmutation score\tsite number\treal fraction\tmean sim fraction\tp\tnormalized fraction\n")
        #write the rest of the output data
        for protein in sorted(list(output_dict.keys())):
            to_write = output_dict[protein]
            to_write = [str(i) for i in to_write]
            output_file.write("\t".join(to_write))
            output_file.write("\n")
Code example #18
def main():

    description = "Calculate the combined density of a set of motif sets."
    args = parse_arguments(description, ["motifs_file_name", "summary_file_name", "dataset_name", "output_folder_name", "output_file_name", "n_sim", "features_file_name", "genome", "families_file_name", "fasta_name", "ND_column", "seed", "output_suffix", "negative_ND", "new_filters", "upper_quarter", "lower_quarter", "full_set", "newer_filters", "two_seqs"], ints = [5, 10, 11], flags = [13, 14, 15, 16, 17, 18, 19])
    [motifs_file_name, summary_file_name, dataset_name, output_folder_name, output_file_name, n_sim, features_file_name, genome, families_file_name, fasta_name, ND_column, seed, output_suffix, negative_ND, new_filters, upper_quarter, lower_quarter, full_set, newer_filters, two_seqs] = [args.motifs_file_name, args.summary_file_name, args.dataset_name, args.output_folder_name, args.output_file_name, args.n_sim, args.features_file_name, args.genome, args.families_file_name, args.fasta_name, args.ND_column, args.seed, args.output_suffix, args.negative_ND, args.new_filters, args.upper_quarter, args.lower_quarter, args.full_set, args.newer_filters, args.two_seqs]

    #make a dictionary with RBPs as keys and ND/p values as values.
    if summary_file_name != "None":
        summary_data = rw.read_many_fields(summary_file_name, "\t")
        #because some of the files are tab-separated, while others are comma-separated and have a header row
        if len(summary_data[0]) == 1:
            summary_data = rw.read_many_fields(summary_file_name, ",")
            summary_data = summary_data[1:]

        summary_dict = list_to_dict(summary_data, 0, ND_column, floatify = True)

    #make a dictionary with RBPs as keys and lists of associated motifs as values        
    motifs = rw.read_motifs(motifs_file_name)

    #if you only want to be using a subset of the motifs
    if not full_set:
        #motifs with negative ND
        if negative_ND:
            motifs = [motifs[RBP] for RBP in motifs if summary_dict[RBP] < 0]
        #the most significantly enriched motifs
        elif upper_quarter:
            motifs = [motifs[RBP] for RBP in motifs if summary_dict[RBP] < 0.1]
        #the most significantly depleted motifs
        elif lower_quarter:
            motifs = [motifs[RBP] for RBP in motifs if summary_dict[RBP] > 0.9]
        #motifs with positive ND
        else:
            motifs = [motifs[RBP] for RBP in motifs if summary_dict[RBP] >= 0]

    #shove all the remaining motifs into a great big flattened and uniquified bag
    motifs = list(set(flatten(list(motifs.values()))))

    print(len(motifs))
    make_dir(output_folder_name)

    #if you want to average over families
    if features_file_name != "None":
        fs = Feature_Set(features_file_name, genome)
        fs.set_dataset(dataset_name)
        families = rw.read_families(families_file_name)
        fs.add_families(families)
    else:
        fs = None

    #generate 100 1000 bp long random sequences based on the hg38 mononucleotide composition and use that as your sequence fasta
    if fasta_name == "random":
        names = [i for i in range(100)]
        seqs = nc.kmers_from_nc(1000, 100, genome_comp = True)
        fasta_name = "RBP/random_sequences_from_genome_comp.fasta"
        rw.write_to_fasta(names, seqs, fasta_name)

    with open(output_file_name, "w") as output_file:
        #generate n_sim sets of simulant motifs (constraining the space of simulants based on different sets of filters)
        if new_filters:
            simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, seed = seed)
        elif newer_filters:
            simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, seed = seed, concat = False, no_duplicates = True)
        else:
            simulants = nc.make_simulants(motifs, n_sim, seed = seed)
        #calculate the density parameters of the motifs in the sequence fasta
        output_dict = nc.get_sequence_set_density(fasta_name, None, motifs, simulants, n_sim,
                                                   "{0}/overall_density_{1}.csv".format(output_folder_name, output_suffix),
                                                   "{0}/overall_sim_density_{1}.csv".format(output_folder_name, output_suffix),
                                                   "{0}/overall_positions.csv_{1}".format(output_folder_name, output_suffix),
                                                   "{0}/overall_sim_positions_{1}".format(output_folder_name, output_suffix),
                                                   concat = False, positions = False, feature_set = fs, verbose = True, two_seqs = two_seqs)
        record = [str(output_dict["median density"]), str(np.mean(output_dict["simulated densities"])), str(output_dict["median ND"]), str(output_dict["effective p"]), str(output_dict["Z"]), str(output_dict["depletion p"]), str(len(motifs)), str(output_dict["simulant sd"])]
        #write to output file
        output_file.write("\t".join(record))
        print(record)
Code example #19
File: peak_caller.py    Project: rosinaSav/dNETseq_code
def main():

    description = "Call peaks in a BED file of NET-seq reads."
    help_info = [
        "BED file (at least a BED6) with NET-seq reads. Should be single-nucleotide resolution (each BED region is the 3' end of a read.).",
        "Ensembl GTF file for the relevant species. Ensure that chromosome names are formatted the same way in both the GTF and the BED file with reads!",
        "BED file with the coordinates of the transcripts to analyze. Only the name field is read, hence the others can hold placeholders. The name field must contain transcript IDs from the GTF file.",
        "Name of the output file (BED file with peak coordinates).",
        "Alpha value for calling a position as having a significantly higher local read denisty than expected by chance. Default: 0.01.",
        "Merge distance: adjacent peaks will be merged if they are closer than this many nucleotides. Default: 21.",
        "Minimum reads per peak. Default: 10.",
        "The number of times the read position randomization should be performed for each transcript. Higher values make the significance calculation (marginally) more robust, however, they also make the programme very slow. Default: 5.",
        "Minimum length of a peak in nucleotides. Default: 5.",
        "Size of the sliding window to use when calculating the local read density. It may be sensible to set this to the same value as the merge distance. Should be an odd integer. Default: 21",
        "The analysis will be performed this many times, with the output files numbered. Useful for running many negative control simulations at once. Default: 1.",
        "Read positions will be shuffled within each transcript before analysis. This should disrupt any signal and should give a flat peak density profile.",
        "Instead of a sliding window, adjacent non-overlapping windows will be used when calculating the local read density.",
        "When calling peaks in a given exon/intron, do not include that exon/intron in the read position randomization.",
        "When --exclude_focal is set, count an exon and its upstream intron as a single unit (except for the first exon).",
        "Don't filter out likely PCR duplicates (peaks where more than 90%% of the reads come from a single nucleotide position).)"
    ]
    defaults = {4: 0.01, 5: 21, 6: 10, 7: 5, 8: 5, 9: 21, 10: 1}
    args = hk.parse_arguments(description, [
        "reads_file", "gtf", "trans_active_file", "output_file",
        "significance_threshold", "merge", "min_reads_per_peak", "iterations",
        "min_peak_length", "window_size", "runs", "neg_control", "no_slide",
        "exclude_focal", "with_ups_intron", "no_PCR_filter"
    ],
                              floats=[4],
                              ints=[5, 6, 7, 8, 9, 10],
                              flags=[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
                              detailed_help=help_info,
                              defaults=defaults)
    reads_file, gtf, trans_active_file, output_file, significance_threshold, merge, min_reads_per_peak, iterations, min_peak_length, window_size, runs, neg_control, no_slide, exclude_focal, with_ups_intron, no_PCR_filter = args.reads_file, args.gtf, args.trans_active_file, args.output_file, args.significance_threshold, args.merge, args.min_reads_per_peak, args.iterations, args.min_peak_length, args.window_size, args.runs, args.neg_control, args.no_slide, args.exclude_focal, args.with_ups_intron, args.no_PCR_filter

    print("Merge distance: {0}".format(merge))
    print("Minimum number of reads per peak: {0}".format(min_reads_per_peak))
    print("Minimum peak length: {0}".format(min_peak_length))
    print("Window size: {0}".format(window_size))
    print("Significance level: {0}".format(significance_threshold))
    print("Randomization iterations to perform: {0}".format(iterations))
    print("Runs: {0}".format(runs))

    neg_str = ""
    if neg_control:
        neg_str = "_neg_control"

    slide_str = ""
    if no_slide:
        slide_str = "_no_slide"
    intron_str = ""
    if with_ups_intron:
        intron_str = "w_ups_intr"

    # 0. make a BED file with the coordinates of transcripts

    transcripts_file = "{0}_transcripts.bed".format(gtf[:-4])
    co.get_transcripts(gtf, transcripts_file, add_chr=True)
    exons = rw.read_gtf(gtf, "exon")

    # 1. intersect the two files, loop over the result and make a
    # dictionary of reads per position for each transcript that has reads

    reads_per_pos = get_reads_per_pos(reads_file, transcripts_file)
    # only leave transcriptionally active genes (one isoform per gene)
    trans_active_genes = rw.read_many_fields(trans_active_file, "\t")[1:]
    # pull out the column with transcript IDs
    trans_active_genes = [i[3] for i in trans_active_genes]
    reads_per_pos = {
        i: reads_per_pos[i]
        for i in reads_per_pos if i.split(".")[-1] in trans_active_genes
    }

    for sim in range(runs):

        print("**********{0}**********".format(sim))

        # 2. for each transcript, randomly reshuffle the reads and calculate the
        # nth percentile depending on what the significance threshold is
        # keep positions that are higher than that threshold and write to BED file

        raw_peak_bed = "{0}_{1}_raw_peaks{2}_{3}_{4}{5}{6}{7}_{8}_sim.bed".format(
            reads_file[:-4],
            gtf.split("/")[-1][:-4], iterations, min_reads_per_peak,
            window_size, neg_str, intron_str, slide_str, sim)
        read_count_file = "{0}_{1}_read_counts{2}_{3}{4}{5}_{6}_sim.txt".format(
            reads_file[:-4],
            gtf.split("/")[-1][:-4], iterations, window_size, neg_str,
            intron_str, sim)
        new_reads_file = write_raw_peaks(reads_per_pos,
                                         raw_peak_bed,
                                         read_count_file,
                                         exons,
                                         iterations=iterations,
                                         min_read_count=min_reads_per_peak,
                                         window_size=window_size,
                                         neg_control=neg_control,
                                         no_slide=no_slide,
                                         exclude_focal=exclude_focal,
                                         with_ups_intron=with_ups_intron)
        if neg_control:
            reads_file = new_reads_file

        # 3. merge peaks

        merged_peak_bed = "{0}_{1}_merged_peaks{2}_{3}_{4}{5}{6}{7}_{8}_sim.bed".format(
            reads_file[:-4],
            gtf.split("/")[-1][:-4], iterations, window_size, merge, neg_str,
            slide_str, intron_str, sim)
        co.merge_bed(raw_peak_bed, merged_peak_bed, merge)
        print("Before filtering, there are {0} peaks.".format(
            hk.line_count(merged_peak_bed)))

        # 4. filter out peaks that don't have enough reads or are too short.
        # Write final results to file and also write a stats file with the size,
        # read count and overlapping transcript of the peaks

        stats_file = "{0}_stats_{1}_sim.txt".format(output_file[:-4], sim)
        filter_peaks(merged_peak_bed,
                     reads_file,
                     read_count_file,
                     "{0}_{1}_sim.bed".format(output_file[:-4], sim),
                     min_reads_per_peak,
                     min_peak_length,
                     stats_file,
                     no_PCR_filter=no_PCR_filter)
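The core of the peak calling (steps 2-4 in the comments above) happens inside write_raw_peaks, merge_bed and filter_peaks, which are defined elsewhere. The fragment below is a minimal, self-contained sketch of the central idea only: compare the observed sliding-window read density at each position to densities obtained after shuffling the per-position read counts within the transcript, and keep positions above the (1 - alpha) percentile of the shuffled densities. The function name and all details are illustrative assumptions, not the project's actual code.

import numpy as np

def call_candidate_positions(reads_per_pos, transcript_length, window_size=21,
                             alpha=0.01, iterations=5, seed=None):
    # reads_per_pos: dict mapping 0-based transcript positions to read counts
    rng = np.random.default_rng(seed)
    profile = np.zeros(transcript_length)
    for pos, count in reads_per_pos.items():
        profile[pos] = count
    kernel = np.ones(window_size)
    # observed local density: sliding-window sum of read counts
    observed = np.convolve(profile, kernel, mode="same")
    # null distribution: shuffle the per-position counts within the transcript
    null_densities = []
    for _ in range(iterations):
        shuffled = rng.permutation(profile)
        null_densities.append(np.convolve(shuffled, kernel, mode="same"))
    threshold = np.percentile(np.concatenate(null_densities), 100 * (1 - alpha))
    return [pos for pos in range(transcript_length) if observed[pos] > threshold]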
Code example #20
File: readthrough.py Project: rosinaSav/dNETseq_code
def main():
    description = "Prepare a BED file with the TES coordinates of transcriptionally" \
                  "active genes and make a metagene of reads within this region."

    args = hk.parse_arguments(description, ["trans_act_file", "gtf", "start_coord", "end_coord", "outname", "reads_file"], ints = [2, 3])
    trans_act_file, gtf, start_coord, end_coord, outname, reads_file = args.trans_act_file, args.gtf, args.start_coord, args.end_coord, args.outname, args.reads_file

    trans_act_genes = []
    with open(trans_act_file) as f:
        reader = csv.reader(f, delimiter = "\t")
        for line in reader:
            trans_act_genes.append(line[3])

    exons = rw.read_gtf(gtf, "exon")
    CDSs = rw.read_gtf(gtf, "CDS")

    exons = {i: exons[i] for i in exons if i in trans_act_genes}
    # protein-coding only
    exons = {i: exons[i] for i in exons if i in CDSs}

    ds_500 = "{0}_ds_500.bed".format(outname[:-4])
    with open(outname, "w") as out, open(ds_500, "w") as out_ds:
        writer = csv.writer(out, delimiter="\t")
        writer_ds = csv.writer(out_ds, delimiter="\t")
        for trans in exons:
            strand = exons[trans][0][6]
            chrom = "chr{0}".format(exons[trans][0][0])
            if strand == "+":
                TES = exons[trans][-1][4]
                new_start = TES - start_coord
                new_end = TES + end_coord
                new_start_ds = TES
                new_end_ds = TES + 500
            else:
                TES = exons[trans][-1][3]
                new_start = TES - end_coord - 1
                new_end = TES + start_coord - 1
                new_start_ds = TES - 500 - 1
                new_end_ds = TES - 1
            writer.writerow([chrom, new_start, new_end, trans, "0", strand])
            chrom = chrom.lstrip("chr")
            writer_ds.writerow([chrom, new_start_ds, new_end_ds, trans, "0", strand])

    intersect = "{0}_ds500_intersect.bed".format(outname[:-4])
    transcripts_file = "{0}_transcripts.bed".format(gtf[:-4])
    co.intersect_bed(ds_500, transcripts_file, write_both = True, force_strand=False, no_dups = False, output_file=intersect)

    co.get_transcripts(gtf, transcripts_file, with_detail=True)
    mapping = co.transcript_mapping(transcripts_file)

    to_exclude = []
    with open(intersect) as int_file:
        reader = csv.reader(int_file, delimiter = "\t")
        for line in reader:
            strand = line[5]
            curr_gene = mapping[line[3]]
            other_gene = mapping[line[9]]
            if curr_gene != other_gene:
                to_exclude.append(line[3])

    filtered_out_name = "{0}_filt.txt".format(outname[:-4])
    with open(filtered_out_name, "w") as filt_f:
        for name in to_exclude:
            filt_f.write("{0}\n".format(name))

    final_out_name = "{0}_distrib.bed".format(outname[:-4])

    distances = co.peak_pos_in_exon(outname, reads_file, from_end = True, reads_mode = True)[0]
    write_dist_mat(distances, start_coord + end_coord, final_out_name, None, "{0}_names.txt".format(final_out_name[:-4]), None)
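The strand-dependent arithmetic above is the fiddly part: GTF coordinates are 1-based and inclusive, BED intervals are 0-based and half-open, and "upstream" and "downstream" swap direction on the minus strand. The helper below is a hypothetical restatement of that logic for the window written to the main output file; it is not part of the project.

def tes_window(tes, strand, upstream, downstream):
    # return a 0-based, half-open (start, end) BED window around a 1-based GTF TES
    if strand == "+":
        # the TES is the GTF end field; upstream lies at lower coordinates
        return tes - upstream, tes + downstream
    # on the minus strand the TES is the GTF start field and the directions flip
    return tes - downstream - 1, tes + upstream - 1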
Code example #21
def main():
    description = "Directly compare the frequency of segregating sites/mean allele frequency between hits and controls."
    args = parse_arguments(description, [
        "hit_file", "control_file", "INSIGHT_hit_file", "INSIGHT_control_file",
        "SFS_file", "trial_file", "trials", "shuffle"
    ],
                           ints=[6],
                           flags=[7])
    hit_file, control_file, INSIGHT_hit_file, INSIGHT_control_file, SFS_file, trial_file, trials, shuffle = args.hit_file, args.control_file, args.INSIGHT_hit_file, args.INSIGHT_control_file, args.SFS_file, args.trial_file, args.trials, args.shuffle

    true_hits = rw.read_pos(hit_file)
    true_controls = rw.read_pos(control_file)

    #to store the original data in case this is a negative control and you will be shuffling
    #hits and controls
    original_INSIGHT_hit_file = INSIGHT_hit_file
    original_INSIGHT_control_file = INSIGHT_control_file

    print(hit_file)

    with open(trial_file, "w") as file:
        file.write(
            "trial\tpoly_fraction_hits - poly_fraction_controls\tmedian_hit_MAF - median_control_MAF\n"
        )
        for trial in range(trials):
            to_write = "{0}\t".format(trial)

            #if this is a negative control
            if shuffle:
                INSIGHT_hit_file = re.sub("_0_", "_{0}_".format(trial),
                                          original_INSIGHT_hit_file)
                INSIGHT_control_file = re.sub("_0_", "_{0}_".format(trial),
                                              original_INSIGHT_control_file)
                temp_hits_file = "temp_data/temp_hits{0}.txt".format(
                    random.random())
                temp_controls_file = "temp_data/temp_controls{0}.txt".format(
                    random.random())
                #shuffle hits and controls
                temp_hits, temp_controls = shuffle_dictionaries(
                    true_hits, true_controls)
                rw.write_pos(temp_hits, temp_hits_file)
                rw.write_pos(temp_controls, temp_controls_file)
                SFS_file = "temp_data/temp_SFS_file{0}.txt".format(
                    random.random())
                #generate an mDFEest input file (SFS) that you could then use for the manual analysis
                run_process([
                    "python3", "mDFEest_input.py", temp_hits_file,
                    temp_controls_file,
                    "general/1000genomes/filtered_hg38_85_pc_multiexon_Yoruban_SNPs_relative.txt",
                    216, SFS_file
                ])
                remove_file(temp_hits_file)
                remove_file(temp_controls_file)

            hit_data = get_data(INSIGHT_hit_file)
            control_data = get_data(INSIGHT_control_file)

            poly_ratio_diff = get_chisq_site_freq(hit_data, control_data)
            to_write = to_write + "{0}\t".format(poly_ratio_diff)

            temp, median_diff = get_mean_freq(SFS_file)
            to_write = to_write + "{0}\n".format(median_diff)

            if shuffle:
                remove_file(SFS_file)

            file.write(to_write)
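shuffle_dictionaries is called above but not defined in this excerpt. A plausible sketch of its behaviour is given below: for each gene, pool the hit and control positions and redraw "hits" at random without replacement, keeping the original number of hits per gene. Treat the details as assumptions.

import numpy as np

def shuffle_dictionaries_sketch(hits, controls, rng=None):
    # hits/controls: dicts mapping gene names to lists of positions
    if rng is None:
        rng = np.random.default_rng()
    new_hits, new_controls = {}, {}
    for gene in hits:
        pooled = list(hits[gene]) + list(controls.get(gene, []))
        n_hits = len(hits[gene])
        # pick which pooled positions will count as hits this time around
        chosen = set(rng.choice(len(pooled), size=n_hits, replace=False))
        new_hits[gene] = sorted(pooled[i] for i in chosen)
        new_controls[gene] = sorted(pooled[i] for i in range(len(pooled)) if i not in chosen)
    return new_hits, new_controls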
Code example #22
File: RBP_motifs.py Project: rosinaSav/RBP_motifs
def main():
    '''
    Read in a series of input files on the sequence specificities of RBPs,
    filter the data and write a set of motifs for each RBP.
    Arguments (see Methods for further details on the input data files):
    upper_threshold, lower_threshold: the longest and shortest a motif is allowed to be, respectively
    RBPDB_experiments: path to RBPDB experiments file
    RBPDB proteins: path to RBPDB proteins file
    RBPDB_PWMs: path to file containing RBPDB PWM identifier to RBP mapping
    pwm_dir: path to directory containing RBPDB PWMs
    RBPmap_PSSMs: path to directory containing RBPmap PSSMs
    SFmap_proteins: path to file containing motifs from SFmap
    RNAcompete_information: path to summary file from CIS-BP RNA
    RNAcompete_PWMs: path to directory containing CIS-BP RNA PWMs
    final_motifs_file_name: name for output file
    plot_name: file for plot displaying the distribution of motif set sizes
    species: the species for which motifs are required
    '''

    description = "Compile a set of motifs putatively recognized by RNA-binding proteins."
    args = parse_arguments(description, ["upper_threshold", "lower_threshold", "RBPDB_experiments", "RBPDB_proteins", "RBPDB_PWMs", "pwm_dir", "RBPmap_PSSMs", "SFmap_proteins", "RNAcompete_information", "RNAcompete_PWMs", "final_motifs_file_name", "plot_name", "species"], ints = [0, 1])
    [upper_threshold, lower_threshold, RBPDB_experiments, RBPDB_proteins, RBPDB_PWMs, pwm_dir, RBPmap_PSSMs, SFmap_proteins, RNAcompete_information, RNAcompete_PWMs, final_motifs_file_name, plot_name, species] = [args.upper_threshold, args.lower_threshold, args.RBPDB_experiments, args.RBPDB_proteins, args.RBPDB_PWMs, args.pwm_dir, args.RBPmap_PSSMs, args.SFmap_proteins, args.RNAcompete_information, args.RNAcompete_PWMs, args.final_motifs_file_name, args.plot_name, args.species]

    db_fields = rw.read_many_fields(RBPDB_experiments, ",")
    db_fields = db_fields[1:]
    print("There are {0} RBPDB experiments.".format(len(db_fields)))
    db_proteins = rw.read_many_fields(RBPDB_proteins, ",")
    #species is "H**o sapiens" or "Mus musculus"
    db_proteins = [i for i in db_proteins if i[6] == species]
    protein_names = sorted(list(set([i[4] for i in db_proteins])))
    db_fields = [i for i in db_fields if i[3] in protein_names]
    protein_number_before = (len(list(set([i[3] for i in db_fields]))))
    print("{0} were performed in {1}.\n".format(len(db_fields), species))
    db_fields = [i for i in db_fields if i[2] != ""]
    protein_number_after = (len(list(set([i[3] for i in db_fields]))))
    db_fields = [[i[3], "RBPDB", i[0], i[1], i[2]] for i in db_fields]
    print("After removing experiments with no reported motif, {0} proteins remain of the initial {1}.\n".format(protein_number_after, protein_number_before))

    bases = np.array(["A", "C", "G", "U"])
    db_pwm_list = rw.read_many_fields(RBPDB_PWMs, "\t")

    for i in db_pwm_list:
        if i[1] in protein_names:
            current_file_name = "{0}/{1}.pwm".format(pwm_dir, i[0])
            current_PWM = rw.read_many_fields(current_file_name, delimiter = " ")
            for j in range(len(current_PWM)):
                current_PWM[j] = [float(k) for k in current_PWM[j] if k != ""]
            consensus = nc.consensus_from_PWM(current_PWM, bases, 0)
            PMID = i[0].split("_")
            PMID = PMID[1]
            new_record = [i[1], "RBPDB_PWM", PMID, "SELEX", consensus]
            db_fields.append(new_record)

    protein_number_after = (len(list(set([i[0] for i in db_fields]))))
    print("After adding additional sequences from SELEX PWMs (RBPDB), there are {0} proteins.\n".format(protein_number_after))

    if species == "Mus musculus":
        RBPmap_proteins = rw.read_many_fields("RBP/RBPmap_proteins.csv", ",")
        RBPmap_proteins = list_to_dict(RBPmap_proteins, 0, 1)
        RNAc_source = [i for i in RBPmap_proteins if "23846655" in RBPmap_proteins[i]]
    else:
        RNAc_source = []

    for file_name in os.listdir(RBPmap_PSSMs):
        #RBPmap and SFmap don't distinguish between human and mouse motifs
        if "human" in file_name:
            file_name_split = file_name.split("_")
            protein_name = file_name_split[0]
            if protein_name not in RNAc_source:
                initial_pssm = rw.read_many_fields(os.path.join(RBPmap_PSSMs, file_name), delimiter = "\t")
                current_pssm = initial_pssm[1:]
                current_pssm = [i[1:] for i in current_pssm]
                for i in range(len(current_pssm)):
                    current_pssm[i] = [float(j) for j in current_pssm[i]]
                consensus = nc.consensus_from_PWM(current_pssm, bases, 0.25, transform = True)
                protein_name = list(protein_name)
                if protein_name[:4] == ["S", "R", "S", "F"]:
                    protein_name[:4] = ["S", "F", "R", "S"]
                protein_name = "".join(protein_name)
                new_record = [protein_name, "RBPmap_PWM", "NULL", "various", consensus]
                db_fields.append(new_record)

    protein_number_after = (len(list(set([i[0] for i in db_fields]))))
    print("After adding additional sequences from RBPmap PSSMs, there are {0} proteins.\n".format(protein_number_after))

    SFmap_data = rw.read_many_fields(SFmap_proteins, delimiter = ",")

    for i in SFmap_data:
        if "," in i[1]:
            temp_split = i[1].split(", ")
            temp_split = [j.upper() for j in temp_split]
            i[1] = ";".join(temp_split)
        else:
            i[1] = i[1].upper()
        new_record = [i[0], "SFmap", "NULL", "various", i[1]]
        db_fields.append(new_record)

    protein_number_after = (len(list(set([i[0] for i in db_fields]))))
    print("After adding motifs from SFmap, there are {0} proteins.\n".format(protein_number_after))

    RNAc = rw.read_many_fields(RNAcompete_information, delimiter = "\t")
    RNAc = [i for i in RNAc[1:] if i]
    if species == "H**o sapiens":
        RNAc = [i for i in RNAc if i[3] != "." and i[8] == "D"]
    if species == "Mus musculus":
        RNAc = [i for i in RNAc if i[3] != "."]

    PSSM_folder = RNAcompete_PWMs
    for record in RNAc:
        motif_name = record[3]
        initial_pssm = rw.read_many_fields(os.path.join(PSSM_folder, "{0}.txt".format(motif_name)), delimiter = "\t")
        if initial_pssm == []:
            if record[19] == "21036867":#RBPDB paper
                pass
            else:
                print(record)
        else:    
            current_pssm = initial_pssm[1:]
            current_pssm = [i[1:] for i in current_pssm]
            for i in range(len(current_pssm)):
                current_pssm[i] = [float(j) for j in current_pssm[i]]
            consensus = nc.consensus_from_PWM(current_pssm, bases, 0.25, transform = True)
            protein_name = record[6]
            new_record = [protein_name, "CIS-BP_RNA_PWM", record[19], record[14], consensus] 
            db_fields.append(new_record)

    protein_number_after = (len(list(set([i[0] for i in db_fields]))))
    print("After adding motifs from CIS-BP RNA, there are {0} proteins.\n".format(protein_number_after))

    to_delete = []
    for pos, i in enumerate(db_fields):
        if ";" in i[4]:
            if "; " in i[4]:
                temp_split = i[4].split("; ")
            else:
                temp_split = i[4].split(";")
            temp_split = [((j.upper()).lstrip("N")).rstrip("N") for j in temp_split]
            temp_split = [j for j in temp_split if len(j) <= upper_threshold and len(j) >= lower_threshold and "(" not in j]
            if temp_split:
                db_fields[pos][4] = temp_split[0]
                for j in temp_split[1:]:
                    db_fields.append([i[0], i[1], i[2], i[3], j])
            else:
                to_delete.append(pos)
        else:
            i[4] = (((i[4]).upper()).rstrip("N")).lstrip("N")
            if len(i[4]) > upper_threshold or len(i[4]) < lower_threshold or "(" in i[4]:
                to_delete.append(pos)
            else:
                db_fields[pos][4] = i[4]

    db_fields = [i for pos, i in enumerate(db_fields) if pos not in to_delete]

    protein_number_after = (len(list(set([i[0] for i in db_fields]))))
    print("After only keeping motifs of length {0}-{1} bp, {2} proteins remain.\n".format(lower_threshold, upper_threshold, protein_number_after))

    protein_names = list(set([i[0] for i in db_fields]))

    if species == "Mus musculus":
        protein_names_file = "RBP/RBP_names_for_checking.txt"
        with open(protein_names_file, "w") as file:
            for name in protein_names:
                file.write("{0}\n".format(name))
        MGI_file = "RBP/MGI_correspondances.txt"
        MGI = rw.read_many_fields(MGI_file, "\t")
        MGI_names_all = [i[0] for i in MGI[1:]]
        found = [i[0] for i in MGI if i[0] == i[3]]
        MGI = {i[0]: i[3] for i in MGI[1:] if i[0] not in found}

    to_delete = []
    for pos, i in enumerate(db_fields):
        if species == "Mus musculus":
            db_fields[pos][0] = "".join([db_fields[pos][0][0].upper(), db_fields[pos][0][1:].lower()])
            #will get rid of Hnrnpcl1, which didn't return anything in the MGI search.
            if db_fields[pos][0] not in MGI_names_all:
                to_delete.append(pos)
            else:
                if db_fields[pos][0] not in found:
                    db_fields[pos][0] = MGI[db_fields[pos][0]]
        elif species == "H**o sapiens":
            if i[0] == "A2BP1" or i[0] == "FOX1":
                db_fields[pos][0] = "RBFOX1"
            elif i[0] == "SFRS13A":
                db_fields[pos][0] = "SRSF10"
            elif i[0][:6] == "BRUNOL":
                db_fields[pos][0] = "CELF{0}".format(i[0][-1])
            elif i[0] == "CUGBP":
                db_fields[pos][0] = "CELF1"
            elif i[0] == "Fusip1":
                db_fields[pos][0] = "SRSF10"
            elif i[0][:4] == "SFRS":
                db_fields[pos][0] = "SRSF{0}".format(i[0][4:])
            elif i[0] == "HuR":
                db_fields[pos][0] = "ELAVL1"
            elif i[0] == "MBNL":
                db_fields[pos][0] = "MBNL1"
            elif i[0] == "PTB":
                db_fields[pos][0] = "PTBP1"
            elif i[0] == "QK1":
                db_fields[pos][0] = "QKI"
            elif i[0] == "RBM9":
                db_fields[pos][0] = "RBFOX2"
            elif i[0] == "STAR-PAP":
                db_fields[pos][0] = "TUT1"
            elif i[0] == "YB-1":
                db_fields[pos][0] = "YBX1"
            elif i[0] == "hnRNPK":
                db_fields[pos][0] = "HNRNPK"
            elif i[0] == "hnRNPLL" or i[0] == "HNRPLL":
                db_fields[pos][0] = "HNRNPLL"

    db_fields = [i for pos, i in enumerate(db_fields) if pos not in to_delete]

    protein_names = list(set([i[0] for i in db_fields]))

    protein_number_after = (len(list(set([i[0] for i in db_fields]))))
    print("After cleaning up protein IDs, {0} proteins remain.\n".format(protein_number_after))
            
    protein_dict = {}
    for i in db_fields:
        if i[0] not in protein_dict.keys():
            protein_dict[i[0]] = [i]
        else:
            protein_dict[i[0]].append(i)

    if species == "H**o sapeins":
        del protein_dict["PPIE"]
        del protein_dict["MIR1236"]
        del protein_dict["PABPC4"]
        print("After removing PPIE, PABPC4 and MIR1236, {0} proteins remain.\n".format(len(protein_dict)))
    elif species == "Mus musculus":
        del protein_dict["Pabpc4"]
        print("After removing Pabpc4, {0} proteins remain.\n".format(len(protein_dict)))

    for i in protein_dict:
        if i == "ELAVL1":
            protein_dict[i].append(['ELAVL1', 'synthetic', 'synthetic', 'synthetic', 'UUWGDUU'])
        elif i == "ELAVL2":
            protein_dict[i].append(['ELAVL2', 'synthetic', 'synthetic', 'synthetic', 'RWUUYAUUUWR'])
        protein_dict[i] = sorted(protein_dict[i], key = lambda x:x[4])
        current_motifs = [j[4] for j in protein_dict[i]]
        to_delete = []
        for j in range(1, len(current_motifs)):
            if current_motifs[j] == current_motifs[j-1]:
                for k in range(1, 4):
                    protein_dict[i][j][k] = ",".join([protein_dict[i][j][k], protein_dict[i][j - 1][k]])
                to_delete.append(j - 1)
        protein_dict[i] = [protein_dict[i][j] for j in range(len(protein_dict[i])) if j not in to_delete]

    for i in protein_dict:
        protein_dict[i] = [[j[0], j[4], j[1], j[2], j[3]] for j in protein_dict[i]] 

    print("\n")
    print("Writing motifs to {0}.\n".format(final_motifs_file_name))

    motif_numbers = []
    with open(final_motifs_file_name, "w") as final_motifs_file:
        for i in sorted(list(protein_dict.keys())):
            final_motifs_file.write(">{0}\n".format(i))
            current_motifs = [j[1] for j in protein_dict[i]]
            DNA_motifs = [nc.DNA_RNA_conversion(j) for j in current_motifs]
            unravelled_motifs = [nc.unravel_consensus(j) for j in DNA_motifs]
            unravelled_motifs = flatten(unravelled_motifs)
            unravelled_motifs = list(set(unravelled_motifs))
            print("Writing {0} motifs for {1}.".format(len(unravelled_motifs), i))
            motif_numbers.append(len(unravelled_motifs))
            unravelled_motifs = "|".join(unravelled_motifs)
            final_motifs_file.write("{0}\n".format(unravelled_motifs))

    plt.figure(1)
    plotting.histogram(motif_numbers, 50, x_lab = "Motif number", y_lab = "Frequency", title = None)
    plotting.save_and_show([10, 10], 100, plot_name)
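The final loop converts each protein's motifs from RNA to DNA and expands degenerate IUPAC bases into every concrete k-mer via nc.DNA_RNA_conversion and nc.unravel_consensus, neither of which appears in this listing. Below is a small, self-contained sketch of that expansion step; the lookup table and function name are illustrative assumptions.

import itertools

IUPAC_DNA = {"A": "A", "C": "C", "G": "G", "T": "T",
             "R": "AG", "Y": "CT", "W": "AT", "S": "CG", "K": "GT", "M": "AC",
             "B": "CGT", "D": "AGT", "H": "ACT", "V": "ACG", "N": "ACGT"}

def unravel_consensus_sketch(consensus):
    # expand an IUPAC DNA consensus into all concrete motifs,
    # e.g. "TTWGDTT" -> 6 motifs (W = A/T, D = A/G/T)
    options = [IUPAC_DNA[base] for base in consensus.upper()]
    return ["".join(combination) for combination in itertools.product(*options)]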
Code example #23
def main():
    description = "Run mDFEest."
    args = parse_arguments(description, ["hit_file", "control_file", "SNP_file", "SNP_number", "input_file", "output_file", "seed", "fixed_model", "new_input", "shuffle", "fix_pop_change"], ints = [3], flags = [8, 9, 10])
    hit_file, control_file, SNP_file, SNP_number, input_file, output_file, seed, fixed_model, new_input, shuffle, fix_pop_change = args.hit_file, args.control_file, args.SNP_file, args.SNP_number, args.input_file, args.output_file, args.seed, args.fixed_model, args.new_input, args.shuffle, args.fix_pop_change

    #if you want to generate a new input file rather than reading in an existing one
    if new_input:
        remove_file("../multidfe/{0}".format(input_file.split("/")[-1]))
        arguments = ["python3", "mDFEest_input.py", hit_file, control_file, SNP_file, SNP_number, input_file]
        if shuffle:
            arguments.append("--shuffle")
        run_process(arguments)
    
    if seed == "None":
        seed = None
    else:
        seed = float(seed)

    #if you want to run it only with a population size change model,
    #rather than both a model assuming population size change and a fixed population
    #size model
    if fix_pop_change:
        pop_change = [True]
    else:
        pop_change = [False, True]

    if fixed_model == "None":
        #all possible models
        allowed = ["lognormal", "gamma", "beta", "spikes", "steps", "fixed six spikes"]
        spike_range = [2, 6]
    else:
        #only the specified model
        allowed = [fixed_model]
        #only two-spike models
        spike_range = [2, 3]

    with open(output_file, "w") as file:
        file.write("model\tpop_change\tAIC\tNes_0.0_0.1\tNes_0.1_1.0\tNes_1.0_10.0\tNes_10.0_100.0\traw\n")
        for change_mode in pop_change:
    
            print("\nPopulation expansion: {0}.".format(str(change_mode)))

            if "lognormal" in allowed:
                print("lognormal model:")
                output = mDFEest("lognormal", input_file, pop_change = change_mode, seed = seed)
                print(output)
                write_mDFEest_output(output, file, change_mode)

            if "gamma" in allowed:
                print("gamma model:")
                output = mDFEest("gamma", input_file, pop_change = change_mode, seed = seed)
                print(output)
                write_mDFEest_output(output, file, change_mode)

            if "beta" in allowed:
                print("beta model:")
                output = mDFEest("beta", input_file, pop_change = change_mode, seed = seed)
                print(output)
                write_mDFEest_output(output, file, change_mode)

            for spike_number in range(spike_range[0], spike_range[1]):

                if "spikes" in allowed:
                    print("{0}-spikes model:".format(spike_number))
                    output = mDFEest("spikes", input_file, n_spikes = spike_number, seed = seed, repetitions = 10, pop_change = change_mode)
                    print(output)
                    write_mDFEest_output(output, file, change_mode)

                if "steps" in allowed:
                    print("{0}-steps model:".format(spike_number))
                    output = mDFEest("steps", input_file, n_spikes = spike_number, seed = seed, repetitions = 10, pop_change = change_mode)
                    print(output)
                    write_mDFEest_output(output, file, change_mode)

            if "fixed six spikes" in allowed:
                print("fixed six spikes model:")
                output = mDFEest("six_spikes", input_file, pop_change = change_mode, seed = seed)
                print(output)
                write_mDFEest_output(output, file, change_mode)
Code example #24
def main():
    description = "Aggregate various statistics on the splicing events you're studying."
    args = hk.parse_arguments(description, [
        "gtf", "polII_bed", "exon_start_coords", "truncated_exons_file",
        "genome_file", "output_file"
    ])
    gtf, polII_bed, exon_start_coords, truncated_exons_file, genome_file, output_file = args.gtf, args.polII_bed, args.exon_start_coords, args.truncated_exons_file, args.genome_file, args.output_file

    CDSs = rw.read_gtf(gtf, "CDS", gene=False)
    exons = rw.read_gtf(gtf, "exon", gene=False)
    exon_starts = rw.read_many_fields(exon_start_coords,
                                      skip_header=False,
                                      delimiter="\t")
    exon_starts = {i[3]: i for i in exon_starts}
    out_array = np.array(sorted(exon_starts.keys()), dtype="str")
    out_array.shape = (len(exon_starts.keys()), 1)
    out_array = np.vstack((["junction"], out_array))

    #1. exon size
    curr_dict = co.get_lengths(CDSs, exon_starts.keys())
    out_array = add_to_array(out_array, curr_dict, "exon_size")
    print("Exon size done.")

    #2. exon number
    curr_dict = co.get_exon_number(exons, exon_starts.keys())
    out_array = add_to_array(out_array, curr_dict, "exon_number")
    print("Exon number done.")

    #3. exon rank (from start and end)
    exon_rank_start, exon_rank_end = co.get_exon_rank(exons, exon_starts)
    out_array = add_to_array(out_array, exon_rank_start,
                             "exon_rank_from_start")
    out_array = add_to_array(out_array, exon_rank_end, "exon_rank_from_end")
    print("Exon rank done.")

    #4. upstream intron size
    curr_dict = co.get_upstream_intron_size(exons, exon_rank_start)
    out_array = add_to_array(out_array, curr_dict, "upstream_intron_size")
    curr_dict = co.get_upstream_intron_size(exons,
                                            exon_rank_start,
                                            downstream=True)
    out_array = add_to_array(out_array, curr_dict, "downstream_intron_size")
    print("Intron size done.")

    if truncated_exons_file != "None":

        #5. Pol II density per transcript
        dens_per_trans_file = "{0}_dens_per_trans.txt".format(polII_bed[:-4])
        dens_per_trans_junctions = get_dens_per_trans(truncated_exons_file,
                                                      polII_bed,
                                                      dens_per_trans_file,
                                                      out_array[1:, 0])
        out_array = add_to_array(out_array, dens_per_trans_junctions,
                                 "polII_dens_per_trans")
        print("Pol II density done.")

    #6. exon GC4 and GC content
    genome = Fasta(genome_file)
    curr_dict = get_exon_GC4(CDSs, exons, exon_rank_start, genome)
    out_array = add_to_array(out_array, curr_dict, "exon_GC4")
    curr_dict = get_exon_GC(exons, exon_rank_start, genome)
    out_array = add_to_array(out_array, curr_dict, "exon_GC")
    print("Exon GC done.")

    #7. upstream intron GC content
    curr_dict = get_upstream_intron_GC(exons, exon_rank_start, genome)
    out_array = add_to_array(out_array, curr_dict, "upstream_intron_GC")
    print("Intron GC done.")

    #8. splice site strength
    curr_dict = nc.get_ss_strength(exons,
                                   genome_file,
                                   upstream=True,
                                   five=True,
                                   exonic=3,
                                   intronic=6)
    out_array = add_to_array(out_array, curr_dict, "upstream_5ss_strength")
    curr_dict = nc.get_ss_strength(exons,
                                   genome_file,
                                   upstream=True,
                                   five=False,
                                   exonic=3,
                                   intronic=20)
    out_array = add_to_array(out_array, curr_dict, "upstream_3ss_strength")
    curr_dict = nc.get_ss_strength(exons,
                                   genome_file,
                                   upstream=False,
                                   five=True,
                                   exonic=3,
                                   intronic=6)
    out_array = add_to_array(out_array, curr_dict, "downstream_5ss_strength")
    print("Splice site strength done.")

    with open(output_file, "w") as file:
        for line in range(0, out_array.shape[0]):
            line = out_array[line, :]
            line = "\t".join([str(i) for i in line])
            file.write(line)
            file.write("\n")
Code example #25
def main():
    description = "Run INSIGHT on a set of sequences and a set of sites."
    args = parse_arguments(description, ["fasta", "genome", "features_file", "families_file", "suffix", "dataset", "output_folder", "freq_threshold", "n", "hit_file", "control_file", "SNP_file_name_prefix", "CDS_SNP_file_name_prefix", "MSA_file_name_prefix", "trial_file", "trials", "hit_degen_file", "control_degen_file", "hit_reduce", "control_reduce", "new_SNPs", "new_MSA", "shuffle", "nonsyn_hits", "remove_GT", "big_tree"], floats = [7, 18, 19], ints = [8, 15], flags = [20, 21, 22, 23, 24, 25])
    fasta, genome, features_file, families_file, suffix, dataset, general_output_folder, freq_threshold, n, hit_file, control_file, SNP_file_name_prefix, CDS_SNP_file_name_prefix, MSA_file_name_prefix, trial_file, trials, hit_degen_file, control_degen_file, hit_reduce, control_reduce, new_SNPs, new_MSA, shuffle, nonsyn_hits, remove_GT, big_tree = args.fasta, args.genome, args.features_file, args.families_file, args.suffix, args.dataset, args.output_folder, args.freq_threshold, args.n, args.hit_file, args.control_file, args.SNP_file_name_prefix, args.CDS_SNP_file_name_prefix, args.MSA_file_name_prefix, args.trial_file, args.trials, args.hit_degen_file, args.control_degen_file, args.hit_reduce, args.control_reduce, args.new_SNPs, args.new_MSA, args.shuffle, args.nonsyn_hits, args.remove_GT, args.big_tree
    output_folder = "{0}/{1}_{2}".format(general_output_folder, dataset, suffix)

    names, seqs = rw.read_fasta(fasta)

    #prepare feature set and family information
    fs = Feature_Set(features_file, genome)
    fs.set_dataset(dataset)
    if families_file == "None":
        conservation.find_families(fasta, "general/{0}".format(dataset))
        families_file = "general/{0}_families.txt".format(dataset)
    families = rw.read_families(families_file)
    fs.add_families(families)

    make_dir(output_folder)

    general_folder = "DFE/for_everybody"
    make_dir(general_folder)
    if MSA_file_name_prefix == "None":
        MSA_file_name_prefix = "{0}/{1}_MSA".format(general_folder, dataset)

    #read in degeneracy information
    if hit_degen_file != "None":
        degen_hits = parse_degen(hit_degen_file)
        degen_controls = parse_degen(control_degen_file)
    else:
        degen_hits = None
        degen_controls = None

    #get relevant genome features
    transcripts = fs.get_transcripts()
    CDSs = fs.get_CDS()
    lengths = fs.get_lengths(CDSs, CDS = True)
    #filter out sex chromosomes from the analysis
    sex_chromosomes = ["X", "Y"]
    chrom_dict = {i: transcripts[i][0] for i in transcripts if transcripts[i][0] not in sex_chromosomes}
    chroms = list(set(list(chrom_dict.values())))

    clean_names = ["h**o", "pan", "pongo", "macaca"]

    #if you're running several trials
    #if just one, it'll still make a single trial file
    if trial_file == "None":
        trial_file = "{0}_{1}_{2}.txt".format(trial_file, suffix, trials)
        

    with open(trial_file, "w") as o_file:
        print(suffix)
        #output file header
        o_file.write("rho\teta\tgamma\tDp\tPw\talpha\ttau\trhose\tetase\tgammase\trholl\tetall\tgammall\n")
        for trial in range(trials):
            print("==========TRIAL {0}==========\n".format(trial))


            #get INSIGHT input data as a string based on divergence and SNP data
            hit_output, neutral_output, chroms_to_keep, hit_counts, control_counts = get_MSA(chroms, chrom_dict, control_file, hit_file, CDSs, lengths, names, seqs, clean_names, freq_threshold, dataset, suffix, genome, output_folder, general_folder, n, SNP_file_name_prefix, CDS_SNP_file_name_prefix, MSA_file_name_prefix, new_SNPs, new_MSA, shuffle, remove_GT, big_tree, hit_reduce = hit_reduce, control_reduce = control_reduce,  degen_hits = degen_hits, degen_controls = degen_controls)

            print("Writing output files...")
            neutral_output_file = "{0}/{1}_{2}_{3}_neutral_input.txt".format(output_folder, dataset, suffix, trial)
            hit_output_file = "{0}/{1}_{2}_{3}_hit_input.txt".format(output_folder, dataset, suffix, trial)
            write_output_file(neutral_output_file, neutral_output, n)
            write_output_file(hit_output_file, hit_output, n)

            print("Running INSIGHT...")
            conservation.INSIGHT(neutral_output_file, hit_output_file, freq_threshold, "../Software/INSIGHT", "{0}_{1}".format(dataset, suffix))

            print("Counting positions on chromosomes...")
            with open("{0}/{1}_{2}_pos_per_chrom.csv".format(output_folder, dataset, suffix), "w") as file:
                file.write("chrom\thits\tcontrols\n")
                for chrom in sorted(chroms_to_keep):
                    file.write("{0}\t{1}\t{2}\n".format(chrom, hit_counts[chrom], control_counts[chrom]))

            INSIGHT_output = "../Software/INSIGHT/{0}_{1}.ins.log".format(dataset, suffix)
            #parse the INSIGHT output and do simple significance testing
            try:
                parsed_output = parse_INSIGHT_output(INSIGHT_output)
                estimates = parsed_output["estimates"]
                SE = parsed_output["SEs"]
                lls = parsed_output["chi_sq"]

                print("\n")
                print("Chisq statistics: {0}".format(" ".join([str(i) for i in lls])))
                rho_pL = scipy.stats.chi2.sf(lls[0], 3)
                print("pL(rho): {0}".format(rho_pL))
                eta_pL = scipy.stats.chi2.sf(lls[1], 1)
                print("pL(eta): {0}".format(eta_pL))
                gamma_pL = scipy.stats.chi2.sf(lls[2], 1)
                print("pL(gamma): {0}".format(gamma_pL))
                
                lls = "\t".join([str(i) for i in lls])
                estimates = "\t".join(estimates)
                SE = "\t".join(SE)
                o_file.write(estimates)
                o_file.write("\t")
                o_file.write(SE)
                o_file.write("\t")
                o_file.write(lls)
                o_file.write("\n")
            #skip trials where INSIGHT failed to produce a full output
            except IndexError:
                print("Skipping...")
                pass
Code example #26
def main():
    description = "Record splicing distance."
    args = hk.parse_arguments(description, ["input_file", "gtf", "output_folder", "trans_active_file", "window_size", "intron_window_size", "outsuffix", "leave_terminal"], ints = [4, 5], flags = [7])
    input_file, gtf, output_folder, trans_active_file, window_size, intron_window_size, outsuffix, leave_terminal = args.input_file, args.gtf, args.output_folder, args.trans_active_file, args.window_size, args.intron_window_size, args.outsuffix, args.leave_terminal

    if outsuffix == "None":
        outsuffix = ""

    bare_input_path = input_file.split("/")[-1]
    bed = "{0}.bed".format(input_file[:-4])
    # hk.convert2bed(input_file, bed)

    # get descriptive stats of the reads
    length_file = "{0}/{1}_read_lengths.txt".format(output_folder, bare_input_path[:-4])
    write_read_lengths(bed, length_file)

    # read in CDS coordinates
    exons = rw.read_gtf(gtf, "CDS", gene=False)
    # only leave transcriptionally active genes (one isoform per gene)
    trans_active_genes = rw.read_many_fields(trans_active_file, "\t")[1:]
    # pull out the column with transcript IDs
    trans_active_genes = [i[3] for i in trans_active_genes]
    exons = {i: exons[i] for i in exons if i in trans_active_genes}
    terminal_suff = "_with_terminal"
    if not leave_terminal:
        # remove last exons
        exons = {i: exons[i][:-1] for i in exons}
        terminal_suff = ""
    # prepare exon-exon junctions
    exon_junctions_file = "{0}_exon_junctions{1}{2}.bed".format(gtf[:-4], outsuffix, terminal_suff)
    all_junctions = co.extract_3ss(exons, exon_junctions_file)

    out_bed = "{0}/{1}_first_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed, exons, window_size, add_chr=True)
    out_bed_end = "{0}/{1}_last_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed_end, exons, window_size, add_chr=True, from_end=True)
    intron_bed = "{0}/{1}_first_{2}_intronic_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], intron_window_size, outsuffix, terminal_suff)
    write_intron_starts(all_junctions, intron_bed, exons, intron_window_size, add_chr=True)
    out_bed = "{0}/{1}_first_centred_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed, exons, window_size, add_chr=True, centre=True)
    out_bed_end = "{0}/{1}_last_centred_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed_end, exons, window_size, add_chr=True, from_end=True, centre=True)
    out_bed_si = "{0}/{1}_si_pos{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)
    write_si_pos(all_junctions, out_bed_si, exons, add_chr=True)
    out_bed_si_current = "{0}/{1}_si_pos_current{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)
    write_si_pos(all_junctions, out_bed_si_current, exons, add_chr=True, curr_exon=True)
    # check which junctions are associated with a splicing intermediate read
    snr_bed = "{0}_snr.bed".format(bed[:-4])
    co.snr_bed(bed, snr_bed)
    si_counts_bed = "{0}/{1}_si_counts{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)
    co.intersect_bed(out_bed_si, snr_bed, force_strand=True, hit_count=True, no_dups=False, output_file=si_counts_bed)
    si_counts_current_bed = "{0}/{1}_si_counts_current{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)
    co.intersect_bed(out_bed_si_current, snr_bed, force_strand=True, hit_count=True, no_dups=False, output_file=si_counts_current_bed)

    # filter out reads that don't overlap exon-exon junctions
    exon_junction_bed = "{0}_exon_junctions{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff)
    co.intersect_bed(bed, exon_junctions_file, write_both=True,
                     output_file=exon_junction_bed,
                  force_strand=True, no_dups=False)

    spliced_bed = "{0}_spliced{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff)
    unspliced_bed = "{0}_unspliced{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff)
    sr_distances = {}
    ur_distances = {}
    found_count = 0
    file_size = hk.line_count(exon_junction_bed)

    # will store all the intron names for which there are
    # either spliced or unspliced reads
    valid_junctions = []
    with open(exon_junction_bed) as file, open(spliced_bed, "w") as sfile, open(unspliced_bed, "w") as ufile:
        for pos, line in enumerate(file):

            if pos % 100000 == 0:
                print("{0}/{1}".format(pos, file_size))
                print("Found {0} spliced reads.".format(found_count))
                print("\n")

            line = line.split("\t")

            # reads that end at the last nucleotide of an exon
            intermediate_read = NGS.check_intermediate_read(line, exons)
            intron_name = line[20]

            if not intermediate_read:

                # check that it ends within the exon just downstream of
                # the 3' ss that is being analyzed

                in_dwns_exon = NGS.check_position_in_exon(line, exons)

                if in_dwns_exon:

                    # 'spliced', 'unspliced' or 'None' (=can't analyze)
                    read_type = NGS.analyze_cigar(line, overhang = 5)

                    if read_type:
                        if intron_name not in valid_junctions:
                            valid_junctions.append(intron_name)
                        splice_dist = NGS.get_splice_dist(line)
                        if read_type == "S":
                            sfile.write("\t".join([str(i) for i in line]))
                            found_count = found_count + 1
                            sr_distances = update_dist_dict(intron_name, sr_distances, splice_dist)
                        else:
                            ufile.write("\t".join([str(i) for i in line]))
                            ur_distances = update_dist_dict(intron_name, ur_distances, splice_dist)

    print("Proportion of spliced reads: {0}.".format(found_count/(pos + 1)))

    # for each valid junction, calculate the length of the exonic sequence
    # afterwards, so that you wouldn't consider intronic sequence in the distance
    # matrix
    lengths_dict = co.get_lengths(exons, valid_junctions)

    write_dist_mat(sr_distances, window_size,
                   "{0}/{1}_spliced_read_distances_{2}{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff),
                   lengths_dict,
                   "{0}/{1}_spliced_read_{2}_intron_names{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff),
                   "{0}/{1}_spliced_read_first_spliced{2}{3}.txt".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff))

    write_dist_mat(ur_distances, window_size,
                   "{0}/{1}_unspliced_read_distances_{2}{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff),
                   lengths_dict,
                   "{0}/{1}_unspliced_read_{2}_intron_names{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff),
                   "{0}/{1}_unspliced_read_first_unspliced{2}{3}.txt".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff))
Code example #27
def main():
    description = "Calculate the normalized dS of a dataset."
    args = parse_arguments(description, [
        "dataset", "feature_set", "genome", "families_file", "fasta",
        "hit_file_prefix", "motifs_file", "correspondances", "alignments",
        "suffix", "trials", "trial_file", "old_trial_file", "region_fasta",
        "old_motif_format", "nonsense", "no_families", "newest_only",
        "top_set_only", "calc_p", "reverse_site_numbers", "matched", "degen",
        "regions"
    ],
                           ints=[10],
                           flags=[14, 15, 16, 17, 18, 19, 20, 21, 22, 23])
    dataset, feature_set, genome, families_file, fasta, hit_file_prefix, motifs_file, correspondances, alignments, suffix, trials, trial_file, old_trial_file, region_fasta, old_motif_format, nonsense, no_families, newest_only, top_set_only, calc_p, reverse_site_numbers, matched, degen, regions = args.dataset, args.feature_set, args.genome, args.families_file, args.fasta, args.hit_file_prefix, args.motifs_file, args.correspondances, args.alignments, args.suffix, args.trials, args.trial_file, args.old_trial_file, args.region_fasta, args.old_motif_format, args.nonsense, args.no_families, args.newest_only, args.top_set_only, args.calc_p, args.reverse_site_numbers, args.matched, args.degen, args.regions

    n_sim = 1000

    print(suffix)

    #set up feature set and families
    fs = Feature_Set(feature_set, genome)
    fs.set_dataset(dataset)
    if no_families:
        picked = fs.names
    else:
        families = rw.read_families(families_file)
        fs.add_families(families)
        picked = fs.pick_random_members()

    hit_phylip = "temp_data/temp_{0}.phy".format(random.random())
    control_phylip = "temp_data/temp_control_{0}.phy".format(random.random())

    if not nonsense:
        if old_motif_format:
            motifs = rw.read_names(motifs_file)[1:]
        else:
            motifs = rw.read_motifs(motifs_file)
            if top_set_only:
                summary_data = rw.read_many_fields(
                    "RBP/RBP_hg38_introncontaining_new.txt", "\t")
                summary_dict = list_to_dict(summary_data, 0, 4, floatify=True)
                motifs = {
                    RBP: motifs[RBP]
                    for RBP in motifs if (summary_dict[RBP] < 0.1)
                }
            motifs = list(set(flatten(motifs.values())))

    if reverse_site_numbers:
        site_number_suffix = "_reversed_site_numbers_"
    else:
        site_number_suffix = ""

    if matched:
        matched_suff = "_matched"
    else:
        matched_suff = ""

    if degen:
        degen_suff = "_degen.txt"
    else:
        degen_suff = ""

    with open(trial_file, "w") as trial_out:

        trial_out.write(
            "trial\tA\tT\tC\tG\told\told_no_hum_CG\tnew_no_human_CG\tnew_no_hum_no_anc_CG\tnew_w_CG\tnew_no_anc_CG\tnew_no_anc_CG_macaque\tnewer_no_human_CG\tnewer_no_hum_no_anc_CG\tnewer_w_CG\tnewer_no_anc_CG\n"
        )
        if old_trial_file != "None":
            old_trials = rw.read_many_fields(old_trial_file, "\t")
            old_trials = old_trials[1:]
            old_trials = [i[1:5] for i in old_trials]
            seed_kmers = 1
        else:
            seed_kmers = None

        #you can do this for loads of trials
        #useful as a negative control if you're generating a new set of nonsense motifs
        #each time
        for trial in range(trials):

            print(trial)

            trial_output = [trial]

            #if you're meant to generate a load of nonsense motifs rather than using real motifs
            if nonsense:
                if old_trial_file != "None":
                    #read in the intended nucleotide composition of the nonsense
                    #motifs from file
                    scaled_comp = [float(i) for i in old_trials[trial]]
                else:
                    #pick nonsense motifs nucleotide composition by chance
                    comp = [random.random() for i in range(4)]
                    scaled_comp = [i / np.sum(comp) for i in comp]
                comp_dict = {
                    i: scaled_comp[pos]
                    for pos, i in enumerate(nc._canon_bases_)
                }
                motifs, obtained_dict = nc.kmers_from_nc(6,
                                                         50,
                                                         comp_dict=comp_dict,
                                                         return_freqs=True,
                                                         seed=seed_kmers)
                motifs = ["motifs"] + motifs
                trial_output = trial_output + [
                    obtained_dict[i] for i in nc._canon_bases_
                ]
                temp_motifs_file = "temp_data/temp_motifs.txt"
                rw.write_names(motifs, temp_motifs_file)

            print(
                "===NEW METHOD WITH NO ANCESTRAL CpG (MACAQUE, BIG TREE, CONTEXT), REPLACEMENT CONTROL==="
            )
            hit_file = "{0}_hits_no_anc_CG_only_macaque_big_context{1}_replace.txt{2}".format(
                hit_file_prefix, matched_suff, degen_suff)
            control_file = "{0}_controls_no_anc_CG_only_macaque_big_context{1}_replace.txt{2}".format(
                hit_file_prefix, matched_suff, degen_suff)
            if nonsense:
                hit_file = "temp_data/temp_hits{0}.txt".format(random.random())
                control_file = "temp_data/temp_controls{0}.txt".format(
                    random.random())
                error_file = "temp_data/temp_error{0}.txt".format(
                    random.random())
                get_control_sites(
                    fasta, genome, feature_set, families_file, dataset,
                    temp_motifs_file, hit_file, control_file, error_file,
                    "DFE/for_everybody/filtered_hg38_85_pc_multiexon_anc_CG_big_context_threshold05.txt",
                    [
                        "--leave_CG", "--context", "--remove_ancestral_CpG",
                        "--macaque_anc", "--big_tree", "--replacement_control"
                    ])
            get_density(fasta, motifs, fs)
            norm_ds = get_new_method_results(hit_file,
                                             control_file,
                                             hit_phylip,
                                             control_phylip,
                                             correspondances,
                                             alignments,
                                             fasta,
                                             regions=regions,
                                             global_fasta=region_fasta,
                                             fs=fs)
            trial_output.append(norm_ds)
            if calc_p:
                p, low_CI, high_CI, sd, Z = get_sim_p(
                    norm_ds,
                    hit_file,
                    control_file,
                    correspondances,
                    alignments,
                    fasta,
                    n_sim,
                    reverse_site_numbers=reverse_site_numbers,
                    sim_ds_file=
                    "{0}{1}_sim_norm_ds_no_anc_CG_only_macaque_big_context{2}_replace.txt{3}"
                    .format(hit_file_prefix, site_number_suffix, matched_suff,
                            degen_suff))

            trial_output = "\t".join([str(i) for i in trial_output])
            trial_out.write(trial_output)
            trial_out.write("\n")

            remove_file(hit_phylip)
Code example #28
File: write_TSS.py Project: rosinaSav/dNETseq_code
def main():
    description = "Write out a BED file with the region surrounding the TSS for a set of genes."
    args = hk.parse_arguments(
        description,
        ["genes_file", "gtf", "outfile", "start_coord", "end_coord"],
        ints=[3, 4])
    genes_file, gtf, outfile, start_coord, end_coord = args.genes_file, args.gtf, args.outfile, args.start_coord, args.end_coord

    need_to_seek = False
    if genes_file[-3:] != "bed":
        # it means that you got a list of gene symbols rather than a
        # BED file with coordinates
        need_to_seek = True
        transcript_file = "{0}_transcripts.gtf".format(gtf[:-4])
        co.get_transcripts(gtf,
                           transcript_file,
                           add_chr=False,
                           with_detail=False,
                           output_gtf=True)

    with open(genes_file) as gf, open(outfile, "w") as of:
        reader = csv.reader(gf, delimiter="\t")
        writer = csv.writer(of, delimiter="\t")
        for line in reader:
            if need_to_seek:
                gene = line[0]
                print(gene)
                possibilities = hk.run_process([
                    "grep", "gene_symbol \"{0}\"".format(gene),
                    transcript_file
                ])
                possibilities = [
                    i.split("\t") for i in possibilities.split("\n")[:-1]
                ]
                chrom = "chr{0}".format(possibilities[0][0])
                strand = possibilities[0][6]
                starts = [i[3] for i in possibilities]
                ends = [i[4] for i in possibilities]
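                # GTF coordinates are 1-based and inclusive; the window written
                # below is converted to a 0-based, half-open BED interval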
                if strand == "+":
                    counts = Counter(starts)
                    start = int(counts.most_common()[0][0])
                    end = ends[starts.index(str(start))]
                    length = int(end) - start
                    new_end_coord = min(length, end_coord)
                    new_start = str(start - start_coord - 1)
                    new_end = str(start + (new_end_coord - 1))
                elif strand == "-":
                    counts = Counter(ends)
                    end = int(counts.most_common()[0][0])
                    start = starts[ends.index(str(end))]
                    length = end - int(start)
                    new_end_coord = min(length, end_coord)
                    new_start = str(end - new_end_coord)
                    new_end = str(end + start_coord)
                new_line = [chrom, new_start, new_end, gene, ".", strand]
                writer.writerow(new_line)
            else:
                if line[0] != "chrom":
                    length = int(line[2]) - int(line[1])
                    curr_end_coord = min(length, end_coord)
                    if line[5] == "+":
                        new_start = str(int(line[1]) - start_coord)
                        new_end = str(int(line[1]) + curr_end_coord)
                    elif line[5] == "-":
                        new_start = str(int(line[2]) - curr_end_coord)
                        new_end = str(int(line[2]) + start_coord)
                    else:
                        raise Exception("Invalid strand!")
                    new_line = line.copy()
                    new_line[1] = new_start
                    new_line[2] = new_end
                    # to make it a BED6
                    new_line = new_line[:-2]
                    writer.writerow(new_line)
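The strand-specific coordinate arithmetic above is the same in both branches of the loop; the hypothetical helper below (not part of write_TSS.py) restates it in isolation, taking 1-based GTF transcript coordinates and returning the 0-based, half-open BED window around the TSS.

def tss_window(start, end, strand, upstream, downstream):
    # start/end are 1-based GTF transcript coordinates; upstream/downstream are
    # the requested number of nucleotides before and after the TSS
    length = end - start
    downstream = min(length, downstream)
    if strand == "+":
        # the TSS is the transcript start
        return start - upstream - 1, start + downstream - 1
    elif strand == "-":
        # the TSS is the transcript end
        return end - downstream, end + upstream
    raise Exception("Invalid strand!")

For example, tss_window(1000, 6000, "+", 500, 2000) gives (499, 2999), i.e. 500 nt upstream of the TSS and 2000 nt into the transcript.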
Code example #29
0
def main():
        description = "Pick roughly nucleotide-matched control sites for a set of motif hits."
        args = parse_arguments(description, ["fasta", "genome", "features_file", "families_file", "dataset", "motifs_file", "run_number", "hit_file", "niter", "stepsize", "control_file", "error_file", "MSA_file_name_prefix", "anc_CG_file_name", "high_CG_file_name", "exclude_file", "brute_mapping", "verbose", "old_motif_format", "nonsyn_hits", "top_set_only", "remove_GT", "leave_CG", "remove_ancestral_CpG", "replacement_control", "macaque_anc", "remove_macaque_CpG", "big_tree", "pseudoCG", "comprehensive", "context", "prone_sites", "CG_gene_filter", "match_size", "raw", "regions"], ints = [6, 8, 9], flags = [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35])
        fasta, genome, features_file, families_file, dataset, motifs_file, run_number, hit_file, niter, stepsize, control_file, error_file, MSA_file_name_prefix, anc_CG_file_name, high_CG_file_name, exclude_file, brute_mapping, verbose, old_motif_format, nonsyn_hits, top_set_only, remove_GT, leave_CG, remove_ancestral_CpG, replacement_control, macaque_anc, remove_macaque_CpG, big_tree, pseudoCG, comprehensive, context, prone_sites, CG_gene_filter, match_size, raw, regions = args.fasta, args.genome, args.features_file, args.families_file, args.dataset, args.motifs_file, args.run_number, args.hit_file, args.niter, args.stepsize, args.control_file, args.error_file, args.MSA_file_name_prefix, args.anc_CG_file_name, args.high_CG_file_name, args.exclude_file, args.brute_mapping, args.verbose, args.old_motif_format, args.nonsyn_hits, args.top_set_only, args.remove_GT, args.leave_CG, args.remove_ancestral_CpG, args.replacement_control, args.macaque_anc, args.remove_macaque_CpG, args.big_tree, args.pseudoCG, args.comprehensive, args.context, args.prone_sites, args.CG_gene_filter, args.match_size, args.raw, args.regions

        #the optional file name comes through argparse as the string "None", so convert it to a real None
        if anc_CG_file_name == "None":
            anc_CG_file_name = None

        #I store motif data in one of two formats
        if old_motif_format:
            motifs = rw.read_names(motifs_file)[1:]
        else:
            motifs = rw.read_motifs(motifs_file)
            #if you're doing RBP motifs and only want motifs that were found to be enriched in Savisaar and Hurst 2017
            if top_set_only:
                summary_data = rw.read_many_fields("RBP/RBP_hg38_introncontaining_new.txt", "\t")

                summary_dict = list_to_dict(summary_data, 0, 4, floatify = True)

                motifs = {RBP: motifs[RBP] for RBP in motifs if (summary_dict[RBP] < 0.1)}
            motifs = list(set(flatten(list(motifs.values()))))

        #create an instance of a Feature_Set object and associate a structure of paralogous families to it, unless you've said to ignore that (used when analyzing exon flanks/cores)
        fs = Feature_Set(features_file, genome)
        fs.set_dataset(dataset)
        if families_file == "None":
            conservation.find_families(fasta, "general/{0}".format(dataset))
            families_file = "general/{0}_families.txt".format(dataset)

        if families_file != "ignore":
            families = rw.read_families(families_file)
            fs.add_families(families)

        general_folder = "DFE/for_everybody"
        make_dir(general_folder)
        #if you've already retrieved MSAs from ensembl
        if MSA_file_name_prefix == "None":
            MSA_file_name_prefix = "{0}/{1}_MSA".format(general_folder, dataset)

        #admin
        transcripts = fs.get_transcripts()
        CDSs = fs.get_CDS()
        lengths = fs.get_lengths(CDSs, CDS = True)
        #only consider genes that are not on the sex chromosomes
        sex_chromosomes = ["X", "Y"]
        chrom_dict = {i: transcripts[i][0] for i in transcripts if transcripts[i][0] not in sex_chromosomes}
        chroms = list(set(list(chrom_dict.values())))

        #U2S is a dinucleotide-based substitution model, JC69 is mononucleotide-based
        if context:
            subst_model = "U2S"
        else:
            subst_model = "JC69"

        #names used in the MSA (there's a character restriction in the phylip files so you can't use the full name)
        clean_names = ["h**o", "pan", "pongo", "macaca"]
        phylip_data = {"homo_sapiens": [], "pongo_abelii": [], "macaca_mulatta": [], "pan_troglodytes": []}
        if big_tree:
            clean_names = ["calli", "chloro", "gorilla", "h**o", "macaca", "pan", "papio", "pongo"]
            phylip_data = {"gorilla_gorilla": [], "callithrix_jacchus": [], "papio_anubis": [], "chlorocebus_sabaeus": [], "homo_sapiens": [], "pongo_abelii": [], "macaca_mulatta": [], "pan_troglodytes": []}

        if remove_ancestral_CpG or remove_macaque_CpG or CG_gene_filter:
            anc_CG_dict, macaque_CG_dict = get_CpG_dicts(CDSs, chroms, MSA_file_name_prefix, lengths, clean_names, phylip_data, fasta, anc_CG_file_name, high_CG_file_name, fs, macaque_anc = macaque_anc, pseudoCG = pseudoCG, comprehensive = comprehensive, subst_model = subst_model, regions = regions)
                                
        else:
            anc_CG_dict = None
            macaque_CG_dict = None
        
        if replacement_control:
            nc.fit_control_pos_to_hits_replacement(fasta, motifs, run_number, hit_file, control_file, anc_CG_dict, macaque_CG_dict, family_seed = 5, CG_gene_filter = CG_gene_filter, niter = niter, verbose = verbose, brute_mapping = brute_mapping, stepsize = stepsize, write_errors = error_file, fs = fs, nonsyn_hits = nonsyn_hits, leave_CG = leave_CG, remove_ancestral_CpG = remove_ancestral_CpG, remove_macaque_CpG = remove_macaque_CpG, pseudoCG = pseudoCG, prone_sites = prone_sites, match_size = match_size, raw = raw, exclude_file = exclude_file)
        else:
            nc.fit_control_pos_to_hits_wrapper(fasta, motifs, run_number, hit_file, control_file, anc_CG_dict, macaque_CG_dict, family_seed = 5, CG_gene_filter = CG_gene_filter, niter = niter, verbose = verbose, brute_mapping = brute_mapping, stepsize = stepsize, write_errors = error_file, fs = fs, nonsyn_hits = nonsyn_hits, leave_CG = leave_CG, remove_ancestral_CpG = remove_ancestral_CpG, remove_macaque_CpG = remove_macaque_CpG, pseudoCG = pseudoCG, prone_sites = prone_sites, match_size = match_size)
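The top_set_only branch near the start of this example relies on list_to_dict, a project helper whose code is not included in this snippet. Judging from the call list_to_dict(summary_data, 0, 4, floatify = True) and the later summary_dict[RBP] < 0.1 test, it maps each row's column 0 to a float taken from column 4; the stand-in below is a sketch under that assumption, not the project's actual implementation.

def list_to_dict(rows, key_col, value_col, floatify=False):
    # assumed behaviour only: key taken from key_col, value from value_col,
    # optionally converted to float
    out = {}
    for row in rows:
        value = row[value_col]
        out[row[key_col]] = float(value) if floatify else value
    return out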