Example #1
def get_MSA_gene_list(coords, coords_file, method, species_set, version,
                      query_species, MSA_file):
    '''
    Given a dictionary of lists of lists of CDS coordinates, retrieve the Compara MSAs.
    '''
    with open(coords_file, "w") as file:
        for trans in coords:
            for exon in coords[trans]:
                phase = exon[1]
                current_coords = exon[0]
                current_coords = [str(i) for i in current_coords]
                current_coords.append(str(phase))
                current_coords = "|".join(current_coords)
                file.write(current_coords)
                file.write("\n")
    remove_file(MSA_file)
    run_process([
        "perl", "MSA_list.pl", method, species_set, version, coords_file,
        query_species, MSA_file
    ])
    with open(MSA_file) as file:
        string = "".join(file)
    string = re.sub("([a-z])\n([a-z])", "\\1\\2", string)
    with open(MSA_file, "w") as file:
        file.write(string)
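# Minimal usage sketch (hypothetical values; assumes MSA_list.pl and the
# Ensembl Compara database are set up as elsewhere in this module):
# cds_coords = {"ENST00000001": [[["13", 32315474, 32315667], 0]]}
# get_MSA_gene_list(cds_coords, "temp_data/coords.txt", "EPO", "mammals",
#                   85, "homo_sapiens", "temp_data/MSAs.txt")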
Example #2
def get_pairwise_alignment(coords, coords_file, query_species, other_species,
                           version, output_file):
    '''
    Given a list of feature coordinates and two species, get the corresponding pairwise alignments from Compara.
    '''
    #write the coordinates to file in a way that can be read by the downstream perl script
    with open(coords_file, "w") as file:
        for feature in coords:
            feature = [str(i) for i in feature]
            feature = "|".join(feature)
            file.write(feature)
            file.write("\n")
    remove_file(output_file)
    #get the alignments from the database
    run_process([
        "perl", "pairwise_from_ensembl.pl", coords_file, query_species,
        other_species, output_file, version
    ])
    #parse them from the output file produced by the perl script
    with open(output_file) as file:
        string = "".join(file)
    string = string.split("***")
    string = [(i.rstrip("\n")).lstrip("\n") for i in string]
    string = [i.split("|||") for i in string]
    string = [[j for j in i if len(j) > 0] for i in string]
    #getting rid of cases where there's multiple GABs
    string = flatten([i for i in string if len(i) == 1])
    #write alignments to a pretty FASTA
    with open(output_file, "w") as file:
        for feature in string:
            temp = feature.split("\n")
            name = temp[0]
            name = name.split("|")
            antisense = False
            if name[6] == "-":
                antisense = True
            try:
                alignments = [temp[2], temp[3]]
                alignments = [i.split(" ") for i in alignments]
                #convert to upper case
                alignments = [([j for j in i if j][1]).upper()
                              for i in alignments]
                #only keep alignments with no ambiguous bases in either sequence
                if "N" not in alignments[0] and "N" not in alignments[1]:
                    #reverse complement, if necessary
                    if antisense:
                        alignments = [
                            str(
                                Seq(i,
                                    IUPAC.unambiguous_dna).reverse_complement(
                                    )) for i in alignments
                        ]
                    file.write(">{0}\n".format("|".join(name)))
                    file.write("|".join(alignments))
                    file.write("\n")
            except IndexError:
                pass
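# Minimal usage sketch (hypothetical coordinates; field 7 must hold the
# strand, and pairwise_from_ensembl.pl must be available):
# features = [["13", "exon", 32315474, 32315667, ".", ".", "+"]]
# get_pairwise_alignment(features, "temp_data/coords.txt", "homo_sapiens",
#                        "macaca_mulatta", 85, "temp_data/alignments.txt")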
Example #3
def run_bedtools(A_file, B_file, force_strand = False, force_opposite_strand = False, write_both = False, chrom = None, overlap = None, sort = False, no_name_check = False, no_dups = True, hit_number = False, output_file = None, intersect = False, bed_path = None, overlap_rec = None, intersect_bam = None, write_zero = None, write_bed = False, exclude = False):
    '''
    See intersect_bed for details.
    '''
    if write_zero:
        write_option = "-wao"
    elif hit_number:
        write_option = "-c"
    elif write_both:
        write_option = "-wo"
    else:
        write_option = "-wa"
    if sort:
        sort_bed(A_file, A_file)
        sort_bed(B_file, B_file)
    bedtools_args = ["bedtools", "intersect", "-a", A_file,"-b", B_file, write_option]
    if intersect:
        del bedtools_args[-1]
    if overlap:
        bedtools_args.extend(["-f", str(overlap)])
    if overlap_rec:
        bedtools_args.append("-r")
    if force_strand:
        bedtools_args.append("-s")
    elif force_opposite_strand:
        bedtools_args.append("-S")
    if no_name_check:
        bedtools_args.append("-nonamecheck")
    if no_dups:
        bedtools_args.append("-u")
    if chrom:
        print("Bedtools cannot be restricted to a single chromosome. Use bedops!")
        raise Exception
    if hit_number and no_dups:
        print("When counting hits, each interval in the first bed file is only reported once by default. Set no_dups to False!")
        raise Exception
    if bed_path:
        bedtools_args[0] = "{0}{1}".format(bed_path, bedtools_args[0])
    if exclude:
        bedtools_args.append("-v")
    if intersect_bam:
        if A_file[-4:] != ".bam":
            print("BAM file must be supplied first!")
            raise Exception
        if B_file[-4:] == ".bam":
            print("Only the first of the two files can be a BAM file!")
            raise Exception
        bedtools_args = ["intersectBed", write_option, "-abam", A_file, "-b", B_file]
        if write_bed:
            bedtools_args.append("-bed")
    try:
        bedtools_output = hk.run_process(bedtools_args, file_for_output = output_file)
    except FileNotFoundError:
        bedtools_args[0] = "intersectBed"
        bedtools_output = hk.run_process(bedtools_args, file_for_output = output_file)
    return(bedtools_output)
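# Minimal usage sketch (hypothetical file names): report each A interval
# together with every same-strand B interval it overlaps:
# run_bedtools("A.bed", "B.bed", force_strand=True, write_both=True,
#              no_dups=False, output_file="overlaps.bed")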
Example #4
def main():
    description = "Run mDFEest with shuffled input to check the false positive rate."
    args = parse_arguments(description, [
        "hits_file", "controls_file", "output_file", "n_sim", "SNP_file",
        "SNP_number", "hit_reduce", "control_reduce", "const_pop"
    ],
                           ints=[3, 5],
                           floats=[6, 7],
                           flags=[8])
    hits_file, controls_file, output_file, n_sim, SNP_file, SNP_number, hit_reduce, control_reduce, const_pop = args.hits_file, args.controls_file, args.output_file, args.n_sim, args.SNP_file, args.SNP_number, args.hit_reduce, args.control_reduce, args.const_pop

    with open(output_file, "w") as file:
        for sim in range(n_sim):
            print(sim)

            temp_hits_file = "temp_data/hits_file{0}.txt".format(
                random.random())
            temp_controls_file = "temp_data/controls_file{0}.txt".format(
                random.random())
            temp_input_file = "temp_data/input_file{0}.txt".format(
                random.random())

            #shuffle hits and controls for negative control
            run_process([
                "python3", "shuffle_hits_and_controls.py", hits_file,
                controls_file, temp_hits_file, temp_controls_file, hit_reduce,
                control_reduce
            ])

            #generate multiDFEest input file
            run_process([
                "python3", "mDFEest_input.py", temp_hits_file,
                temp_controls_file, SNP_file, SNP_number, temp_input_file
            ])

            output = mDFEest("beta", temp_input_file, pop_change=True)

            print(output)
            print(output["Nes_0.0_0.1"])
            print(output["Nes_0.1_1.0"])

            file.write("{0}\t{1}\t{2}".format(sim, output["Nes_0.0_0.1"],
                                              output["Nes_0.1_1.0"]))

            #if you also want to run with fixed population size
            if const_pop:
                output = mDFEest("beta", temp_input_file, pop_change=False)

                file.write("{0}\t{1}\t{2}".format(sim, output["Nes_0.0_0.1"],
                                                  output["Nes_0.1_1.0"]))

            file.write("\n")

            remove_file(temp_hits_file)
            remove_file(temp_controls_file)
            remove_file(temp_input_file)
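# Hypothetical invocation (assuming parse_arguments exposes the flag in
# position 8 as --const_pop):
# python3 <this script> hits.txt controls.txt out.txt 100 SNPs.txt 216 \
#         0.5 0.5 --const_pop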
Example #5
def sort_bed(input_file_name, output_file_name):
    '''
    Sort a bed file.
    '''
    #This is done via a temp file because that way you can specify the same file as input and output file and thus
    #overwrite the unsorted file with the sorted one.
    temp_file_name = "temp_data/temp_sorted_bed{0}.bed".format(random.random())
    hk.run_process(["sort-bed", input_file_name], file_for_output = temp_file_name)
    hk.run_process(["mv", temp_file_name, output_file_name])
    hk.remove_file(temp_file_name)
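# Minimal usage sketch: input and output may be the same file because the
# sort goes via a temp file:
# sort_bed("reads.bed", "reads.bed")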
Example #6
def get_coverage(regions_file, reads_file, output_file_name):
    """
    Given a BED file with regions and a BED/BAM file of reads,
    count how many reads overlap each region and output in new BED file.
    :param regions_file: BED file
    :param reads_file: BED/BAM file
    :param output_file_name: name for output BED file
    :return: None
    """
    hk.run_process(["bedtools", "coverage", "-a", regions_file, "-b",
                    reads_file, "-counts", "-s"], file_for_output=output_file_name)
def main():
    description = "Given a BED file of reads, filter out reads whose " \
                  "3' end maps to the last nucleotide of an intron or" \
                  "the last nucleotide of an exon."
    args = hk.parse_arguments(description, ["reads_file", "gtf", "outfile"])
    reads_file, gtf, outfile = args.reads_file, args.gtf, args.outfile

    print("Getting intron lariat positions...")

    # read in exon coordinates
    exons = rw.read_gtf(gtf, element="exon", gene=False)
    # make a BED file with the last positions of introns
    intron_lariat_bed = "{0}_intron_lariat_pos_all_exons.bed".format(reads_file[:-4])
    co.write_intron_lariat_pos_from_exons(exons, intron_lariat_bed, add_chr = True)

    # intersect the reads with intron lariat positions
    intron_lariat_intersect_file_name = "{0}_intersect_with_intron_lariat_pos_all_exons.bed".format(reads_file[:-4])
    co.intersect_bed(reads_file, intron_lariat_bed, force_strand=True, write_both=True, no_dups=False, output_file=intron_lariat_intersect_file_name)
    hk.remove_file(intron_lariat_bed)
    intron_lariat_reads_file = "{0}_intron_lariat_reads_all_exons.bed".format(reads_file[:-4])
    # check that the reads end exactly at intron lariat positions
    check_3prime_match(intron_lariat_intersect_file_name, intron_lariat_reads_file)
    hk.remove_file(intron_lariat_intersect_file_name)

    # write BED with the last positions of exons
    splice_intermediate_bed = "{0}_splice_intermediate_pos_all_exons.bed".format(reads_file[:-4])
    co.write_si_pos_from_exons(exons, splice_intermediate_bed, add_chr = True)

    print("Getting splice intermediate positions.")

    # intersect the reads with splice intermediate positions
    splice_intermediate_intersect_file_name = "{0}_intersect_with_SI_pos_all_exons.bed".format(reads_file[:-4])
    co.intersect_bed(reads_file, splice_intermediate_bed, force_strand=True, write_both=True, no_dups=False, output_file=splice_intermediate_intersect_file_name)
    hk.remove_file(splice_intermediate_bed)
    SI_reads_file = "{0}_SI_reads_all_exons.bed".format(reads_file[:-4])
    # check that the reads end exactly at the end of the exon
    check_3prime_match(splice_intermediate_intersect_file_name, SI_reads_file)
    hk.remove_file(splice_intermediate_intersect_file_name)

    print("Concatenating the two files.")

    # concatenate the IL and SI read files so you could exclude both in one go
    combined_file = "{0}_SI_and_IL_reads_all_exons.bed".format(reads_file[:-4])
    hk.run_process(["cat", SI_reads_file, intron_lariat_reads_file], file_for_output=combined_file)

    hk.remove_file(SI_reads_file)
    hk.remove_file(intron_lariat_reads_file)

    # do an exclusive intersect, requiring 1.0 overlap for both A and B, to remove the
    # putative intron lariat reads from the main reads file
    co.intersect_bed(reads_file, combined_file, overlap=1, overlap_rec=1, force_strand=True, no_dups=False, exclude=True, output_file=outfile)

    hk.remove_file(combined_file)
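# Hypothetical invocation:
# python3 <this script> reads.bed annotation.gtf filtered_reads.bed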
def main():

    # Get arguments.
    description = "Check if nucleotide composition at the 5' ends of NET-seq reads is biased."
    args = hk.parse_arguments(
        description,
        ["input_file", "output_file", "genome_fasta", "gtf", "three_prime"],
        flags=[4])
    input_file, output_file, genome_fasta, gtf, three_prime = args.input_file, args.output_file, args.genome_fasta, args.gtf, args.three_prime

    # Convert to .bed, if not already .bed
    if input_file[-3:] != "bed":
        print("Converting input file to .bed...")
        input_file_new_name = "{0}bed".format(input_file[:-3])
        hk.convert2bed(input_file, input_file_new_name)
        input_file = input_file_new_name

    # Make an extended version of each read that extends 5 nt 5prime and 35 nt 3prime
    print("Extending the reads...")
    suffix = ""
    if three_prime:
        suffix = "_three_prime"
    temp_bed = "{0}_extended_for_bias{1}.bed".format(input_file[:-4], suffix)
    co.extend_intervals(input_file,
                        temp_bed,
                        5,
                        35,
                        remove_chr=True,
                        add_chr=False,
                        three_prime=three_prime)

    # Make a FASTA file from the BED file.
    print("Extracting sequences...")
    fasta_name = "{0}fasta".format(temp_bed[:-3])
    hk.run_process([
        "fastaFromBed", "-bed", temp_bed, "-fi", genome_fasta, "-fo",
        fasta_name, "-s"
    ])
    print("Number of lines in FASTA:")
    print(hk.run_process(["wc", "-l", fasta_name]))

    # Store the sequences at -5:+5 and 30:40 in a 2D array
    print("Storing sequences in arrays...")
    occ_mat_true, occ_mat_control = extract_true_and_control_string(
        fasta_name, (0, 10), (30, 40))

    # Make a PPM for either column
    bases = ["A", "T", "C", "G"]
    print("Making PPMs...\n")
    print("TRUE:")
    PPM_wrapper(occ_mat_true, bases, "{0}.true".format(output_file))
    print("CONTROL:")
    PPM_wrapper(occ_mat_control, bases, "{0}.control".format(output_file))
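# Hypothetical invocation (add --three_prime to check 3' rather than 5' ends):
# python3 <this script> reads.bed bias_out genome.fa annotation.gtf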
Example #9
def merge_bed(in_bed, out_bed, distance):
    """
    Strand-specifically merge a BED file.
    Sorts the input BED file first.
    :param in_bed: input file name
    :param out_bed: output file name
    :param distance: maximum distance between
    two elements that are to be merged
    :return: None
    """
    sorted_lines = hk.run_process(["sortBed", "-i", in_bed])
    # the -c and the -o are so that the strand would end up in the
    # right column
    hk.run_process(["mergeBed", "-s", "-d", str(distance), "-c", "4,5,6", "-o", "distinct,distinct,distinct"], input_to_pipe = sorted_lines, file_for_output=out_bed)
Example #10
def read_gtf(file_name,
             element,
             gene=False,
             filter_parameter=None,
             filter_value=None):
    '''
    Read in all rows that contain coordinates of type _element_ from gtf file.
    Will make a dictionary with either gene or transcript IDs as keys
    (depending on what _gene_ is set to).
    Will also convert start and end coordinates to integers.
    If filter_parameter and filter_value have been specified, only lines with the specified value for
    that parameter will be returned.
    '''
    filter_pattern = ""
    if filter_parameter:
        if not filter_value:
            raise Exception(
                "If filter_parameter has been specified, then filter_value must be too!"
            )
        filter_pattern = "{0} \"{1}\"".format(filter_parameter, filter_value)
    output = {}
    if gene:
        pattern = re.compile(r"(?<=gene_id \")[\d\w]*")
    else:
        pattern = re.compile(r"(?<=transcript_id \")[\d\w]*")
    # check if you're on linux or Mac to know whether you need the -P flag
    platform = sys.platform
    if platform == "linux" or platform == "linux2":
        relevant_lines = hk.run_process(
            ["grep", "-P", r"\t{0}\t".format(element),
             file_name]).rstrip("\n").split("\n")
    elif platform == "darwin":
        relevant_lines = hk.run_process(
            ["grep", r"\t{0}\t".format(element),
             file_name]).rstrip("\n").split("\n")
    for line in relevant_lines:
        line = line.split("\t")
        if len(line) > 1:
            # if you need to filter by parameter
            if filter_pattern in line[8]:
                # convert start and end coordinates to integers
                line[3] = int(line[3])
                line[4] = int(line[4])
                # get identifier (transcript or gene)
                idn = re.search(pattern, line[8]).group()
                if idn not in output:
                    output[idn] = []
                output[idn].append(line)
    return (output)
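# Minimal usage sketch (hypothetical file name): exon lines keyed by
# transcript ID, keeping only protein-coding genes:
# exons = read_gtf("annotation.gtf", "exon", gene=False,
#                  filter_parameter="gene_biotype", filter_value="protein_coding")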
def get_control_sites(fasta, genome, feature_set, families_file, dataset,
                      temp_motifs_file, hit_file, control_file, error_file,
                      anc_CG_file, high_CG_file, flags):
    '''
    Given motifs and sequences, pick control sites using the optimization method.
    '''
    arguments = [
        "python3", "pick_control_sites.py", fasta, genome, feature_set,
        families_file, dataset, temp_motifs_file, 10, hit_file, 500, 10,
        control_file, error_file, "None", anc_CG_file, high_CG_file,
        "--old_motif_format"
    ]

    arguments = arguments + flags
    run_process(arguments)
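# Minimal usage sketch (hypothetical paths; the final argument is a list of
# extra command-line flags, here empty):
# get_control_sites("CDSs.fasta", "hg38", "exons.bed", "families.txt",
#                   "my_dataset", "temp_data/motifs.txt", "hits.txt",
#                   "controls.txt", "errors.txt", "anc_CG.txt",
#                   "high_CG.txt", [])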
Example #12
def get_MSA(coords, method, species_set, query_species, version, force_strand = True):
    '''
    Get the genome alignments that overlap a particular sequence region.
    '''
    reverse = False
    if coords[6] == "-" and force_strand:
        reverse = True
    MSA = run_process(["perl", "MSA.pl", method, species_set, version, coords[0], coords[2], coords[3], query_species])
    MSA = MSA.split("|||")
    MSA = [i.split(">") for i in MSA if i]
    MSA = [[j.split("\n") for j in i if j] for i in MSA]

    MSA_dict = {}
    for gab in MSA:
        for species in gab: 
            name = species[0]
            temp_name = name.split("/")
            true_name = temp_name[0]
            coords = "-".join(temp_name[1:])
            if true_name not in MSA_dict:
                MSA_dict[true_name] = {}
            current_seq = "".join(species[1:]).upper()
            if reverse:
                current_seq = Seq(current_seq, IUPAC.unambiguous_dna)
                current_seq = current_seq.reverse_complement()
                current_seq = str(current_seq)
            MSA_dict[true_name][species_coords] = current_seq
    return(MSA_dict)
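# Minimal usage sketch (hypothetical coordinates; fields 0, 2, 3 and 6 must
# hold chromosome, start, end and strand, and MSA.pl must be available):
# region = ["13", "exon", 32315474, 32315667, ".", ".", "-"]
# alignments = get_MSA(region, "EPO", "mammals", "homo_sapiens", 85)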
Example #13
def get_pp(outroot, subst_model, phy_file, model_file, separate_to_concat_mapping, combined_dict, tuples_mapping, min_inf = None, parse_output = True):
    '''
    Get prior probabilities for all the bases at the different sites in an MSA. Note that for phyloFit these
    are posterior probabilities, but they are priors for INSIGHT.
    '''
    #you don't want to compute a tree, just get the posterior probabilities for an existing tree
    #hence all the flags from --post_probs onwards
    arguments = ["phyloFit", "--init-model", model_file, "--out-root", outroot, "--subst-mod", subst_model,
                           "--msa-format", "PHYLIP", "--post-probs", "--scale-only", "--no-rates", "--no-freqs", phy_file]
    if min_inf:
        arguments.extend(["-I", min_inf])
    results = run_process(arguments)
    #parse into convenient dictionary
    if parse_output:
        pp_file = "{0}.postprob".format(outroot)
        pp = rw.read_many_fields(pp_file, " ")
        pp = [[j for j in i if j] for i in pp]
        #the outgroup nodes are labelled from the inside out, starting from 1
        pp = {i[1]: i[-4:] for i in pp}
        pp_final = {}
        #map from coordinates in the concatenated alignment to positions in individual CDSs
        for trans in separate_to_concat_mapping:
            pp_final[trans] = {}
            for position in combined_dict[trans]:
                pp_final[trans][position] = pp[tuples_mapping[separate_to_concat_mapping[trans][position]]]
        return(pp_final)
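# Minimal usage sketch (hypothetical inputs; the three mapping dictionaries
# would come from the alignment-concatenation step elsewhere in the pipeline):
# pp = get_pp("temp_data/pp_run", "JC69", "alignment.phy", "model.mod",
#             separate_to_concat_mapping, combined_dict, tuples_mapping)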
Example #14
def get_pairwise_alignment(coords, coords_file, query_species, other_species, version, output_file):
    '''
    Given a list of feature coordinates and two species, get the corresponding pairwise alignments from Compara.
    '''
    #write the coordinates to file in a way that can be read by the downstream perl script
    with open(coords_file, "w") as file:
        for feature in coords:
            feature = [str(i) for i in feature]
            feature = "|".join(feature)
            file.write(feature)
            file.write("\n")
    remove_file(output_file)
    #get the alignments from the database
    run_process(["perl", "pairwise_from_ensembl.pl", coords_file, query_species, other_species, output_file, version])
    #parse them from the output file produced by the perl script
    with open(output_file) as file:
        string = "".join(file)
    string = string.split("***")
    string = [(i.rstrip("\n")).lstrip("\n") for i in string]
    string = [i.split("|||") for i in string]
    string = [[j for j in i if len(j) > 0] for i in string]
    #getting rid of cases where there's multiple GABs
    string = flatten([i for i in string if len(i) == 1])
    #write alignments to a pretty FASTA
    with open(output_file, "w") as file:
        for feature in string:
            temp = feature.split("\n")
            name = temp[0]
            name = name.split("|")
            antisense = False
            if name[6] == "-":
                antisense = True
            try:
                alignments = [temp[2], temp[3]]
                alignments = [i.split(" ") for i in alignments]
                #convert to upper case
                alignments = [([j for j in i if j][1]).upper() for i in alignments]
                #only keep alignments with no ambiguous bases in either sequence
                if "N" not in alignments[0] and "N" not in alignments[1]:
                    #reverse complement, if necessary
                    if antisense:
                        alignments = [str(Seq(i, IUPAC.unambiguous_dna).reverse_complement()) for i in alignments]
                    file.write(">{0}\n".format("|".join(name)))
                    file.write("|".join(alignments))
                    file.write("\n")
            except IndexError:
                pass
Example #15
def MSA_names(method, species_set, version):
    '''
    Given a Compara WGA method, a species set name and an ensembl db version, get the names of all the species in the set.
    '''
    names = run_process(["perl", "MSA_names.pl", method, species_set, version])
    names = names.rstrip(",")
    names = names.split(",")
    return(names)
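# Minimal usage sketch: list the species in the EPO mammals set for
# Ensembl release 85:
# species = MSA_names("EPO", "mammals", 85)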
Example #16
def MSA_names(method, species_set, version):
    '''
    Given a Compara WGA method, a species set name and an ensembl db version, get the names of all the species in the set.
    '''
    names = run_process(["perl", "MSA_names.pl", method, species_set, version])
    names = names.rstrip(",")
    names = names.split(",")
    return (names)
Example #17
def intersect_bed(bed_file1, bed_file2, use_bedops = False, overlap = False, overlap_rec = False, write_both = False, sort = False, output_file = None,
                             force_strand = False, force_opposite_strand = False, no_name_check = False, no_dups = True, chrom = None, intersect = False, hit_count = False, bed_path = None, intersect_bam=None,
                  write_zero = False, write_bed = False, exclude = False):
    '''Use bedtools/bedops to intersect coordinates from two bed files.
    Return those lines in bed file 1 that overlap with intervals in bed file 2.
    OPTIONS
    output_file: write output to this file
    use_bedops: use bedops rather than bedtools. Certain options are only valid with one of the two, see below.
    overlap: minimum overlap required as a fraction of the intervals in bed file 1 (EX: 0.8 means that the
    overlap has to be at least 80% of the intervals in bed file 1).
    overlap_rec: require that the overlap as a fraction of the intervals in file 2 be at least as high as
    the threshold indicated in -f.
    write_both: if True, return not only the interval from bed file 1 but, tagged onto the end, also the
    interval from bed file 2 that it overlaps (only
    valid when using bedtools).
    exclude: if True, report intervals that DON'T overlap
    sort: sort bed files before taking the intersection
    force_strand: check that the feature and the bed interval are on the same strand (only valid with bedtools)
    force_opposite_strand: if True, check that the feature and the interval are on OPPOSITE strands
    no_name_check: if set to False, checks whether the chromosome names are the same in the two bed files (only valid with bedtools)
    no_dups: if True, only returns each interval once. If set to false, intervals in bed file 1 that overlap several intervals in
    bed file 2 will be returned several times (as many times as there are overlaps with different elements in bed file 2)
    chrom: limit search to a specific chromosome (only valid with bedops, can help in terms of efficiency)
    intersect: rather than returning the entire interval, only return the part of the interval that overlaps an interval in bed file 2.
    hit_count: for each element in bed file 1, return the number of elements it overlaps in bed file 2 (only valid with bedtools)
    intersect_bam: intersect a bam file with a bed file. Requires bam file to be called first
    write_zero: like write_both but also write A intervals that don't overlap with any B intervals.
    write_bed: when intersecting a bam file, write output as bed.'''
    if force_strand and force_opposite_strand:
        raise Exception("force_strand and force_opposite_strand can't both be True")
    hk.make_dir("temp_data/")
    temp_file_name = "temp_data/temp_bed_file{0}.bed".format(random.random())
    #have it write the output to a temporary file
    if use_bedops:
        bedtools_output = run_bedops(bed_file1, bed_file2, force_strand, force_opposite_strand, write_both, chrom, overlap, sort, output_file = temp_file_name, intersect = intersect, hit_number = hit_count, no_dups = no_dups, intersect_bam = intersect_bam, overlap_rec = overlap_rec)
    else:
        bedtools_output = run_bedtools(bed_file1, bed_file2, force_strand, force_opposite_strand, write_both, chrom, overlap, sort, no_name_check, no_dups, output_file = temp_file_name, intersect = intersect, hit_number = hit_count, bed_path = bed_path, intersect_bam = intersect_bam, write_zero = write_zero, overlap_rec = overlap_rec, write_bed = write_bed, exclude = exclude)
    #move it to a permanent location only if you want to keep it
    if output_file:
        hk.run_process(["mv", temp_file_name, output_file])
    else:
        bedtools_output = rw.read_many_fields(temp_file_name, "\t")
    hk.remove_file(temp_file_name)
    return(bedtools_output)
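# Minimal usage sketch (hypothetical file names): keep reads that overlap
# exons on the same strand and write them to a file:
# intersect_bed("reads.bed", "exons.bed", force_strand=True,
#               output_file="reads_in_exons.bed")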
Example #18
def get_MSA_gene_list(coords, coords_file, method, species_set, version, query_species, MSA_file):
    '''
    Given a dictionary of lists of lists of CDS coordinates, retrieve the Compara MSAs.
    '''
    with open(coords_file, "w") as file:
        for trans in coords:
            for exon in coords[trans]:
                phase = exon[1]
                current_coords = exon[0]
                current_coords = [str(i) for i in current_coords]
                current_coords.append(str(phase))
                current_coords = "|".join(current_coords)
                file.write(current_coords)
                file.write("\n")
    remove_file(MSA_file)
    run_process(["perl", "MSA_list.pl", method, species_set, version, coords_file, query_species, MSA_file])
    with open(MSA_file) as file:
        string = "".join(file)
    string = re.sub("([a-z])\n([a-z])", "\\1\\2", string)
    with open(MSA_file, "w") as file:
        file.write(string)
Example #19
def chi_test(observed, expected):
    '''
    Given a series of observed and expected values, conduct a chi-squared test.
    '''
    observed = ",".join([str(i) for i in observed])
    expected = ",".join([str(i) for i in expected])
    string_to_R = "|".join([observed, expected])
    output = run_process(["Rscript", "R_scripts/chi_test.r", string_to_R])
    output = output.split(" ")
    output = [i for i in output if i != ""]
    chi = float((output[1].lstrip("\"")).rstrip("\""))
    p = float(((output[4].lstrip("\"")).rstrip("\n")).rstrip("\""))
    return({"chi": chi, "p": p})
Example #20
def fishers_exact_test(observed, expected):
    '''
    Perform a Fisher's exact test on an observed and an expected proportion
    '''
    string_to_R = ",".join([str(observed[0]), str(observed[1]), str(expected[0]), str(expected[1]), "greater"])
    results = run_process(["Rscript", "R_scripts/fisher_test.r", string_to_R])
    results = results.rstrip("\"\n")
    #sometimes there's a space between the quotation marks and the newline
    results = results.rstrip("\" \n")
    results = results.split(" ")
    results = [(i.rstrip("\"")).lstrip("\"") for i in results if i != ""]
    results = results[1:]
    results = [float(i) if "Inf" not in i else i for i in results]
    return(results)
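# Minimal usage sketch (assumes Rscript and R_scripts/fisher_test.r are
# available; note the test is hard-coded as one-tailed, "greater"):
# results = fishers_exact_test([30, 70], [20, 80])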
Example #21
def correct_multiple_testing(p_values, method):
    '''
    Given a list of p-values, correct them for multiple testing.
    '''
    p_values = [str(i) for i in p_values]
    p_values.append(method)
    string_to_R = ",".join(p_values)
    corrected_values = run_process(["Rscript", "R_scripts/holm_correct.r", string_to_R])
    corrected_values = re.findall(r"[\d\.]*", corrected_values, re.MULTILINE)
    corrected_values = [float(i) for i in corrected_values if "." in i]
    if len(p_values) - 1 != len(corrected_values):
        print("Problem correcting for multiple comparisons!")
        print(p_values)
        print(corrected_values)
        sys.exit()
    return(corrected_values)
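# Minimal usage sketch (assumes Rscript and R_scripts/holm_correct.r are available):
# corrected = correct_multiple_testing([0.01, 0.04, 0.3], "holm")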
Example #22
def wilcoxon_signed_rank_test(vector1, vector2, alt):
    '''
    Perform a Wilcoxon signed-rank test to compare two paired samples. The alternative must be one of
    "greater", "less" and "two.tailed".
    '''
    vector1 = ",".join([str(i) for i in vector1])
    vector2 = ",".join([str(i) for i in vector2])
    vectors = "|".join([vector1, vector2])
    string_to_R = "_".join([vectors, alt])
    results = run_process(["Rscript", "R_scripts/wilcoxon_signed_rank_test.r", string_to_R])
    results = results.rstrip("\n")
    results = results.split(" ")
    results = [i for i in results if i != ""]
    results = (results[1].rstrip("\"")).lstrip("\"")
    results = float(results)
    return(results)
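# Minimal usage sketch (assumes Rscript and R_scripts/wilcoxon_signed_rank_test.r
# are available); returns the p-value as a float:
# p = wilcoxon_signed_rank_test([1.2, 3.4, 2.2], [1.0, 2.9, 2.5], "greater")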
Example #23
def run_bedops(A_file, B_file, force_strand = False, force_opposite_strand = False, write_both = False, chrom = None, overlap = None, sort = False, output_file = None, intersect = False, hit_number = None, no_dups = False, overlap_rec = None, intersect_bam = None):
    '''
    See intersect_bed for details.
    '''
    if intersect:
        command = "--intersect"
    else:
        command = "--element-of"
    if sort:
        sort_bed(A_file, A_file)
        sort_bed(B_file, B_file)
    bedops_args = ["bedops", "--chrom", "foo", command, "1", A_file, B_file]
    if overlap:
        bedops_args[4] = overlap
    if chrom:
        bedops_args[2] = chrom
        if intersect:
            del bedops_args[4]
    else:
        del bedops_args[1:3]
        if intersect:
            del bedops_args[2]
    if force_strand:
        print("Bedops can't search by strand! Either use bedtools or separate input data by strand!")
        raise Exception
    if force_opposite_strand:
        print("Bedops can't search by strand! Either use bedtools or separate input data by strand!")
        raise Exception
    if write_both:
        print("Bedops can't write both features!")
        raise Exception
    if hit_number:
        print("Bedops hasn't been set up to count the number of overlapping elements. Use bedtools!")
        raise Exception
    if no_dups:
        print("Bedops doesn't print duplicates by default!")
    if overlap_rec:
        print("Bedops hasn't been set up to filter by overlap in second file!")
    if intersect_bam:
        print("Use bedtools to intersect bam and bed!")
        raise Exception
    bedops_output = hk.run_process(bedops_args, file_for_output = output_file)
    return(bedops_output)
Example #24
def get_lambda(lambda_file_outroot, phy_file, subst_model, min_inf = None):
    '''
    Calculate lambda input parameter for INSIGHT.
    '''
    lambda_file = "{0}.mod".format(lambda_file_outroot)
    #to make sure you catch it if the phyloFit process fails
    remove_file(lambda_file)
    #from UCSC
    tree_file = "DFE/UCSC_model.mod"
    #subst_model is JC69, for instance
    #scale-only, cause you don't want it to estimate a new tree, just to scale the whole thing
    arguments = ["phyloFit", "--init-model", tree_file, "--out-root", lambda_file_outroot, "--subst-mod", subst_model,
                           "--msa-format", "PHYLIP", "--scale-only", phy_file]
    #must be set to False for testing
    if min_inf:
        arguments.extend(["-I", min_inf])
    results = run_process(arguments)
    with open(lambda_file) as file:
        lambda_b = file.read()
    lambda_b = re.findall(lambda_regex, lambda_b)[0]
    return(lambda_b)
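# Minimal usage sketch (hypothetical file names; assumes phyloFit from PHAST
# is on the path and lambda_regex is defined at module level):
# lambda_b = get_lambda("temp_data/lambda_run", "alignment.phy", "JC69")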
Example #25
def fishers_exact_test_enrichment(element, sample, population, alt):
    '''
    Perform a Fisher's exact test to check whether a given element is enriched in a sample when compared to a population.
    '''
    N = len(population)
    n = len(sample)
    if len(sample) >= len(population):
        print("The sample has to be smaller than the population!")
        raise Exception
    K = population.count(element)
    k = sample.count(element)
    string_to_R = ",".join([str(k), str(n - k), str(K), str(N - K), alt])
    results = run_process(["Rscript", "R_scripts/fisher_test.r", string_to_R])
    results = results.rstrip("\"\n")
    #sometimes there's a space between the quotation marks and the newline
    results = results.rstrip("\" \n")
    results = results.split(" ")
    results = [(i.rstrip("\"")).lstrip("\"") for i in results if i != ""]
    results = results[1:]
    results = [float(i) if "Inf" not in i else i for i in results]
    return(results)
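# Minimal usage sketch: is "A" enriched in the sample relative to the population?
# results = fishers_exact_test_enrichment("A", ["A", "A", "B"],
#                                         ["A", "B", "B", "C", "A", "B"], "greater")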
Example #26
def get_MSA(coords,
            method,
            species_set,
            query_species,
            version,
            force_strand=True):
    '''
    Get the genome alignments that overlap a particular sequence region.
    '''
    reverse = False
    if coords[6] == "-" and force_strand:
        reverse = True
    MSA = run_process([
        "perl", "MSA.pl", method, species_set, version, coords[0], coords[2],
        coords[3], query_species
    ])
    MSA = MSA.split("|||")
    MSA = [i.split(">") for i in MSA if i]
    MSA = [[j.split("\n") for j in i if j] for i in MSA]

    MSA_dict = {}
    for gab in MSA:
        for species in gab:
            name = species[0]
            temp_name = name.split("/")
            true_name = temp_name[0]
            coords = "-".join(temp_name[1:])
            if true_name not in MSA_dict:
                MSA_dict[true_name] = {}
            current_seq = "".join(species[1:]).upper()
            if reverse:
                current_seq = Seq(current_seq, IUPAC.unambiguous_dna)
                current_seq = current_seq.reverse_complement()
                current_seq = str(current_seq)
            MSA_dict[true_name][species_coords] = current_seq
    return (MSA_dict)
Example #27
def get_ancestral_CG(outroot, subst_model, phy_files, model_file, tuples_mapping_dict, anc_CG_file_name, high_CG = None, min_inf = None, macaque = False, comprehensive = False, from_model = False):
    '''
    Get a dictionary that says for each transcript which positions were ancestrally CpG/GpC.
    '''
    #if a file name hasn't been supplied or if the file with the supplied name doesn't exist, determine
    #CpG positions again, otherwise just read them in from the file
    if not anc_CG_file_name or anc_CG_file_name == "None" or not os.path.exists(anc_CG_file_name):
        #you need several in case you have a high_CG dictionary
        pps = []
        for phy_file in phy_files:
            if subst_model == "JC69" or from_model:
                #use an existing substitution model
                arguments = ["phyloFit", "--init-model", model_file, "--out-root", outroot, "--subst-mod", subst_model,
                                       "--msa-format", "PHYLIP", "--post-probs", "--scale-only", phy_file]
            else:
                #estimate a new model
                arguments = ["phyloFit", "--out-root", outroot, "--subst-mod", subst_model,
                                       "--msa-format", "PHYLIP", "--tree", "DFE/full_tree.tree", "--post-probs", phy_file]
                
            if subst_model == "JC69":
                block_size = 4
                tuple_pos_lim = 2
                shift_in_tuple = 0
            else:
                #for dinucleotide models
                block_size = 16
                tuple_pos_lim = 3
                shift_in_tuple = 9

            #turn off when testing                        
            if min_inf:
                arguments.extend(["-I", min_inf])
            results = run_process(arguments)
            #read in posterior probabilities of having various nucleotides ancestrally
            pp_file = "{0}.postprob".format(outroot)
            pp = rw.read_many_fields(pp_file, " ")
            pp = [[j for j in i if j] for i in pp]
            pp = pp[2:]
            #the posterior probability that you had a CpG at a position has to be greater
            #than threshold for a position to be counted as ancestrally CpG
            threshold = 0.5
            #will be over-written if you're doing big tree
            human_pos = 0
            #the outgroup nodes are labelled from the outside in, starting from 1
            if macaque:
                #it's to know whether we're doing big tree or little tree
                if len(pp[0]) == 14:
                    #little tree, mononucleotide
                    pp = {"_".join(i[1:tuple_pos_lim]): [i[len(i) - (3 * block_size): len(i) - (2 * block_size)]] for i in pp}
                elif len(pp[0]) > 14:
                    #big tree/dinucleotide (i.e. it'll give you nonsense if you're trying to do context with the little tree)
                    #the shift_in_tuple is to do with the fact that if you're doing U2S, you want the second tuple and not the first
                    human_pos = 3 + shift_in_tuple
                    if comprehensive:
                        #you want to get all nodes except for node 0, which is the outgroup-ingroup ancestor
                        pp = {"_".join(i[1:tuple_pos_lim]): [i[len(i) - (j * block_size): len(i) - ((j - 1) * block_size)] for j in range(1, 7)] for i in pp}
                    else:
                        pp = {"_".join(i[1:tuple_pos_lim]): [i[len(i) - (6 * block_size): len(i) - (5 * block_size)]] for i in pp}
                else:
                    #for tests etc. where you might only have, say, two species
                    pp = {"_".join(i[1:tuple_pos_lim]): [i[-block_size:]] for i in pp}
            else:
                pp = {"_".join(i[1:tuple_pos_lim]): [i[-block_size:]] for i in pp}
            pps.append(pp)
        anc_CG = {}
        #just to get the length
        example_pp = pps[0][list(pps[0].keys())[0]]
        for trans in tuples_mapping_dict:
            #tuples_mapping_dict has the alignment tuple corresponding to each position
            #because the phyloFit output is organized by tuples, not by positions
            anc_CG[trans] = []
            for node_pos in range(len(example_pp)):
                #if you're using dinucleotides
                if subst_model != "JC69":
                    for pos in sorted(tuples_mapping_dict[trans].keys())[1:]:
                        try:
                            pp_number = 0
                            #if you're gonna produce different output dictionaries for high and low GC regions
                            if high_CG:
                                if pos in high_CG[trans]:
                                    pp_number = 1
                            current_tuple = tuples_mapping_dict[trans][pos]
                            #don't consider positions where there is an alignment gap for human
                            if current_tuple[human_pos] != "*":
                                if current_tuple in pps[pp_number]:
                                    current_pp = pps[pp_number][current_tuple][node_pos]
                                else:
                                    current_pp = pps[abs(pp_number - 1)][current_tuple][node_pos]
                                #because it can be either GC or CG, hence 6 or 9
                                if float(current_pp[6]) > threshold or float(current_pp[9]) > threshold:
                                    #you're always testing the second member in the dinucleotide
                                    anc_CG[trans].append(pos - 1)
                                    anc_CG[trans].append(pos)
                        except KeyError:
                            if pos % 100 == 0:
                                pass
                            else:
                                raise KeyError
                else:
                    #if you're using mononucleotides, you have to keep track of what the previous nucleotide was
                    C_prev = False
                    G_prev = False
                    for pos in sorted(tuples_mapping_dict[trans].keys()):
                        pp_number = 0
                        if high_CG:
                            if pos in high_CG[trans]:
                                pp_number = 1
                        current_C = False
                        current_G = False
                        current_tuple = tuples_mapping_dict[trans][pos]
                        if current_tuple[human_pos] != "*":
                            current_pp = pps[pp_number][current_tuple][node_pos]
                            #if current is C and previous was G
                            if float(current_pp[1]) > threshold:
                                if G_prev:
                                    anc_CG[trans].append(G_pos)
                                    anc_CG[trans].append(pos)
                                current_C = True
                            #if current is G and previous was C
                            if float(current_pp[2]) > threshold:
                                if C_prev:
                                    anc_CG[trans].append(C_pos)
                                    anc_CG[trans].append(pos)
                                current_G = True
                            C_prev = False
                            G_prev = False
                            if current_C:
                                C_prev = True
                                #you need to specify the position explicitly because it's not necessarily
                                #the last one if there were dashes
                                C_pos = pos
                            if current_G:
                                G_prev = True
                                G_pos = pos
            anc_CG[trans] = sorted(list(set(anc_CG[trans])))
        remove_file(pp_file)
        if anc_CG_file_name and anc_CG_file_name != "None":
            with open(anc_CG_file_name, "w") as file:
                for trans in anc_CG:
                    to_write = "\t".join([trans, ",".join([str(i) for i in anc_CG[trans]])])
                    file.write(to_write)
                    file.write("\n")
    else:
        #parse
        anc_CG = rw.read_many_fields(anc_CG_file_name, "\t")
        anc_CG = [i for i in anc_CG if len(i) == 2]
        anc_CG = list_to_dict(anc_CG, 0, 1)
        anc_CG = {trans: [int(pos) for pos in anc_CG[trans].split(",") if pos != ""] for trans in anc_CG}
    return(anc_CG)
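# Minimal usage sketch (hypothetical inputs; tuples_mapping_dict as in get_pp
# above; ancestral CpG positions are cached in anc_CG_file_name):
# anc_CG = get_ancestral_CG("temp_data/anc_run", "JC69", ["alignment.phy"],
#                           "model.mod", tuples_mapping_dict,
#                           "temp_data/anc_CG.txt", macaque = True)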
Example #28
def MSA_filter_by_anatomy(input_file, output_file, version):
    '''
    Given an output file from get_MSA_concat_list, filter the CDSs based on whether the exon coordinates have been conserved.
    '''
    run_process(["perl", "MSA_CDSs.pl", version, input_file, output_file])
def main():
    description = "Directly compare the frequency of segregating sites/mean allele frequency between hits and controls."
    args = parse_arguments(description, [
        "hit_file", "control_file", "INSIGHT_hit_file", "INSIGHT_control_file",
        "SFS_file", "trial_file", "trials", "shuffle"
    ],
                           ints=[6],
                           flags=[7])
    hit_file, control_file, INSIGHT_hit_file, INSIGHT_control_file, SFS_file, trial_file, trials, shuffle = args.hit_file, args.control_file, args.INSIGHT_hit_file, args.INSIGHT_control_file, args.SFS_file, args.trial_file, args.trials, args.shuffle

    true_hits = rw.read_pos(hit_file)
    true_controls = rw.read_pos(control_file)

    #to store the original data in case this is a negative control and you will be shuffling
    #hits and controls
    original_INSIGHT_hit_file = INSIGHT_hit_file
    original_INSIGHT_control_file = INSIGHT_control_file

    print(hit_file)

    with open(trial_file, "w") as file:
        file.write(
            "trial\tpoly_fraction_hits - poly_fraction_controls\tmedian_hit_MAF - median_control_MAF\n"
        )
        for trial in range(trials):
            to_write = "{0}\t".format(trial)

            #if this is a negative control
            if shuffle:
                INSIGHT_hit_file = re.sub("_0_", "_{0}_".format(trial),
                                          original_INSIGHT_hit_file)
                INSIGHT_control_file = re.sub("_0_", "_{0}_".format(trial),
                                              original_INSIGHT_control_file)
                temp_hits_file = "temp_data/temp_hits{0}.txt".format(
                    random.random())
                temp_controls_file = "temp_data/temp_controls{0}.txt".format(
                    random.random())
                #shuffle hits and controls
                temp_hits, temp_controls = shuffle_dictionaries(
                    true_hits, true_controls)
                rw.write_pos(temp_hits, temp_hits_file)
                rw.write_pos(temp_controls, temp_controls_file)
                SFS_file = "temp_data/temp_SFS_file{0}.txt".format(
                    random.random())
                #generate an INSIGHT input file that you could then use for the manual analysis
                run_process([
                    "python3", "mDFEest_input.py", temp_hits_file,
                    temp_controls_file,
                    "general/1000genomes/filtered_hg38_85_pc_multiexon_Yoruban_SNPs_relative.txt",
                    216, SFS_file
                ])
                remove_file(temp_hits_file)
                remove_file(temp_controls_file)

            hit_data = get_data(INSIGHT_hit_file)
            control_data = get_data(INSIGHT_control_file)

            poly_ratio_diff = get_chisq_site_freq(hit_data, control_data)
            to_write = to_write + "{0}\t".format(poly_ratio_diff)

            temp, median_diff = get_mean_freq(SFS_file)
            to_write = to_write + "{0}\n".format(median_diff)

            if shuffle:
                remove_file(SFS_file)

            file.write(to_write)
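# Hypothetical invocation (the --shuffle flag turns on the negative control):
# python3 <this script> hits.txt controls.txt INSIGHT_hits.txt \
#         INSIGHT_controls.txt SFS.txt trials_out.txt 100 --shuffle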
Example #30
def main():
    description = "Generate a NET-seq control set that would have the same distribution of -2:2 nucleotides" \
                  "as the true set."
    args = hk.parse_arguments(description, [
        "active_genes_file", "gtf", "PolII_file", "fasta", "outfile",
        "chrom_sizes"
    ])
    active_genes_file, gtf, PolII_file, fasta, outfile, chrom_sizes = args.active_genes_file, args.gtf, args.PolII_file, args.fasta, args.outfile, args.chrom_sizes

    chrom_sizes = rw.read_many_fields(chrom_sizes, delimiter="\t")
    chrom_sizes = hk.list_to_dict(chrom_sizes, 0, 1, intify=True)

    # get transcriptionally active genes and make a BED file with their coordinates
    print("Getting the coordinates of transcriptionally active genes...")
    trans_active_genes = rw.read_many_fields(active_genes_file, "\t")[1:]
    trans_active_genes = [i[3] for i in trans_active_genes]
    transcripts_file = "{0}_transcripts_all.bed".format(gtf[:-4])
    co.get_transcripts(gtf, transcripts_file, add_chr=True)

    transcripts_dict = {}
    # this will be used for getting the k-mers in the transcripts
    filtered_transcripts_file_plus3 = "{0}_trans_act_only_plus3.bed".format(
        transcripts_file[:-4])
    # this will be used for filtering the reads
    filtered_transcripts_file = "{0}_trans_act_only.bed".format(
        transcripts_file[:-4])
    with open(filtered_transcripts_file,
              "w") as ft_file, open(transcripts_file) as t_file, open(
                  filtered_transcripts_file_plus3, "w") as ft_file2:
        reader = csv.reader(t_file, delimiter="\t")
        writer = csv.writer(ft_file, delimiter="\t")
        writer2 = csv.writer(ft_file2, delimiter="\t")
        for line in reader:
            if line[3] in trans_active_genes:
                writer.writerow(line)
                # this is because if a read falls at the first position, you will need to know the
                # preceding three bases. Same if it falls at the last position.
                line[1] = str((int(line[1])) - 3)
                line[2] = str((int(line[2])) + 3)
                writer2.writerow(line)
                transcripts_dict[line[3]] = line

    print("Filtering reads to the transcripts...")
    # filter reads to only ones that overlap these transcripts
    transcripts_PolII = "{0}_transcripts.bed".format(PolII_file[:-4])
    co.intersect_bed(PolII_file,
                     filtered_transcripts_file,
                     force_strand=True,
                     output_file=transcripts_PolII)

    print("Extracting FASTA from the transcript coordinates...")
    # the genome FASTA is formatted as N rather than chrN
    filtered_transcripts_file_no_chr = "{0}_trans_act_only_plus3_no_chr.bed".format(
        transcripts_file[:-4])
    hk.run_process(["sed", "s/^chr//", filtered_transcripts_file_plus2],
                   file_for_output=filtered_transcripts_file_no_chr)
    filtered_transcripts_fasta_no_chr = "{0}_trans_act_only_plus3.fasta".format(
        transcripts_file[:-4])
    hk.run_process([
        "bedtools", "getfasta", "-fi", fasta, "-bed",
        filtered_transcripts_file_no_chr, "-fo",
        filtered_transcripts_fasta_no_chr, "-s", "-name"
    ])

    print("Mapping kmers to transcript positions...")
    kmer_dict = map_kmers_to_positions(filtered_transcripts_fasta_no_chr,
                                       k=6,
                                       focal_pos=3)

    print("Extracting the starting dinucleotide for each read...")
    starting_dints_PolII = "{0}_transcripts_starting_6mers.bed".format(
        PolII_file[:-4])
    starting_dints_PolII_fasta = "{0}_transcripts_starting_6mers.fasta".format(
        PolII_file[:-4])
    co.extend_intervals(transcripts_PolII,
                        starting_dints_PolII,
                        3,
                        3,
                        remove_chr=True)
    hk.run_process([
        "bedtools", "getfasta", "-fi", fasta, "-bed", starting_dints_PolII,
        "-fo", starting_dints_PolII_fasta, "-s"
    ])

    print("Picking random control positions...")
    pick_random_positions(transcripts_PolII,
                          starting_dints_PolII_fasta,
                          outfile,
                          kmer_dict,
                          transcripts_dict,
                          chrom_sizes=chrom_sizes)

    print("Making single nucleotide resolution file...")
    snr_file = "{0}_snr.bed".format(outfile[:-4])
    co.snr_bed(outfile, snr_file)

    print(
        "Removing reads that overlap potential splice intermediate positions..."
    )
    no_si_snr_file = "{0}_snr_no_si.bed".format(outfile[:-4])
    co.intersect_bed(snr_file,
                     "data/Genomes/GTFs/dm6/dmel-all-r6.18_exon_ends_chr.gtf",
                     force_strand=True,
                     exclude=True,
                     no_dups=False,
                     output_file=no_si_snr_file)
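# Hypothetical invocation:
# python3 <this script> active_genes.txt annotation.gtf PolII_reads.bed \
#         genome.fa controls.bed chrom_sizes.txt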
Example #31
def main():
    description = "Write out a BED file with the region surrounding the TSS for a set of genes."
    args = hk.parse_arguments(
        description,
        ["genes_file", "gtf", "outfile", "start_coord", "end_coord"],
        ints=[3, 4])
    genes_file, gtf, outfile, start_coord, end_coord = args.genes_file, args.gtf, args.outfile, args.start_coord, args.end_coord

    need_to_seek = False
    if genes_file[-3:] != "bed":
        # it means that you got a list of gene symbols rather than a
        # BED file with coordinates
        need_to_seek = True
        transcript_file = "{0}_transcripts.gtf".format(gtf[:-4])
        co.get_transcripts(gtf,
                           transcript_file,
                           add_chr=False,
                           with_detail=False,
                           output_gtf=True)

    with open(genes_file) as gf, open(outfile, "w") as of:
        reader = csv.reader(gf, delimiter="\t")
        writer = csv.writer(of, delimiter="\t")
        for line in reader:
            if need_to_seek:
                gene = line[0]
                print(gene)
                possibilities = hk.run_process([
                    "grep", "gene_symbol \"\"{0}\"\"".format(gene),
                    transcript_file
                ])
                possibilities = [
                    i.split("\t") for i in possibilities.split("\n")[:-1]
                ]
                chrom = "chr{0}".format(possibilities[0][0])
                strand = possibilities[0][6]
                starts = [i[3] for i in possibilities]
                ends = [i[4] for i in possibilities]
                if strand == "+":
                    counts = Counter(starts)
                    start = int(counts.most_common()[0][0])
                    end = ends[starts.index(str(start))]
                    length = int(end) - start
                    new_end_coord = min(length, end_coord)
                    new_start = str(start - start_coord - 1)
                    new_end = str(start + (new_end_coord - 1))
                elif strand == "-":
                    counts = Counter(ends)
                    end = int(counts.most_common()[0][0])
                    start = starts[ends.index(str(end))]
                    length = end - int(start)
                    new_end_coord = min(length, end_coord)
                    new_start = str(end - new_end_coord)
                    new_end = str(end + start_coord)
                new_line = [chrom, new_start, new_end, gene, ".", strand]
                writer.writerow(new_line)
            else:
                if line[0] != "chrom":
                    length = int(line[2]) - int(line[1])
                    curr_end_coord = min(length, end_coord)
                    if line[5] == "+":
                        new_start = str(int(line[1]) - start_coord)
                        new_end = str(int(line[1]) + curr_end_coord)
                    elif line[5] == "-":
                        new_start = str(int(line[2]) - curr_end_coord)
                        new_end = str(int(line[2]) + start_coord)
                    else:
                        raise Exception("Invalid strand!")
                    new_line = line.copy()
                    new_line[1] = new_start
                    new_line[2] = new_end
                    # to make it a BED6
                    new_line = new_line[:-2]
                    writer.writerow(new_line)
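# Hypothetical invocation: a BED of regions from 500 nt upstream to 100 nt
# downstream of each TSS:
# python3 <this script> genes.bed annotation.gtf TSS_regions.bed 500 100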
Example #32
def MSA_filter_by_anatomy(input_file, output_file, version):
    '''
    Given an output file from get_MSA_concat_list, filter the CDSs based on whether the exon coordinates have been conserved.
    '''
    run_process(["perl", "MSA_CDSs.pl", version, input_file, output_file])
Example #33
0
def main():
    description = "Run mDFEest."
    args = parse_arguments(description, ["hit_file", "control_file", "SNP_file", "SNP_number", "input_file", "output_file", "seed", "fixed_model", "new_input", "shuffle", "fix_pop_change"], ints = [3], flags = [8, 9, 10])
    hit_file, control_file, SNP_file, SNP_number, input_file, output_file, seed, fixed_model, new_input, shuffle, fix_pop_change = args.hit_file, args.control_file, args.SNP_file, args.SNP_number, args.input_file, args.output_file, args.seed, args.fixed_model, args.new_input, args.shuffle, args.fix_pop_change

    #if you want to generate a new input file rather than reading in an existing one
    if new_input:
        remove_file("../multidfe/{0}".format(input_file.split("/")[-1]))
        arguments = ["python3", "mDFEest_input.py", hit_file, control_file, SNP_file, SNP_number, input_file]
        if shuffle:
            arguments.append("--shuffle")
        run_process(arguments)
    
    if seed == "None":
        seed = None
    else:
        seed = float(seed)

    #if you want to run it only with a population size change model,
    #rather than both a model assuming population size change and a fixed population
    #size model
    if fix_pop_change:
        pop_change = [True]
    else:
        pop_change = [False, True]

    if fixed_model == "None":
        #all possible models
        allowed = ["lognormal", "gamma", "beta", "spikes", "steps", "fixed six spikes"]
        spike_range = [2, 6]
    else:
        #only the specified model
        allowed = [fixed_model]
        #only two-spike models
        spike_range = [2, 3]

    with open(output_file, "w") as file:
        file.write("model\tpop_change\tAIC\tNes_0.0_0.1\tNes_0.1_1.0\tNes_1.0_10.0\tNes_10.0_100.0\traw\n")
        for change_mode in pop_change:
    
            print("\nPopulation expansion: {0}.".format(str(change_mode)))

            if "lognormal" in allowed:
                print("lognormal model:")
                output = mDFEest("lognormal", input_file, pop_change = change_mode, seed = seed)
                print(output)
                write_mDFEest_output(output, file, change_mode)

            if "gamma" in allowed:
                print("gamma model:")
                output = mDFEest("gamma", input_file, pop_change = change_mode, seed = seed)
                print(output)
                write_mDFEest_output(output, file, change_mode)

            if "beta" in allowed:
                print("beta model:")
                output = mDFEest("beta", input_file, pop_change = change_mode, seed = seed)
                print(output)
                write_mDFEest_output(output, file, change_mode)

            for spike_number in range(spike_range[0], spike_range[1]):

                if "spikes" in allowed:
                    print("{0}-spikes model:".format(spike_number))
                    output = mDFEest("spikes", input_file, n_spikes = spike_number, seed = seed, repetitions = 10, pop_change = change_mode)
                    print(output)
                    write_mDFEest_output(output, file, change_mode)

                if "steps" in allowed:
                    print("{0}-steps model:".format(spike_number))
                    output = mDFEest("steps", input_file, n_spikes = spike_number, seed = seed, repetitions = 10, pop_change = change_mode)
                    print(output)
                    write_mDFEest_output(output, file, change_mode)

            if "fixed six spikes" in allowed:
                print("fixed six spikes model:")
                output = mDFEest("six_spikes", input_file, pop_change = change_mode, seed = seed)
                print(output)
                write_mDFEest_output(output, file, change_mode)
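
A hypothetical invocation of the script above, assuming it is saved as
run_mDFEest.py and that parse_arguments exposes the three flags as
--new_input, --shuffle and --fix_pop_change (the flag spellings are an
assumption and the file names are illustrative):

python3 run_mDFEest.py hits.txt controls.txt SNPs.txt 10000 mDFEest_input.txt mDFEest_output.txt None None --new_input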
Example #34
0
def mDFEest(model, input_file, n_spikes=None, repetitions=None, fold_SFS=True,
            pop_change=False, seed=None):
    '''
    Wraps call to multiDFEest.
    '''
    flags = []

    # convert booleans to the 0/1 values that MultiDFE expects
    fold_SFS = 1 if fold_SFS else 0
    # the inversion is intentional: this value is passed as -conpop, which
    # MultiDFE expects to be 1 for a constant population size and 0 when a
    # population size change is modelled
    pop_change = 0 if pop_change else 1

    #convert the English distribution names into multiDFEest model codes
    if model == "lognormal":
        model_code = 4
        #parameter number for calculating AIC
        par_number = 2
    elif model == "gamma":
        model_code = 2
        par_number = 2
    elif model == "beta":
        model_code = 3
        par_number = 2
    elif model == "spikes":
        model_code = 0
        if not n_spikes:
            raise Exception("To use a spikes model, you need to specify the number of spikes.")
        par_number = (2 * n_spikes) - 1
        flags = ["-ranrep", repetitions, "-nspikes", n_spikes]
    elif model == "steps":
        model_code = 1
        if not n_spikes:
            raise Exception("To use a steps model, you need to specify the number of steps.")
        par_number = (2 * n_spikes) - 1
        flags = ["-ranrep", repetitions, "-nspikes", n_spikes]
    elif model == "six_spikes":
        model_code = 5
        par_number = 5
        flags = ["-ranrep", repetitions]
    else:
        raise Exception("{0} is not a valid model name!".format(model))

    input_file_short = input_file.split("/")
    input_file_short = input_file_short[-1]

    #do the analysis in the directory where multiDFEest is stored
    if not os.path.exists("../multidfe/{0}".format(input_file_short)):
        run_process(["cp", input_file, "../multidfe"])
    MDE_output = "{0}.MAXL.out".format(input_file_short)
    current_dir = os.getcwd()
    os.chdir("../multidfe")
    arguments = ["./MultiDFE", "-N1", 100, "-conpop", pop_change, "-sfsfold", fold_SFS, "-selmode", model_code, "-file", input_file_short]
    if seed:
        seed_string = "GSL_RNG_SEED={0}".format(seed)
        arguments = [seed_string] + arguments
    arguments.extend(flags)
    print(" ".join([str(i) for i in arguments]))
    #run multiDFEest
    run_process(arguments)
    #parse output
    output = rw.read_many_fields(MDE_output, "\t")[0]
    output = [i.split(":") for i in output if ":" in i]
    output = {i[0]: float(i[1]) for i in output}
    #get the log likelihood and calculate AIC (AIC = 2k - 2*lnL)
    ll = output["L"]
    AIC = (2 * par_number) - (2 * ll)
    output["AIC"] = AIC
    if n_spikes:
        output["model"] = "{0}_{1}".format(model, n_spikes)
    else:
        output["model"] = model
    remove_file(MDE_output)
    os.chdir(current_dir)
    return output
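
A hypothetical call of the wrapper above, assuming the MultiDFE binary lives
in ../multidfe as the function expects (file name and seed are illustrative):

result = mDFEest("spikes", "sfs_input.txt", n_spikes=3, repetitions=10,
                 pop_change=True, seed=42)
print(result["model"])  # spikes_3
print(result["AIC"])    # 2 * 5 parameters - 2 * log likelihood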
Example #35
0
def get_ss_strength(exons,
                    genome_file,
                    upstream=True,
                    five=True,
                    exonic=3,
                    intronic=6):
    """
    Given a set of exons, get an estimate of splice site strength.
    :param exons: Dictionary of CDS lines.
    :param genome_file: File with genome sequence.
    :param upstream: evaluate the (5' or 3') splice site of the upstream intron (rather than downstream)
    :param five: evaluate the 5' splice site (rather than 3')
    :param exonic: how many nucleotides to include from the exon
    :param intronic: how many nucleotides to include from the intron
    :return: a dictionary with the splice site strength for each exon
    """
    # will contain the splice site strengths
    out_dict = {}
    # will contain the names of the exons so that, later on, we know which
    # splice site strength value goes with which exon
    names = []

    # write splice site coordinates to GTF
    hk.make_dir("temp_data")
    temp_file_name = "temp_data/ss_sequences.gtf"
    with open(temp_file_name, "w") as temp_file:
        writer = csv.writer(temp_file, delimiter="\t")
        for transcript in exons:
            curr_exons = exons[transcript]
            for pos, exon in enumerate(curr_exons):
                # don't analyze first exons (no upstream intron)
                if pos != 0:
                    # the last exon has no downstream intron
                    if upstream or (pos != len(curr_exons) - 1):
                        if five:
                            if upstream:
                                template = curr_exons[pos - 1].copy()
                            else:
                                template = exon.copy()
                            if template[6] == "+":
                                template[3] = template[4] - exonic + 1
                                template[4] = template[4] + intronic
                            elif template[6] == "-":
                                template[4] = template[3] + exonic - 1
                                template[3] = template[3] - intronic
                        else:
                            if upstream:
                                template = exon.copy()
                            else:
                                template = curr_exons[pos + 1].copy()
                            if template[6] == "+":
                                template[4] = template[3] + exonic - 1
                                template[3] = template[3] - intronic
                            elif template[6] == "-":
                                template[3] = template[4] - exonic + 1
                                template[4] = template[4] + intronic
                        # guard against the extension running off the start
                        # of short scaffolds etc.
                        if template[3] >= 0:
                            # so you'd know the order of the values in the MaxEntScan output
                            names.append("{0}.{1}".format(transcript, pos - 1))
                            writer.writerow(template)

    # make a FASTA with splice site sequences
    temp_fasta_file_name = "{0}.fasta".format(temp_file_name[:-4])
    hk.run_process([
        "bedtools", "getfasta", "-fi", genome_file, "-bed", temp_file_name,
        "-fo", temp_fasta_file_name, "-s"
    ])
    # filter FASTA for Ns
    fasta_lines = []
    with open(temp_fasta_file_name) as fasta:
        for line in fasta:
            if line[0] == ">":
                curr_name = line
            else:
                if "N" not in line:
                    fasta_lines.append(curr_name)
                    fasta_lines.append(line)
    with open(temp_fasta_file_name, "w") as fasta:
        for line in fasta_lines:
            fasta.write(line)

    # run MaxEntScan on the FASTA
    # lazy hardcoded path, replace as appropriate...
    mes_direct = "/Users/rsavisaar/Software/MaxEntScan/fordownload"
    if five:
        cmd = "{0}/score5.pl".format(mes_direct)
    else:
        cmd = "{0}/score3.pl".format(mes_direct)
    temp_mes_file_name = "{0}_mes.txt".format(temp_file_name[:-4])
    hk.run_process(["perl", cmd, temp_fasta_file_name],
                   file_for_output=temp_mes_file_name,
                   verbose=True)
    hk.remove_file(temp_fasta_file_name)
    hk.remove_file(temp_file_name)

    # read in splice site scores and store them in the output dictionary
    with open(temp_mes_file_name, newline="") as mes_file:
        reader = csv.reader(mes_file, delimiter="\t")
        for pos, line in enumerate(reader):
            out_dict[names[pos]] = float(line[1])
    hk.remove_file(temp_mes_file_name)
    return out_dict
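
Hypothetical usage of the function above, assuming exons maps transcript IDs
to lists of GTF-style exon records (integer coordinates in fields 3 and 4,
the strand in field 6) and that bedtools and MaxEntScan are installed at the
hard-coded paths; the file name is illustrative. This scores the 5' splice
site of each exon's upstream intron.

ss_strength = get_ss_strength(exons, "genome.fa", upstream=True, five=True,
                              exonic=3, intronic=6)
for exon_id, score in sorted(ss_strength.items()):
    print(exon_id, score)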