Example #1
from smallBixTools import smallBixTools as st  # same import as in the other examples
import os


def main(infile, out_dir, gaps):
    records = st.fasta_to_dct(infile)
    in_dir, in_fn = os.path.split(infile)

    base = in_fn[:in_fn.rfind(".")]
    gly_fn = os.path.join(out_dir, base + "_glycans_pos.out")
    bin_fn = os.path.join(out_dir, base + "_glycans_binary.out")

    gly_fw = open(gly_fn, "w")
    bin_fw = open(bin_fn, "w")

#    regex_pattern = r'N[\-]*[^P\-][\-]*[TS]'
    regex_pattern = r'N(?=[\-]*[^P][\-]*[TS])'  # raw string; the lookahead finds overlapping sites

    summary_gly_pos = []
    summary_binary = []

    for key, seq in records.items():
        sites = get_glycan_sites(seq, regex_pattern, gaps)
        binary_sites = get_binary_sites(seq, regex_pattern)

        gly_fw.write(">"+key+"\n"+str(sites)+"\n")
        bin_fw.write(">"+key+"\n"+str(binary_sites)+"\n")

        if len(sites) > 1:
            summary_gly_pos.append(sites)
            summary_binary.append(binary_sites)

    gly_fw.close()
    bin_fw.close()
    
    print("end")
def main(infile, outdir, case_sens):
    """
    Takes an aligned fasta file and an output directory.
    Reduces the input fasta file to only the columns with differences.
    Writes the result to a file in the output directory, with the same filename as the input file, except with
    a "_condensed.fasta" extension.
    Can ignore case.
    :param infile: the fasta file to use as source
    :param outdir: the place to write the output to.
    :param case_sens: bool indicating whether the case of the characters in the sequences counts.
    :return: no return
    """

    # confirm that there are no duplicate names in the infile.
    seqids = []
    with open(infile, "r") as fh:
        for line in fh:
            if line.startswith(">"):
                seqids.append(line[1:].strip())
    if len(set(seqids)) != len(seqids):
        print(
            "It appears that there are duplicate sequence IDs in your fasta format file.\nNote that we cannot "
            "currently handle duplicate sequence IDs.\nPlease ensure unique IDs and try again.\nNow exiting."
        )
        sys.exit(1)

    dct = st.fasta_to_dct(infile)
    # confirm that there is at least one sequence in the infile.
    if len(dct) < 1:
        print(
            "It appears that there is no readable fasta format data in the infile specified.\nPlease confirm the "
            "input file and try again.\nNow exiting.")
        sys.exit(1)

    seqs = list(dct.values())
    print("Input alignment is {} sites long.".format(len(seqs[0])))

    # account for case sensitivity setting from user
    if not case_sens:
        seqs = [s.upper() for s in seqs]

    # Check the sequences in the infile are the same length
    if len(set(map(len, seqs))) != 1:
        print(
            "Not all sequences in the input file are the same length.\nPlease make sure you specify an alignment "
            "with equal-length sequences.\nNow exiting")
        sys.exit(1)

    # finally, call condense
    condense(infile, outdir, case_sens)

    print("Completed. \nNow exiting")
    sys.exit(0)
def main(in_fn, out_fn, side, prcnt):

    # if side == 'fwd':
    #     print("trimming from the front of the alignment.")
    # elif side == 'rev':
    #     print("trimming from the end of the alignment.")
    # elif side == 'both':
    #     print("trimming from both the front and the end.")

    dct = st.fasta_to_dct(in_fn)
    seqs = list(dct.values())
    seq_count = len(seqs)
    #print("we have %s sequences in the input file. " %seq_count)

    #prcnt = 90
    for i in range(len(seqs[0])):  # columns (the original "- 1" skipped the last column)
        pos_gap_count = 0
        for j in range(seq_count):  # rows
            if seqs[j][i] == "-":
                pos_gap_count += 1
        if (pos_gap_count / seq_count * 100.0) < prcnt:
            break
    front_trim = i
    #print("counting from the front, we got to col.: %s" %front_trim)

    for i in range(len(seqs[0]) - 1, -1, -1):  # columns, counting down, i.e. from back to front.
        pos_gap_count = 0
        for j in range(seq_count):  # rows
            if seqs[j][i] == "-":
                pos_gap_count += 1
        if (pos_gap_count / seq_count * 100.0) < prcnt:
            break
    rev_trim = i
    #print("counting from the end, we got to col.: %s" %rev_trim)

    dct2 = {}
    for k, v in dct.items():
        if side == 'fwd':
            dct2[k] = v[front_trim:]
        elif side == 'rev':
            dct2[k] = v[:rev_trim + 1]
        elif side == 'both':
            dct2[k] = v[front_trim:rev_trim + 1]

    st.dct_to_fasta(dct2, out_fn)

    #print("ending")
    sys.exit(0)
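# To see what the column scans above do, a minimal sketch of the front-trim
# scan on a made-up three-sequence alignment: the loop stops at the first
# column whose gap percentage drops below prcnt.
seqs = ["--ACGT",
        "--ACGT",
        "-AACGT"]
prcnt = 50  # stop once fewer than 50% of rows are gapped at a column
for i in range(len(seqs[0])):
    gap_pct = sum(s[i] == "-" for s in seqs) / len(seqs) * 100.0
    if gap_pct < prcnt:
        break
print(i)  # 2 -- columns 0 and 1 are mostly gaps, column 2 has none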
def condense(file_to_condense, outdir, caseSensitive):
    """
    Takes the filename of a fasta file,
    removes all conserved sites, and
    writes out the reduced file, containing no conserved sites.
    :param file_to_condense: a fasta file path.
    :param outdir: the place to write the output file to
    :param caseSensitive: bool indicating whether the case of the characters in the sequences counts. If True, then
    we should respect the case of the sequences: "a" != "A".
    :return: no return
    """

    # establish filenames
    base_filename = os.path.splitext(os.path.split(file_to_condense)[1])[0]
    out_fn = os.path.join(outdir, base_filename + "_condensed.fasta")

    # read in file into a dictionary
    d = st.fasta_to_dct(file_to_condense)
    seqids, seqs = [], []
    positions_to_pop = []

    for k, v in d.items():
        seqids.append(k)
        if not caseSensitive:
            seqs.append(v.upper())
        else:
            seqs.append(v)

    # step over each site finding sites which have no variability.
    for i in range(len(seqs[0])):  # columns of the sequence.
        pos_j_chars = []
        for j in range(len(seqs)):  # sequences within the file.
            pos_j_chars.append(seqs[j][i])
        if len(set(pos_j_chars)) == 1:
            positions_to_pop.append(i)

    # reverse the list, so we pop from the back
    positions_to_pop = positions_to_pop[::-1]

    for i in positions_to_pop:
        for seqid in d.keys():
            seq = d[seqid]
            mod_seq = seq[:i] + seq[(i + 1):]
            d[seqid] = mod_seq

    seqs = list(d.values())
    print("After removing conserved sites, the alignment has {} sites.".format(
        len(seqs[0])))
    st.dct_to_fasta(d, out_fn)
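# As a sanity check, the conserved-column removal in condense() boils down to
# the following (a toy sketch with made-up sequences, no file I/O):
seqs = {"s1": "ACGTA",
        "s2": "ACTTA",
        "s3": "ACGTC"}
ncols = len(next(iter(seqs.values())))
keep = [i for i in range(ncols)
        if len({s[i] for s in seqs.values()}) > 1]  # variable columns only
condensed = {k: "".join(v[i] for i in keep) for k, v in seqs.items()}
print(condensed)  # {'s1': 'GA', 's2': 'TA', 's3': 'GC'}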
Example #5
def main(in_fasta, outpath, indx):
    print("")
    dct = st.fasta_to_dct(in_fasta)
    dct_of_dcts = {}

    # Build a dictionary of dictionaries. Inner dictionary belongs to 1 patient.
    # All sequences for 1 patient end up in 1 dictionary.
    for k, v in dct.items():
        pt_id = k.split("_")[indx]
        if pt_id in dct_of_dcts:
            dct_of_dcts[pt_id][k] = v
        else:
            dct_of_dcts[pt_id] = {k: v}

    majority_variants = {}
    for pt_id, inner_dct in dct_of_dcts.items():
        tmp_dct = {}
        for k, v in inner_dct.items():
            if v in tmp_dct:
                tmp_dct[v] += 1
            else:
                tmp_dct[v] = 1
        majority_variants[pt_id] = max(tmp_dct.items(),
                                       key=operator.itemgetter(1))[0]
    # dictionary of pt_id: sequence
    #print(majority_variants)

    outfile = os.path.join(outpath,
                           "normalized_distances_from_majority_variant.csv")
    with open(outfile, "w") as handle:
        handle.write(
            "participant,Normalised_hamming_distance_adjusted_(changes_per_100_bases),sequence_id\n"
        )

        # for every sequence for a patient, calculate the normalized distance to the majority variant.
        for pt_id, inner_dct in dct_of_dcts.items():
            this_majority = majority_variants[pt_id]
            for k, v in inner_dct.items():
                dist = st.customdist(v, this_majority)
                norm_dist = dist / len(v)
                normadjustdist_perc = round(norm_dist * 100, 2)

                handle.write(
                    ",".join([str(x)
                              for x in [pt_id, normadjustdist_perc, k]]) +
                    "\n")
Example #6
def main(in_fasta, outpath):
    dct = st.fasta_to_dct(in_fasta)
    all_distances = []
    outfile = os.path.join(outpath, "pairwise_distances.csv")
    with open(outfile, "w") as handle:
        handle.write("seq_id1,seq_id2,Normalised_hamming_distance_adjusted_(changes_per_100_bases)\n")

        for k1, v1 in dct.items():
            for k2, v2 in dct.items():
                if k1 != k2:  # note: every pair is visited twice, once in each order
                    # alignments = pairwise2.align.globalxx(v1, v2)
                    # s1 = alignments[0][0]
                    # s2 = alignments[0][1]
                    dist = st.customdist(v1, v2)
                    norm_dist = dist / len(v1)
                    normadjustdist_perc = round(norm_dist * 100, 2)

                    handle.write(",".join([str(x) for x in [k1, k2, normadjustdist_perc]]) + "\n")
                    all_distances.append(normadjustdist_perc)
    if all_distances:  # guard against ZeroDivisionError when fewer than two sequences
        print(sum(all_distances) / len(all_distances))
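# st.customdist is not shown in these examples. Assuming it behaves like a
# plain Hamming distance over aligned, equal-length sequences, a hypothetical
# stand-in could look like this:
def hamming(s1, s2):
    assert len(s1) == len(s2), "sequences must be aligned to equal length"
    return sum(a != b for a, b in zip(s1, s2))

dist = hamming("ACGT", "ACTT")   # 1 difference
print(round(dist / 4 * 100, 2))  # 25.0 changes per 100 bases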
def main(in_path, outpath, name):

    # set the outfile name
    name = name + "_divergence.csv"
    outfile = pathlib.Path(outpath, name).absolute()

    # write the headings to the outfile
    with open(outfile, "w") as handle:
        handle.write("participant,Normalised_hamming_distance_adjusted_(changes_per_100_bases),sequence_id\n")

    # get files
    in_files = pathlib.Path(in_path).glob("*sep.fasta")
    for file in list(in_files):
        print(file)
        seqs_d = sb.fasta_to_dct(file)
        ref_file = str(file).replace("sep.fasta", "hap.fasta")

        # get ref
        ref_record = next(SeqIO.parse(ref_file, "fasta"))
        ref_seq = str(ref_record.seq)
        ref_name = ref_record.name
        print(ref_name)

        # calculate the divergence from the reference for each sequence
        for seq_name, seq in seqs_d.items():
            if len(seq) != len(ref_seq):
                print("input sequence and reference sequence were not the same length.")
                sys.exit()
            else:
                participant_id = seq_name.split("_")[0]
                normadjustdist_perc = round((sb.customdist(seq.upper(), ref_seq.upper()) / len(seq)) * 100, 2)

                with open(outfile, "a") as handle:
                    handle.write(f"{participant_id},{normadjustdist_perc},{seq_name}\n")

    print("Divergence calculations are complete")
def main(in_fn, region, ft, out_good_fn, out_bad_fn):
    # example command /uct/dev/code/small_bix_tools$ python3.4 select_HIV_sequences.py -in /uct/dev/source/smallBixTools/swipe_hiv_region_selection/2016_01_27_MurrayLogan_Murray-Logan_SFF_nomatch.fasta -region gag -out_matching /uct/dev/source/smallBixTools/swipe_hiv_region_selection/out_matching -out_nonmatching /uct/dev/source/smallBixTools/swipe_hiv_region_selection/out_nonmatching

    # software location
    # /home/dave/Software/swipe-2.0.5/swipe

    # database: made from makeblastdb. we have made nucleotide databases
    # -d /uct/ref/own_made_blastdb/hxb2_gag/hxb2_gag_blastdb

    # is the database a protein / nucleotide database. In our case we are using nucleotide - so we use 0 here.
    # -p 0

    # input file. We are using a fasta file for input here
    # -i /uct/dev/source/smallBixTools/swipe_hiv_region_selection/2016_01_27_MurrayLogan_Murray-Logan_SFF_nomatch.fasta

    # number of threads to use. Running on a machine which reports to have 8 cores, I run with 7 threads,
    # and it does not max any of them. In fact, each only gets to about 40%. Running 4 instances of swipe
    # simultaneously on the same machine with 8 cores, each instance being told it can use 7 threads, all the cores
    # get to about 70% each. -- Makes me want to try specifying many more threads (32) on an 8-core machine.
    # -a 7

    # output format [0,7-9=plain,xml,tsv,tsv+]. Valid options are: 0, 7, 8, 9.
    # -m 7

    # strand. we want to search forward, and reverse. It takes longer, but we can be more sure.
    # -S 3

    # the output file
    # -o /uct/dev/source/smallBixTools/swipe_hiv_region_selection/swipe_outfile
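    # Put together, the options above describe a command line like the
    # following. A minimal, illustrative sketch (paths are the example values
    # from the comments and will differ on another machine); it is not part
    # of the original flow, so it is left commented out:
    #
    #   import subprocess
    #   cmd = ["/home/dave/Software/swipe-2.0.5/swipe",
    #          "-d", "/uct/ref/own_made_blastdb/hxb2_gag/hxb2_gag_blastdb",
    #          "-p", "0", "-i", in_fn, "-a", "7", "-m", "7", "-S", "3",
    #          "-o", "swipe_outfile"]
    #   subprocess.run(cmd, check=True)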

    print("main")
    print("input file: %s" % in_fn)
    print("gene region: %s" % region)
    print("ft: %s" % ft)
    print("output matching file: %s" % out_good_fn)
    print("output non-matching file: %s" % out_bad_fn)
    print("\n\n\n")

    # first part:
    # TODO

    # swipe outfile: /uct/dev/source/smallBixTools/swipe_hiv_region_selection/swipe_outfile_m8
    #swipe_fn = "/uct/dev/source/smallBixTools/swipe_hiv_region_selection/swipe_outfile_m8"
    #swipe_fn = "/uct/dev/source/smallBixTools/swipe_hiv_region_selection/swipe_outfile_m8_adjusted"
    swipe_fn = "/uct/Breakthrough/analysis/HVTN503/2016-07-07_data_transfer/29/demultiplex/swipe_pol_out_m8"
    # second part.
    # splitting a fasta file (based on swipe outfile) into "good" vs. "bad".

    dct = st.fasta_to_dct(in_fn)
    dct2 = {}
    for k, v in dct.items():
        dct2[k] = {
            'seq': v,
            'len': len(v),
        }

    print(len(dct))

    # parsing swipe output
    swipe_data = []
    with open(swipe_fn, "r") as fh:
        for line in fh:
            swipe_data.append(line.strip().split("\t"))
    print(len(swipe_data))
    print(swipe_data[0])

    longest_aln_swipe_dct = {}
    for row in swipe_data:
        seq_id = row[0]
        aln_len = int(row[3])  # compare as int: string comparison would rank "9" above "10"
        if seq_id in longest_aln_swipe_dct:
            if aln_len > int(longest_aln_swipe_dct[seq_id][3]):
                longest_aln_swipe_dct[seq_id] = row
        else:
            longest_aln_swipe_dct[seq_id] = row

    print(
        "we have selected only the alignments from the swipe results, where if there were two alignments, "
        "the longest is represented here.")
    print("len of this: %s" % (len(longest_aln_swipe_dct)))

    # get all the sequences from the fasta file where the alignment length is longer than prcnt percent of the read length
    longer_alignments = []
    shorter_alignments = []
    prcnt = 60
    for k, v in longest_aln_swipe_dct.items():
        seq_id = v[0]
        ident = v[2]
        aln_len = v[3]
        try:
            aln_len = int(aln_len)
        except Exception as e:
            print(e)
            sys.exit()

        seq_len = dct2[seq_id]['len']

        if (aln_len / seq_len) < prcnt / 100.0:
            shorter_alignments.append(v)
        else:
            longer_alignments.append(v)

    with open("short_alignments.fasta", "w") as fw:
        for row in shorter_alignments:
            fw.write(">" + row[0] + "\n" + dct[row[0]] + "\n")
    with open("longer_alignments.fasta", "w") as fw:
        for row in longer_alignments:
            fw.write(">" + row[0] + "\n" + dct[row[0]] + "\n")

    print(
        "of all the longest alignments, the number which were below %s percent aligned are: %s"
        % (prcnt, len(shorter_alignments)))
    print("the number which were above %s percent aligned are: %s" %
          (prcnt, len(longer_alignments)))

    print("ending")
    sys.exit(0)
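# The swipe output parsed above is BLAST-style tabular (m8) text. A minimal,
# illustrative helper for reading one row into named fields (the column
# meanings follow the standard BLAST tabular layout; this helper is an
# assumption, not part of the source):
from collections import namedtuple

M8Row = namedtuple("M8Row", "query subject pct_ident aln_len mismatches "
                            "gap_opens q_start q_end s_start s_end evalue bits")

def parse_m8_line(line):
    fields = line.rstrip("\n").split("\t")
    return M8Row(*fields)  # all fields stay strings; cast aln_len etc. as needed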
Example #9
#!/anaconda/bin/python3.5

from smallBixTools import smallBixTools as st

import os
import sys

filename = sys.argv[1]
outfile = filename+"subs.csv"


fastaDict = st.fasta_to_dct(filename)

seqID_list = [] 
# here we can create a list to keep track of what we have compared already.

# lets open a file for writing
# it can be in the same place as where we found the input file.
# we will use "os" to get the pieces of the filename.

# os was already imported at the top of the script, so we can use it directly.
root, fn = os.path.split(filename)
# This will give us a tuple like this: ('/home/dave/scratch/aa_subs', 'example_inputfile.fasta')
# we can catch the returns as 2 variables:          root                      fn

# we can create a new output filename using the root part

# now that we have a filename to write to, we can create a handle. Python uses handles to read and write files on the filesystem.
filewriter_handle = open(outfile, "w") # we want to open this file for writing, so we use the "w". The other options are: "r" for reading, or "a" for appending.
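# A note on style: the idiomatic way to open files is a "with" block, which
# closes the handle automatically even if an error occurs. A minimal sketch of
# the same open-for-writing step:
#
# with open(outfile, "w") as filewriter_handle:
#     filewriter_handle.write("some,csv,row\n")
# # the handle is closed here; no explicit .close() needed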