def test_align_larger(self):
     """Align two yeast (YAL068C) sequences and check the expected global score."""
     valid = "data/YAL068C.fasta"
     seq1, seq2 = parse_fasta.parse_fasta(valid).values()
     actual = align(seq1, seq2, get_scoring_matrix('default'))
     expected_score = 118
     # BUG FIX: assertEquals is a deprecated alias, removed in Python 3.12.
     self.assertEqual(actual[1], expected_score, "Alignment score is not correct")
     print_alignment(actual[0])
 def test_tenuous(self):
     """Check both the alignment tuples and the score for a tenuous match."""
     valid = "data/tenuous.fasta"
     seq1, seq2 = parse_fasta.parse_fasta(valid).values()
     expected_align = [('C', 'C', '|'), ('C', 'D', ':'), ('C', 'E', ':'), ('C', 'C', '|'), ('D', 'C', ':'),
                       ('E', 'C', ':'), ('C', 'C', '|')]
     expected_score = 3
     actual = align(seq1, seq2, get_scoring_matrix('default'))
     # BUG FIX: assertEquals is a deprecated alias, removed in Python 3.12.
     self.assertEqual(actual[0], expected_align, "Alignment is not correct")
     self.assertEqual(actual[1], expected_score, "Alignment score is not correct")
     print_alignment(actual[0])
 def test_align(self):
     """Check alignment and score on a small example with a single gap."""
     # get some valid data from a test file
     valid = "data/small.fasta"
     seq1, seq2 = parse_fasta.parse_fasta(valid).values()
     expected_align = [('A', 'A', '|'), ('-', 'C', ' '), ('D', 'D', '|'), ('E', 'E', '|')]
     expected_score = 1
     actual = align(seq1, seq2, get_scoring_matrix('default'))
     # BUG FIX: assertEquals is a deprecated alias, removed in Python 3.12.
     self.assertEqual(actual[0], expected_align, "Alignment is not correct")
     self.assertEqual(actual[1], expected_score, "Alignment score is not correct")
     print_alignment(actual[0])
def transform_sequence_input(full_input, transform_function):
    output = []
    # check whether the input is fasta-formatted or just sequence
    # full_input is a list of lines, so convert it to string or this won't work
    if str(full_input).count('>')>0:    fasta = True
    else:                               fasta = False
    if debug: print '\n\t### INPUT:\n%s\t### END_INPUT\n'%full_input
    if fasta:
        for (header,seq) in parse_fasta.parse_fasta(full_input):
            output.append((header,transform_function(seq)))
    else:
        for line in full_input:
            output.append(transform_function(line))
    if debug:   print '\n\t######### FINAL OUTPUT: #########'%output
    return output
    # NOTE(review): orphaned fragment -- this loop appears to belong to a
    # get_variable(siteDict) helper from a different source file; 'siteDict'
    # and 'var' are undefined in this scope and the code is unreachable
    # after the 'return' above. Confirm its origin before relying on it.
    for k, v in siteDict.items():
        if len(set(v)) == 1:
            # column is invariant (a single distinct character) -- skip it
            continue
        else:
            # record the 1-based position of the variable column
            var.append(k + 1)
    return var


if __name__ == "__main__":
    # With no arguments, fall back to the help screen.
    if not sys.argv[1:]:
        sys.argv.append("-h")

    parser = argparse.ArgumentParser()
    parser.add_argument("alignment", help="FASTA-formatted alignment \
                        to extract variable sites")
    args = parser.parse_args()

    # Load the alignment and report which columns are variable.
    seqs = dict(parse_fasta(args.alignment))
    variable = get_variable(get_site_dict(seqs))
    print(variable)

    # Re-emit each sequence restricted to the variable positions.
    for name, seq in seqs.items():
        print(">" + name)
        # NOTE(review): get_variable appears to return 1-based positions
        # while enumerate() is 0-based -- confirm there is no off-by-one.
        print("".join(ch for pos, ch in enumerate(seq) if pos in variable))
if __name__ == "__main__":
    # With no arguments, fall back to the help screen.
    if len(sys.argv[1:]) == 0:
        sys.argv.append("-h")

    parser = argparse.ArgumentParser()
    parser.add_argument("sequence", help="Alignment in FASTA format")
    parser.add_argument("flag1", help="Flag to recognise sequences \
                        in group 1. Must be first substring of name.")
    parser.add_argument("flag2", help="Flag to recognise sequences \
                        in group 2. Must be first substring of name.")
    parser.add_argument("-d", "--distance", help="distance \
                        metric to use [euc (default)/jsd]",
                        default="euc")
    args = parser.parse_args()

    # Split the alignment into two groups by sequence-name prefix.
    seqs1 = dict([x for x in parse_fasta(args.sequence)
                 if x[0].startswith(args.flag1)])
    seqs2 = dict([x for x in parse_fasta(args.sequence)
                 if x[0].startswith(args.flag2)])
    cols1 = get_columns(seqs1)
    cols2 = get_columns(seqs2)
    colProps1 = calc_col_prop(cols1)
    # print(colProps1)  # debug
    colProps2 = calc_col_prop(cols2)
    # print(colProps2)  # debug
    # Per-column distance between the two groups' residue proportions.
    if args.distance == "euc":
        dist = calc_euclidean(colProps1, colProps2)
    elif args.distance == "jsd":
        dist = calc_jsd_scipy(colProps1, colProps2)
    else:
        print("distance argument not recognised")
        # BUG FIX: exit on an unrecognised metric so later code cannot
        # touch an undefined 'dist' (matches the sibling script in this
        # file, which calls sys.exit() here).
        sys.exit()
Esempio n. 7
0
import numpy as np

import minineedle  # https://github.com/scastlara/minineedle
import miniseq  # https://github.com/scastlara/miniseq

if __name__ == '__main__':

    # NOTE(review): 'json', 'os' and 'parse_fasta' are used below but not
    # imported in this fragment -- confirm they are imported elsewhere.
    with open('sets.json', 'r') as f:
        sets = json.load(f)

    sequences = dict()

    # Collect the first sequence from every 'sequence.fa' under the cwd,
    # keyed by the name of its containing directory.
    for dp, dn, filenames in os.walk('.'):
        for f in filenames:
            if f == 'sequence.fa':
                data = parse_fasta(os.path.join(dp, f))
                name = os.path.basename(dp)
                sequences[name] = data['sequences'][0]

    set_a = sets['training_set']
    set_b = sets['benchmark_set_membrane']
    # BUG FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin float is the documented replacement (same dtype).
    homologies = np.empty((len(set_a), len(set_b)), dtype=float)
    for idx_a, a in enumerate(set_a):
        for idx_b, b in enumerate(set_b):
            # Global (Needleman-Wunsch) alignment of each training/benchmark pair.
            seq_a = miniseq.Protein(a, sequences[a])
            seq_b = miniseq.Protein(b, sequences[b])
            alignment = minineedle.Needleman(seq_a, seq_b)
            alignment.align()
            seq_a, seq_b = alignment.alseq1, alignment.alseq2
            n = float(len(seq_a))
            # Fraction of identical aligned positions.
            # NOTE(review): 'identity' is never stored into 'homologies' in
            # this fragment -- the loop body may be truncated here; confirm.
            identity = sum(i == j for i, j in zip(seq_a, seq_b)) / n
import sys
import argparse
from parse_fasta import parse_fasta


if __name__ == "__main__":
    # With no arguments, fall back to the help screen.
    if len(sys.argv[1:]) == 0:
        sys.argv.append("-h")

    ap = argparse.ArgumentParser()
    ap.add_argument("-a", "--alignment", help="Empirical alignment, \
                    in FASTA format.")
    ap.add_argument("-s", "--simulated", help="Simulated alignment, \
                    in FASTA format.")
    args = ap.parse_args()

    emp_dict = dict([x for x in parse_fasta(args.alignment)])
    sim_dict = dict([x for x in parse_fasta(args.simulated)])

    # Copy gap positions from the empirical alignment onto the
    # corresponding simulated sequences.
    for k, v in sim_dict.items():
        if k not in emp_dict:
            # BUG FIX: the KeyError came from emp_dict, so the original
            # message ("not in simulated data") named the wrong dataset and
            # was re-emitted for every position; warn once, with a newline.
            sys.stderr.write(k + " not in empirical data, skipping\n")
            continue
        for n, _ in enumerate(v):
            if emp_dict[k][n] == "-":
                sim_dict[k] = sim_dict[k][:n]+"-"+sim_dict[k][n+1:]

    # Emit the gap-masked simulated alignment in FASTA format.
    for k, v in sim_dict.items():
        print(">"+k)
        print(v)
Esempio n. 9
0
def analyze(seq1_filename: str, seq2_filename: str, nucleotides: bool = False):
    """
    Performs an alignment-based analysis on the sequences in the provided FASTA files.
    `nucleotides` should be `True` if the two filenames refer to nucleotide sequences.

    Aligns the first record of `seq2_filename` against the first record of
    `seq1_filename`, then, for each configured cluster count, writes the
    formatted alignment plus human-readable and JSON metadata files, and
    (for nucleotide input) a sliding-window dN/dS graph.
    """
    # Read first sequence -- only the first FASTA record is used
    seq_name, seq1 = list(parse_fasta(get_data(seq1_filename)))[0]

    # Analyze second sequence
    print("Analyzing %s..." % seq2_filename)

    seq2 = list(parse_fasta(get_data(seq2_filename)))[0][1]
    alignment_result = align_sequences(seq2, seq1, nucleotides=nucleotides)

    largest_mismatch_pos, largest_mismatch = alignment_result.largest_mismatch(
    )
    # Fraction of matching positions over the full alignment length.
    percent_similarity = 1 - (alignment_result.hamming_distance() /
                              alignment_result.get_alignment_length())

    if nucleotides:
        # dN/dS is only meaningful for coding nucleotide sequences.
        trimmed_alignment_1, trimmed_alignment_2 = trim_for_dnds(
            alignment_result)
        dnds_ratio_data = [
            sliding_window_dnds(trimmed_alignment_1,
                                trimmed_alignment_2,
                                window_size=i) for i in DNDS_WINDOW_SIZES
        ]
        make_dnds_graph(f"{seq2_filename}_dnds_ratios.png", DNDS_WINDOW_SIZES,
                        dnds_ratio_data)

    # One set of output files per configured cluster count.
    for clusters in CLUSTER_COUNTS:
        clustered_mismatches = alignment_result.clustered_mismatches(
            cluster_count=clusters)
        clustered_mismatch_variance = alignment_result.clustered_mismatch_variance(
            cluster_count=clusters)

        # Output Supplementary Data 4
        make_output_dir()
        with open(get_output(f"{seq2_filename}_{clusters}.aln.txt"),
                  "w+") as f:
            f.write(alignment_result.format_result(line_length=100))

        # Human-readable metadata summary.
        with open(get_output(f"{seq2_filename}_{clusters}.meta.txt"),
                  "w+") as f:
            f.write(
                "Formatted metadata -- not for programmatic use.\n\n" +
                f"Information for alignment with {seq_name}:\n\n" +
                f"Percent similarity: {percent_similarity}\n" +
                f"Largest mismatch location: {largest_mismatch_pos}\n" +
                f"Largest mismatch size: {largest_mismatch}bp\n" +
                f"Variance between clusters ({clusters} clusters): {clustered_mismatch_variance}\n"
                + f"Clustered mismatches: {clustered_mismatches}\n")

        # Machine-readable copy of the same metadata.
        with open(get_output(f"{seq2_filename}_{clusters}.meta.json"),
                  "w+") as f:
            json_output = {
                "percent_similarity": percent_similarity,
                "largest_mismatch_pos": largest_mismatch_pos,
                "largest_mismatch": largest_mismatch,
                "clustered_mismatch_variance": clustered_mismatch_variance,
                "clustered_mismatches": clustered_mismatches,
            }

            json.dump(
                json_output,
                f,
            )

    make_cluster_graphs(seq2_filename, alignment_result)
Esempio n. 10
0
if __name__ == "__main__":
    # With no arguments, fall back to the help screen.
    if len(sys.argv[1:]) == 0:
        sys.argv.append("-h")

    parser = argparse.ArgumentParser()
    parser.add_argument("alignments",
                        help="alignments for each group, \
                        columns matching",
                        nargs="+")
    args = parser.parse_args()

    # print(len(args.alignments))

    # Load each alignment into a dict keyed by its command-line position.
    # NOTE(review): list.index(a) returns the FIRST occurrence, so passing
    # the same filename twice would overwrite one entry -- enumerate()
    # would be safer; confirm duplicate arguments cannot happen.
    alns = {}
    for a in args.alignments:
        alns[args.alignments.index(a)] = dict([x for x in parse_fasta(a)])
        # NOTE(review): this compares the number of sequences per
        # alignment, even though the message mentions "length" -- confirm
        # which invariant is intended.
        lens = []
        for v in alns.values():
            lens.append(len(v))
            if len(set(lens)) > 1:
                print("alignments are not of the same length!")
                sys.exit()
    # print(alns)

    # Column view of every alignment.
    alnsCols = {}
    for k, v in alns.items():
        alnsCols[k] = get_columns(v)
    # print(alnsCols)

    colsProps = {}
    # NOTE(review): this fragment appears truncated -- the body of the
    # loop below is missing from this chunk of the file.
    for k, v in alnsCols.items():
    if len(sys.argv[1:]) == 0:
        sys.argv.append("-h")

    parser = argparse.ArgumentParser()
    parser.add_argument("aln1", help="first group alignment")
    parser.add_argument("aln2",
                        help="second group alignment, \
                        columns matching")
    parser.add_argument("-d",
                        "--distance",
                        help="distance \
                        metric to use [euc (default)/jsd]",
                        default="euc")
    args = parser.parse_args()

    seqs1 = dict([x for x in parse_fasta(args.aln1)])
    seqs2 = dict([x for x in parse_fasta(args.aln2)])
    cols1 = get_columns(seqs1)
    cols2 = get_columns(seqs2)
    colProps1 = calc_col_prop(cols1)
    # print(colProps1)  # debug
    colProps2 = calc_col_prop(cols2)
    # print(colProps2)  # debug
    if args.distance == "euc":
        dist = calc_euclidean(colProps1, colProps2)
    elif args.distance == "jsd":
        dist = calc_jsd_scipy(colProps1, colProps2)
    else:
        print("distance argument not recognised")
        sys.exit()
    print("pos,dist")
Esempio n. 12
0

if __name__ == "__main__":
    # Parse arguments
    parser = argparse.ArgumentParser(
        description=
        "Run a Monte-Carlo simulation for CsTSI and CsGSI alignment.")
    parser.add_argument("--id", dest="simulation_id", type=int)
    parser.add_argument("--trials", dest="n_trials", type=int)
    args = parser.parse_args()

    simulation_id = args.simulation_id
    n_trials = args.n_trials

    # Read CsTSI sequence (first record, sequence part only)
    cstsi_seq = list(parse_fasta(get_data(CSTSI_PROTEIN)))[0][1]

    # Analyze CsGSI sequence
    print("Analyzing %s..." % CSGSI_PROTEIN)

    csgsi_seq = list(parse_fasta(get_data(CSGSI_PROTEIN)))[0][1]
    alignment_result = align_sequences(csgsi_seq, cstsi_seq, nucleotides=False)

    # Report the real variance, then compare against random simulations.
    for clusters in CLUSTER_COUNTS:
        print(
            f"Variance between clusters ({clusters} clusters): {str(alignment_result.clustered_mismatch_variance(cluster_count=clusters))}"
        )

        # Simulate random sequences
        # NOTE(review): this fragment is truncated -- the monte_carlo(...)
        # call is cut off mid-argument-list in this chunk of the file.
        simulation_result = monte_carlo(
            get_clustering_simulation_fn(cstsi_seq, csgsi_seq),
Esempio n. 13
0
#! /usr/bin/python3

import os
import sys
from parse_fasta import parse_fasta

# extracts alignment columns given in args.

if __name__ == "__main__":
    # Require at least an alignment file and one column number.
    if len(sys.argv) < 3:
        print("Usage: python " + sys.argv[0] + " aln col1 col2 col3 ...")
        sys.exit(0)

    # Alignment as a name -> sequence mapping.
    seqDict = dict(parse_fasta(sys.argv[1]))

    # Convert the 1-based column arguments to 0-based indices.
    wanted = [int(col) - 1 for col in sys.argv[2:]]
    for name in seqDict:
        print(">" + name)
        print("".join(seqDict[name][i] for i in wanted))
def main():
    """Read two sequences from a FASTA file and print their global alignment
    and score.

    Command line: -f/--fasta (required), -m/--scoring_matrix, --debug,
    -v/--verbose.
    """
    parser = argparse.ArgumentParser(description='CBioVikings Global Sequence Alignment Example')
    parser.add_argument('-f', '--fasta',
                        required=True,
                        dest='fasta',
                        help='Fasta file to read sequences from')
    parser.add_argument('-m', '--scoring_matrix',
                        dest='scoring_matrix',
                        help='Similarity matrix to use for scoring.  Default=identity')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Print debug output')
    parser.add_argument('-v', '--verbose',
                        action='store_true',
                        dest='verbose',
                        help='Print verbose output')
    args = parser.parse_args()

    if args.verbose:
        sys.stderr.write("# CBioVikings Global Sequence Alignment\n")
        sys.stderr.write("# fasta file: " + str(args.fasta) + "\n")
        sys.stderr.write("# scoring matrix: " + str(args.scoring_matrix) + "\n")
        if args.debug:
            sys.stderr.write("# debug is on\n")

    sequences = None
    scoring_matrix = None
    seq1 = None
    seq2 = None
    # read sequences from the fasta file provided
    if args.fasta:

        try:
            # sequences is a dictionary of the fasta entries
            sequences = parse_fasta.parse_fasta(args.fasta)
        except Exceptions.EmptyFasta:
            report_error("There are no sequences in the fasta file " + args.fasta)

        # our algorithm will only align two sequences
        if len(sequences.keys()) != 2:
            report_error("Fasta file does not contain 2 sequences")
        else:
            seq1, seq2 = sequences.values()

    # get the similarity scoring matrix to use
    if args.scoring_matrix:
        try:
            # BUG FIX: the option's dest is 'scoring_matrix', but the
            # original read the nonexistent attribute 'args.matrix', which
            # raised AttributeError whenever -m was supplied.
            scoring_matrix = global_align.get_scoring_matrix(args.scoring_matrix)
        except Exceptions.MissingMatrixType:
            report_error("That matrix type is not available")
    else:
        try:
            scoring_matrix = global_align.get_scoring_matrix('default')
        except Exceptions.MissingMatrixType:
            report_error("Default matrix is not available")

    # now actually do the alignment
    if sequences and scoring_matrix:
        try:
            alignment, score = global_align.align(seq1, seq2, scoring_matrix)
        except:
            raise  # re-raise: alignment failures are reported as-is

        print("Alignment Score is: "+str(score))
        # print the alignment
        if alignment:
            global_align.print_alignment(alignment)
0
                    the gap reconstruction as fixed, i.e. preferring the \
                    gap state if PP >= 0.5")
    args = parser.parse_args()

    fastmlDir = args.fastml_dir
    if fastmlDir[-1] != "/":
        fastmlDir += "/"
    nodeFile = args.nodefile

    nodes = read_node_file(nodeFile)
    probFile = fastmlDir + "prob.marginal.csv"
    gapFile = fastmlDir + "Ancestral_MaxMarginalProb_Char_Indel.txt"
    indelProb = fastmlDir + "IndelsMarginalProb.txt"
    seqFile = fastmlDir + "seq.marginal_IndelAndChars.txt"
    if args.gapmode is None:
        gapMode = "prob"
    else:
        gapMode = args.gapmode

    seqs = dict([x for x in parse_fasta(seqFile)])

    for k, v in nodes.items():
        mapSeq = seqs[k]
        altAllDict = main(probFile, gapFile, indelProb, k, gapMode)
        altAllSeq = [x for x in altAllDict.values()][0]
        with open(v + ".pep.fa", "w") as outf:
            outf.write(">" + v + "_MAP\n")
            outf.write(mapSeq + "\n")
            outf.write(">" + v + "_AltAll\n")
            outf.write(altAllSeq + "\n")
Esempio n. 16
0
    if len(sys.argv[1:]) == 0:
        sys.argv.append("-h")

    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--sequence", help="Alignment in FASTA format")
    parser.add_argument("-f1",
                        "--flag1",
                        help="Flag to recognise sequences \
                        in group 1. Must be first substring of name.")
    parser.add_argument("-f2",
                        "--flag2",
                        help="Flag to recognise sequences \
                        in group 2. Must be first substring of name.")
    args = parser.parse_args()

    seqs1 = dict(
        [x for x in parse_fasta(args.sequence) if x[0].startswith(args.flag1)])
    seqs2 = dict(
        [x for x in parse_fasta(args.sequence) if x[0].startswith(args.flag2)])
    cols1 = get_columns(seqs1)
    cols2 = get_columns(seqs2)
    colProp1 = calc_col_prop(cols1)
    # print(colProp1)
    colProp2 = calc_col_prop(cols2)
    # print(colProp2)
    dist = calc_euclidean(colProp1, colProp2)
    print("pos,dist")
    tmp = [(a + 1, b) for a, b in zip(sorted(dist, key=dist.get, reverse=True),
                                      sorted(dist.values(), reverse=True))]
    for i in tmp:
        print(str(i[0]) + "," + str(i[1]))
    return siteDict


if __name__ == "__main__":
    # With no arguments, fall back to the help screen.
    if len(sys.argv[1:]) == 0:
        sys.argv.append("-h")

    parser = argparse.ArgumentParser()
    parser.add_argument("a1", help="master alignment, in FASTA")
    parser.add_argument("a2", help="alignment to compare, in FASTA")
    parser.add_argument("pa",
                        help="profile alignment of a1 and a2, \
                        in FASTA")
    args = parser.parse_args()

    aln1 = dict([x for x in parse_fasta(args.a1)])
    aln2 = dict([x for x in parse_fasta(args.a2)])
    # The profile alignment holds aln1's records first, then aln2's;
    # split it back into the two groups by record count.
    proAln1 = dict([x for x in parse_fasta(args.pa)][:len(aln1)])
    proAln2 = dict([x for x in parse_fasta(args.pa)][len(aln1):])
    proAln1Sites = get_site_dict(proAln1)
    proAln2Sites = get_site_dict(proAln2)

    # Walk the profile columns, tracking the current position in each
    # original alignment; a 0 appears to mark an all-gap column (confirm).
    corres = []
    posAln1 = 1
    posAln2 = 1
    for i in range(len(proAln1Sites)):
        if all([x == "-" for x in proAln1Sites[i]]):  # gap in aln1
            corres.append((0, posAln2))
            posAln2 += 1
        else:  # char in aln1
            # NOTE(review): fragment truncated -- the body of this branch
            # is missing from this chunk of the file.
            if all([x == "-" for x in proAln2Sites[i]]):  # gap in aln2
Esempio n. 18
0
def main():
    """Read two sequences from a FASTA file and print their global alignment.

    Command line: -f/--fasta (required), -m/--matrix, --debug, -v/--verbose.
    """
    parser = argparse.ArgumentParser(
        description='CBioVikings Global Sequence Alignment Example')
    parser.add_argument('-f',
                        '--fasta',
                        required=True,
                        dest='fasta',
                        help='Fasta file to read sequences from')
    parser.add_argument(
        '-m',
        '--matrix',
        dest='matrix',
        help='Similarity matrix to use for scoring.  Default=identity')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Print debug output')
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        dest='verbose',
                        help='Print verbose output')
    args = parser.parse_args()

    if args.verbose:
        sys.stderr.write("# CBioVikings Global Sequence Alignment\n")
        sys.stderr.write("# fasta file: " + str(args.fasta) + "\n")
        sys.stderr.write("# matrix: " + str(args.matrix) + "\n")
        if args.debug:
            sys.stderr.write("# debug is on\n")

    # Initialised so the guards below cannot raise NameError when
    # report_error() returns instead of exiting.
    sequences = None
    matrix = None
    seq1 = None
    seq2 = None

    # read sequences from the fasta file provided
    if args.fasta:
        try:
            # sequences is a dictionary of the fasta entries
            sequences = parse_fasta.parse_fasta(args.fasta)
        # BUG FIX: a bare 'except:' also swallowed SystemExit and
        # KeyboardInterrupt; catch ordinary errors only.
        except Exception:
            report_error("There are no sequences in the fasta file " +
                         args.fasta)

        # our algorithm will only align two sequences
        if sequences is not None and len(sequences.keys()) != 2:
            report_error("Fasta file does not contain 2 sequences")
        elif sequences is not None:
            seq1, seq2 = sequences.values()

    # TODO check if we have a nucleotide or protein fasta file

    # get the similarity matrix to use
    if args.matrix:
        try:
            matrix = global_align.get_matrix(args.matrix)
        except Exception:
            report_error("That matrix type is not available")
    else:
        try:
            matrix = global_align.get_matrix('default')
        except Exception:
            report_error("Default matrix is not available")

    # now actually do the alignment
    if sequences and matrix:
        try:
            alignment = global_align.align(seq1, seq2, matrix)
        except:
            raise  # TODO nice errors

        # print the alignment
        if alignment:
            global_align.print_alignment(alignment)
                        of site, node, state, and probability, in the style \
                        of FastML's Ancestral_MaxMarginalProb_Char_Indel.txt")
    args = parser.parse_args()

    if args.robust and args.probs is None:
        sys.stderr.write("must specify probability file (-p) to calculate " +
                         "robust substitutions\n")
        sys.exit()

    with open(args.tree, "r") as t:
        for s in t:
            s = s.strip()
            nwkString = s

    curroot = tree_reader.read_tree_string(nwkString)
    branches = get_anc_desc(curroot)

    seqs = dict([x for x in parse_fasta(args.sequences)])
    # print(seqs)

    if args.robust:
        probs = parse_probs(args.probs)
        add_subs_robust(branches, seqs, probs, args.gaps)
    else:
        add_subs(branches, seqs, args.gaps)

    print("parent\tchild\tsubs")
    print(curroot.label)
    for k, v in branches.items():
        print(k[0] + "\t" + k[1] + "\t" + ",".join(v))
Esempio n. 20
0
def main():
    """Index peptides from 'bigger.fasta' and match them against 'test.mgf'."""
    # parse_fasta returns mass -> [(peptide_string, suffix_masses), ...]
    peptide_index = parse_fasta("bigger.fasta")
    masses_in_order = sorted(peptide_index.keys())
    parse_mgf("test.mgf", peptide_index, masses_in_order)