def test_align_larger(self):
    """Align the two sequences from a full-length FASTA file (YAL068C)
    and check that the overall alignment score matches the known value."""
    valid = "data/YAL068C.fasta"
    seq1, seq2 = parse_fasta.parse_fasta(valid).values()
    actual = align(seq1, seq2, get_scoring_matrix('default'))
    expected_score = 118
    # assertEquals is a deprecated alias (removed in Python 3.12) -- use assertEqual
    self.assertEqual(actual[1], expected_score,
                     "Alignment score is not correct")
    print_alignment(actual[0])
def test_tenuous(self):
    """Align a deliberately ambiguous ('tenuous') pair of sequences and
    verify both the aligned triples and the total score."""
    valid = "data/tenuous.fasta"
    seq1, seq2 = parse_fasta.parse_fasta(valid).values()
    # expected alignment as (char1, char2, match-symbol) triples:
    # '|' exact match, ':' mismatch
    expected_align = [('C', 'C', '|'), ('C', 'D', ':'), ('C', 'E', ':'),
                      ('C', 'C', '|'), ('D', 'C', ':'), ('E', 'C', ':'),
                      ('C', 'C', '|')]
    expected_score = 3
    actual = align(seq1, seq2, get_scoring_matrix('default'))
    # assertEquals is a deprecated alias (removed in Python 3.12) -- use assertEqual
    self.assertEqual(actual[0], expected_align, "Alignment is not correct")
    self.assertEqual(actual[1], expected_score,
                     "Alignment score is not correct")
    print_alignment(actual[0])
def test_align(self):
    """Align the small test pair and verify alignment triples and score."""
    # get some valid data from a test file
    valid = "data/small.fasta"
    seq1, seq2 = parse_fasta.parse_fasta(valid).values()
    # '-' paired with ' ' marks a gap inserted into the first sequence
    expected_align = [('A', 'A', '|'), ('-', 'C', ' '), ('D', 'D', '|'),
                      ('E', 'E', '|')]
    expected_score = 1
    actual = align(seq1, seq2, get_scoring_matrix('default'))
    # assertEquals is a deprecated alias (removed in Python 3.12) -- use assertEqual
    self.assertEqual(actual[0], expected_align, "Alignment is not correct")
    self.assertEqual(actual[1], expected_score,
                     "Alignment score is not correct")
    print_alignment(actual[0])
def transform_sequence_input(full_input, transform_function): output = [] # check whether the input is fasta-formatted or just sequence # full_input is a list of lines, so convert it to string or this won't work if str(full_input).count('>')>0: fasta = True else: fasta = False if debug: print '\n\t### INPUT:\n%s\t### END_INPUT\n'%full_input if fasta: for (header,seq) in parse_fasta.parse_fasta(full_input): output.append((header,transform_function(seq))) else: for line in full_input: output.append(transform_function(line)) if debug: print '\n\t######### FINAL OUTPUT: #########'%output return output
# (tail of the enclosing variable-site finder; its def line is outside this chunk)
for k, v in siteDict.items():
    # a column where every sequence shows the same character is invariant
    if len(set(v)) == 1:
        continue
    else:
        # record the site as a 1-based alignment position
        var.append(k + 1)
return var


if __name__ == "__main__":
    if len(sys.argv[1:]) == 0:
        # no arguments given: show the help text rather than crash
        sys.argv.append("-h")
    parser = argparse.ArgumentParser()
    parser.add_argument("alignment", help="FASTA-formatted alignment \
to extract variable sites")
    args = parser.parse_args()
    seqs = dict([x for x in parse_fasta(args.alignment)])
    sites = get_site_dict(seqs)
    # print(sites)
    variable = get_variable(sites)
    print(variable)
    # print each sequence restricted to the variable sites only
    for k, v in seqs.items():
        print(">" + k)
        outSeq = ""
        for i, j in enumerate(v):
            # NOTE(review): `variable` stores 1-based positions (k + 1 above)
            # while `i` here is 0-based — looks like an off-by-one; confirm
            # against get_site_dict's key convention.
            if i in variable:
                outSeq += j
        print(outSeq)
if __name__ == "__main__":
    # Compare per-column residue proportions between two groups of sequences
    # (selected by name prefix) within one FASTA alignment.
    if len(sys.argv[1:]) == 0:
        sys.argv.append("-h")
    parser = argparse.ArgumentParser()
    parser.add_argument("sequence", help="Alignment in FASTA format")
    parser.add_argument("flag1", help="Flag to recognise sequences \
in group 1. Must be first substring of name.")
    parser.add_argument("flag2", help="Flag to recognise sequences \
in group 2. Must be first substring of name.")
    parser.add_argument("-d", "--distance", help="distance \
metric to use [euc (default)/jsd]", default="euc")
    args = parser.parse_args()
    # parse the FASTA file once (the original re-parsed it for each group)
    records = [x for x in parse_fasta(args.sequence)]
    seqs1 = dict([x for x in records if x[0].startswith(args.flag1)])
    seqs2 = dict([x for x in records if x[0].startswith(args.flag2)])
    cols1 = get_columns(seqs1)
    cols2 = get_columns(seqs2)
    colProps1 = calc_col_prop(cols1)
    # print(colProps1)
    colProps2 = calc_col_prop(cols2)
    # print(colProps2)
    if args.distance == "euc":
        dist = calc_euclidean(colProps1, colProps2)
    elif args.distance == "jsd":
        dist = calc_jsd_scipy(colProps1, colProps2)
    else:
        print("distance argument not recognised")
        # BUG FIX: without exiting here, code following this block would hit
        # a NameError on `dist` (the sibling script for two alignments exits
        # at this point; made consistent with it).
        sys.exit()
import numpy as np
import minineedle  # https://github.com/scastlara/minineedle
import miniseq  # https://github.com/scastlara/miniseq

if __name__ == '__main__':
    with open('sets.json', 'r') as f:
        sets = json.load(f)
    # map directory name -> first sequence found in that directory's sequence.fa
    sequences = dict()
    for dp, dn, filenames in os.walk('.'):
        for f in filenames:
            if f == 'sequence.fa':
                data = parse_fasta(os.path.join(dp, f))
                name = os.path.basename(dp)
                sequences[name] = data['sequences'][0]
    set_a = sets['training_set']
    set_b = sets['benchmark_set_membrane']
    # BUG FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `float` is the documented replacement (alias of float64).
    homologies = np.empty((len(set_a), len(set_b)), dtype=float)
    # pairwise Needleman-Wunsch global alignment identity between the sets
    for idx_a, a in enumerate(set_a):
        for idx_b, b in enumerate(set_b):
            seq_a = miniseq.Protein(a, sequences[a])
            seq_b = miniseq.Protein(b, sequences[b])
            alignment = minineedle.Needleman(seq_a, seq_b)
            alignment.align()
            seq_a, seq_b = alignment.alseq1, alignment.alseq2
            n = float(len(seq_a))
            # fraction of aligned positions that are identical
            identity = sum(i == j for i, j in zip(seq_a, seq_b)) / n
import sys
import argparse
from parse_fasta import parse_fasta

if __name__ == "__main__":
    # Copy the gap pattern of an empirical alignment onto a simulated
    # alignment with matching sequence names, then print the result.
    if len(sys.argv[1:]) == 0:
        sys.argv.append("-h")
    ap = argparse.ArgumentParser()
    ap.add_argument("-a", "--alignment", help="Empirical alignment, \
in FASTA format.")
    ap.add_argument("-s", "--simulated", help="Simulated alignment, \
in FASTA format.")
    args = ap.parse_args()
    emp_dict = dict([x for x in parse_fasta(args.alignment)])
    sim_dict = dict([x for x in parse_fasta(args.simulated)])
    for k, v in sim_dict.items():
        if k not in emp_dict:
            # BUG FIX: the KeyError came from emp_dict, so the key is missing
            # from the EMPIRICAL data (the old message said "simulated");
            # also add the missing newline and warn once per sequence instead
            # of once per column.
            sys.stderr.write(k + " not in empirical data, skipping\n")
            continue
        for n, _ in enumerate(v):
            # propagate each empirical gap into the simulated sequence
            if emp_dict[k][n] == "-":
                sim_dict[k] = sim_dict[k][:n] + "-" + sim_dict[k][n+1:]
    for k, v in sim_dict.items():
        print(">" + k)
        print(v)
def analyze(seq1_filename: str, seq2_filename: str, nucleotides: bool = False):
    """
    Performs an alignment-based analysis on the sequences in the provided
    FASTA files. `nucleotides` should be `True` if the two filenames refer
    to nucleotide sequences (this enables the sliding-window dN/dS analysis).

    Side effects: writes the formatted alignment, a human-readable metadata
    file, a JSON metadata file (one set per cluster count), a dN/dS graph
    for nucleotide input, and cluster graphs into the output directory.
    """
    # Read first sequence; keep its header for the metadata report
    seq_name, seq1 = list(parse_fasta(get_data(seq1_filename)))[0]
    # Analyze second sequence
    print("Analyzing %s..." % seq2_filename)
    seq2 = list(parse_fasta(get_data(seq2_filename)))[0][1]
    alignment_result = align_sequences(seq2, seq1, nucleotides=nucleotides)
    largest_mismatch_pos, largest_mismatch = alignment_result.largest_mismatch(
    )
    # similarity = 1 - (mismatches / aligned length)
    percent_similarity = 1 - (alignment_result.hamming_distance() /
                              alignment_result.get_alignment_length())
    if nucleotides:
        trimmed_alignment_1, trimmed_alignment_2 = trim_for_dnds(
            alignment_result)
        # one dN/dS series per configured window size
        dnds_ratio_data = [
            sliding_window_dnds(trimmed_alignment_1, trimmed_alignment_2,
                                window_size=i) for i in DNDS_WINDOW_SIZES
        ]
        make_dnds_graph(f"{seq2_filename}_dnds_ratios.png",
                        DNDS_WINDOW_SIZES, dnds_ratio_data)
    for clusters in CLUSTER_COUNTS:
        clustered_mismatches = alignment_result.clustered_mismatches(
            cluster_count=clusters)
        clustered_mismatch_variance = alignment_result.clustered_mismatch_variance(
            cluster_count=clusters)
        # Output Supplementary Data 4
        make_output_dir()
        with open(get_output(f"{seq2_filename}_{clusters}.aln.txt"),
                  "w+") as f:
            f.write(alignment_result.format_result(line_length=100))
        with open(get_output(f"{seq2_filename}_{clusters}.meta.txt"),
                  "w+") as f:
            f.write(
                "Formatted metadata -- not for programmatic use.\n\n" +
                f"Information for alignment with {seq_name}:\n\n" +
                f"Percent similarity: {percent_similarity}\n" +
                f"Largest mismatch location: {largest_mismatch_pos}\n" +
                f"Largest mismatch size: {largest_mismatch}bp\n" +
                f"Variance between clusters ({clusters} clusters): {clustered_mismatch_variance}\n" +
                f"Clustered mismatches: {clustered_mismatches}\n")
        # machine-readable copy of the same metadata
        with open(get_output(f"{seq2_filename}_{clusters}.meta.json"),
                  "w+") as f:
            json_output = {
                "percent_similarity": percent_similarity,
                "largest_mismatch_pos": largest_mismatch_pos,
                "largest_mismatch": largest_mismatch,
                "clustered_mismatch_variance": clustered_mismatch_variance,
                "clustered_mismatches": clustered_mismatches,
            }
            json.dump(
                json_output,
                f,
            )
    make_cluster_graphs(seq2_filename, alignment_result)
if __name__ == "__main__":
    # Load several group alignments (columns assumed to match) and prepare
    # per-column proportions for each; continues beyond this chunk.
    if len(sys.argv[1:]) == 0:
        sys.argv.append("-h")
    parser = argparse.ArgumentParser()
    parser.add_argument("alignments", help="alignments for each group, \
columns matching", nargs="+")
    args = parser.parse_args()
    # print(len(args.alignments))
    alns = {}
    for a in args.alignments:
        # NOTE(review): index() returns the FIRST occurrence, so duplicate
        # filenames on the command line would collide on one key — confirm
        # duplicates are not expected.
        alns[args.alignments.index(a)] = dict([x for x in parse_fasta(a)])
    lens = []
    for v in alns.values():
        # NOTE(review): len(v) is the number of sequences in the alignment,
        # not its column length — verify this matches the message below.
        lens.append(len(v))
    if len(set(lens)) > 1:
        print("alignments are not of the same length!")
        sys.exit()
    # print(alns)
    alnsCols = {}
    for k, v in alns.items():
        alnsCols[k] = get_columns(v)
    # print(alnsCols)
    colsProps = {}
    # (loop body continues past the end of this chunk)
    for k, v in alnsCols.items():
# CLI body: compare per-column residue proportions between two separate
# alignments whose columns correspond. (Presumably sits under an unseen
# `if __name__ == "__main__":` guard — confirm against the full file.)
if len(sys.argv[1:]) == 0:
    sys.argv.append("-h")
parser = argparse.ArgumentParser()
parser.add_argument("aln1", help="first group alignment")
parser.add_argument("aln2", help="second group alignment, \
columns matching")
parser.add_argument("-d", "--distance", help="distance \
metric to use [euc (default)/jsd]", default="euc")
args = parser.parse_args()
seqs1 = dict([x for x in parse_fasta(args.aln1)])
seqs2 = dict([x for x in parse_fasta(args.aln2)])
cols1 = get_columns(seqs1)
cols2 = get_columns(seqs2)
colProps1 = calc_col_prop(cols1)
# print(colProps1) # debug
colProps2 = calc_col_prop(cols2)
# print(colProps2) # debug
# pick the distance metric; bail out on anything unrecognised
if args.distance == "euc":
    dist = calc_euclidean(colProps1, colProps2)
elif args.distance == "jsd":
    dist = calc_jsd_scipy(colProps1, colProps2)
else:
    print("distance argument not recognised")
    sys.exit()
print("pos,dist")
if __name__ == "__main__":
    # Parse arguments
    parser = argparse.ArgumentParser(
        description=
        "Run a Monte-Carlo simulation for CsTSI and CsGSI alignment.")
    parser.add_argument("--id", dest="simulation_id", type=int)
    parser.add_argument("--trials", dest="n_trials", type=int)
    args = parser.parse_args()
    simulation_id = args.simulation_id
    n_trials = args.n_trials
    # Read CsTSI sequence (first record's sequence only)
    cstsi_seq = list(parse_fasta(get_data(CSTSI_PROTEIN)))[0][1]
    # Analyze CsGSI sequence
    print("Analyzing %s..." % CSGSI_PROTEIN)
    csgsi_seq = list(parse_fasta(get_data(CSGSI_PROTEIN)))[0][1]
    alignment_result = align_sequences(csgsi_seq, cstsi_seq,
                                       nucleotides=False)
    # report mismatch-cluster variance for each configured cluster count
    for clusters in CLUSTER_COUNTS:
        print(
            f"Variance between clusters ({clusters} clusters): {str(alignment_result.clustered_mismatch_variance(cluster_count=clusters))}"
        )
    # Simulate random sequences (the monte_carlo call continues beyond
    # the end of this chunk)
    simulation_result = monte_carlo(
        get_clustering_simulation_fn(cstsi_seq, csgsi_seq),
#! /usr/bin/python3
import os
import sys
from parse_fasta import parse_fasta

# Pull selected (1-based) alignment columns out of a FASTA alignment and
# print each sequence reduced to just those columns.

if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: python " + sys.argv[0] + " aln col1 col2 col3 ...")
        sys.exit(0)
    sequences = dict(parse_fasta(sys.argv[1]))
    # convert the 1-based column arguments to 0-based indices
    wanted = [int(c) - 1 for c in sys.argv[2:]]
    for name in sequences.keys():
        print(">" + name)
        residues = sequences[name]
        print("".join(residues[i] for i in wanted))
def main():
    """Command-line entry point: read exactly two sequences from a FASTA
    file, build a scoring matrix, run the global alignment, and print the
    score and the alignment."""
    parser = argparse.ArgumentParser(description='CBioVikings Global Sequence Alignment Example')
    parser.add_argument('-f', '--fasta', required=True, dest='fasta',
                        help='Fasta file to read sequences from')
    parser.add_argument('-m', '--scoring_matrix', dest='scoring_matrix',
                        help='Similarity matrix to use for scoring. Default=identity')
    parser.add_argument('--debug', action='store_true',
                        help='Print debug output')
    parser.add_argument('-v', '--verbose', action='store_true', dest='verbose',
                        help='Print verbose output')
    args = parser.parse_args()
    if args.verbose:
        sys.stderr.write("# CBioVikings Global Sequence Alignment\n")
        sys.stderr.write("# fasta file: " + str(args.fasta) + "\n")
        sys.stderr.write("# scoring matrix: " + str(args.scoring_matrix) + "\n")
        if args.debug:
            sys.stderr.write("# debug is on\n")
    sequences = None
    scoring_matrix = None
    seq1 = None
    seq2 = None
    # read sequences from the fasta file provided
    if args.fasta:
        try:
            # sequences is a dictionary of the fasta entries
            sequences = parse_fasta.parse_fasta(args.fasta)
        except Exceptions.EmptyFasta:
            report_error("There are no sequences in the fasta file " + args.fasta)
        # our algorithm will only align two sequences
        if len(sequences.keys()) != 2:
            report_error("Fasta file does not contain 2 sequences")
        else:
            seq1, seq2 = sequences.values()
    # get the similarity scoring matrix to use
    if args.scoring_matrix:
        try:
            # BUG FIX: the option is stored under dest='scoring_matrix', so
            # the original args.matrix raised AttributeError here.
            scoring_matrix = global_align.get_scoring_matrix(args.scoring_matrix)
        except Exceptions.MissingMatrixType:
            report_error("That matrix type is not available")
    else:
        try:
            scoring_matrix = global_align.get_scoring_matrix('default')
        except Exceptions.MissingMatrixType:
            report_error("Default matrix is not available")
    # now actually do the alignment
    if sequences and scoring_matrix:
        try:
            alignment, score = global_align.align(seq1, seq2, scoring_matrix)
        except:
            raise
        print("Alignment Score is: " + str(score))
        # print the alignment
        if alignment:
            global_align.print_alignment(alignment)
the gap reconstruction as fixed, i.e. preferring the \
gap state if PP >= 0.5")
    args = parser.parse_args()
    fastmlDir = args.fastml_dir
    # normalise the directory path so filenames can be appended directly
    if fastmlDir[-1] != "/":
        fastmlDir += "/"
    nodeFile = args.nodefile
    nodes = read_node_file(nodeFile)
    # FastML output files expected inside the given directory
    probFile = fastmlDir + "prob.marginal.csv"
    gapFile = fastmlDir + "Ancestral_MaxMarginalProb_Char_Indel.txt"
    indelProb = fastmlDir + "IndelsMarginalProb.txt"
    seqFile = fastmlDir + "seq.marginal_IndelAndChars.txt"
    # default gap handling to "prob" unless the user overrode it
    if args.gapmode is None:
        gapMode = "prob"
    else:
        gapMode = args.gapmode
    seqs = dict([x for x in parse_fasta(seqFile)])
    # write one MAP + AltAll pair of ancestral sequences per node of interest
    for k, v in nodes.items():
        mapSeq = seqs[k]
        altAllDict = main(probFile, gapFile, indelProb, k, gapMode)
        # take the first (only) reconstructed sequence from the result
        altAllSeq = [x for x in altAllDict.values()][0]
        with open(v + ".pep.fa", "w") as outf:
            outf.write(">" + v + "_MAP\n")
            outf.write(mapSeq + "\n")
            outf.write(">" + v + "_AltAll\n")
            outf.write(altAllSeq + "\n")
# CLI body: split one alignment into two groups by name prefix and rank
# columns by euclidean distance between group residue proportions.
# (Presumably sits under an unseen `if __name__ == "__main__":` guard.)
if len(sys.argv[1:]) == 0:
    sys.argv.append("-h")
parser = argparse.ArgumentParser()
parser.add_argument("-s", "--sequence", help="Alignment in FASTA format")
parser.add_argument("-f1", "--flag1", help="Flag to recognise sequences \
in group 1. Must be first substring of name.")
parser.add_argument("-f2", "--flag2", help="Flag to recognise sequences \
in group 2. Must be first substring of name.")
args = parser.parse_args()
# select the two groups by their name prefixes
seqs1 = dict(
    [x for x in parse_fasta(args.sequence) if x[0].startswith(args.flag1)])
seqs2 = dict(
    [x for x in parse_fasta(args.sequence) if x[0].startswith(args.flag2)])
cols1 = get_columns(seqs1)
cols2 = get_columns(seqs2)
colProp1 = calc_col_prop(cols1)
# print(colProp1)
colProp2 = calc_col_prop(cols2)
# print(colProp2)
dist = calc_euclidean(colProp1, colProp2)
print("pos,dist")
# pair each position (sorted by its distance, descending) with its distance;
# both lists are sorted by the same key so they line up; +1 makes positions
# 1-based for output
tmp = [(a + 1, b)
       for a, b in zip(sorted(dist, key=dist.get, reverse=True),
                       sorted(dist.values(), reverse=True))]
for i in tmp:
    print(str(i[0]) + "," + str(i[1]))
# (tail of the enclosing site-dict builder; its def line is outside this chunk)
return siteDict


if __name__ == "__main__":
    # Build a site-correspondence table between two alignments via their
    # joint profile alignment; continues beyond this chunk.
    if len(sys.argv[1:]) == 0:
        sys.argv.append("-h")
    parser = argparse.ArgumentParser()
    parser.add_argument("a1", help="master alignment, in FASTA")
    parser.add_argument("a2", help="alignment to compare, in FASTA")
    parser.add_argument("pa", help="profile alignment of a1 and a2, \
in FASTA")
    args = parser.parse_args()
    aln1 = dict([x for x in parse_fasta(args.a1)])
    aln2 = dict([x for x in parse_fasta(args.a2)])
    # the profile alignment lists a1's records first, then a2's
    proAln1 = dict([x for x in parse_fasta(args.pa)][:len(aln1)])
    proAln2 = dict([x for x in parse_fasta(args.pa)][len(aln1):])
    proAln1Sites = get_site_dict(proAln1)
    proAln2Sites = get_site_dict(proAln2)
    corres = []
    # 1-based position counters into each original alignment
    posAln1 = 1
    posAln2 = 1
    for i in range(len(proAln1Sites)):
        if all([x == "-" for x in proAln1Sites[i]]):  # gap in aln1
            # 0 marks "no corresponding column in aln1"
            corres.append((0, posAln2))
            posAln2 += 1
        else:  # char in aln1
            # (branch body continues past the end of this chunk)
            if all([x == "-" for x in proAln2Sites[i]]):  # gap in aln2
def main():
    """Command-line entry point: read exactly two sequences from a FASTA
    file, choose a similarity matrix, run the global alignment, and print
    the resulting alignment."""
    parser = argparse.ArgumentParser(
        description='CBioVikings Global Sequence Alignment Example')
    parser.add_argument('-f', '--fasta', required=True, dest='fasta',
                        help='Fasta file to read sequences from')
    parser.add_argument(
        '-m', '--matrix', dest='matrix',
        help='Similarity matrix to use for scoring. Default=identity')
    parser.add_argument('--debug', action='store_true',
                        help='Print debug output')
    parser.add_argument('-v', '--verbose', action='store_true', dest='verbose',
                        help='Print verbose output')
    args = parser.parse_args()
    if args.verbose:
        sys.stderr.write("# CBioVikings Global Sequence Alignment\n")
        sys.stderr.write("# fasta file: " + str(args.fasta) + "\n")
        sys.stderr.write("# matrix: " + str(args.matrix) + "\n")
        if args.debug:
            sys.stderr.write("# debug is on\n")
    # read sequences from the fasta file provided
    if args.fasta:
        try:
            # sequences is a dictionary of the fasta entries
            sequences = parse_fasta.parse_fasta(args.fasta)
        except Exception:
            # FIX: was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt; Exception keeps the same best-effort report.
            report_error("There are no sequences in the fasta file " + args.fasta)
        # our algorithm will only align two sequences
        if len(sequences.keys()) != 2:
            report_error("Fasta file does not contain 2 sequences")
        else:
            seq1, seq2 = sequences.values()
    # TODO check if we have a nucleotide or protein fasta file
    # get the similarity matrix to use
    if args.matrix:
        try:
            matrix = global_align.get_matrix(args.matrix)
        except Exception:  # FIX: narrowed from bare except
            report_error("That matrix type is not available")
    else:
        try:
            matrix = global_align.get_matrix('default')
        except Exception:  # FIX: narrowed from bare except
            report_error("Default matrix is not available")
    # now actually do the alignment
    if sequences and matrix:
        try:
            alignment = global_align.align(seq1, seq2, matrix)
        except:
            raise  # TODO nice errors
        # print the alignment
        if alignment:
            global_align.print_alignment(alignment)
of site, node, state, and probability, in the style \
of FastML's Ancestral_MaxMarginalProb_Char_Indel.txt")
    args = parser.parse_args()
    # robust substitutions require the per-site probability table
    if args.robust and args.probs is None:
        sys.stderr.write("must specify probability file (-p) to calculate " +
                         "robust substitutions\n")
        sys.exit()
    # read the newick tree string; NOTE: if the file has several lines,
    # only the last non-empty one is kept
    with open(args.tree, "r") as t:
        for s in t:
            s = s.strip()
            nwkString = s
    curroot = tree_reader.read_tree_string(nwkString)
    branches = get_anc_desc(curroot)
    seqs = dict([x for x in parse_fasta(args.sequences)])
    # print(seqs)
    # annotate each branch with its substitutions (robust mode filters by
    # the probability table)
    if args.robust:
        probs = parse_probs(args.probs)
        add_subs_robust(branches, seqs, probs, args.gaps)
    else:
        add_subs(branches, seqs, args.gaps)
    # tab-separated report: one line per parent/child branch
    print("parent\tchild\tsubs")
    print(curroot.label)
    for k, v in branches.items():
        print(k[0] + "\t" + k[1] + "\t" + ",".join(v))
def main():
    """Load the peptide database and scan the MGF spectra against it."""
    # parse_fasta returns a dict: mass -> [(string, suffixMasses), ...]
    peptide_db = parse_fasta("bigger.fasta")
    ordered_masses = sorted(peptide_db.keys())
    parse_mgf("test.mgf", peptide_db, ordered_masses)