def __init__(self, alnFile1, alnFile2):
    """Load two FASTA alignments and remember where they came from.

    Both files are parsed with ``read_fasta``; the name lists, the
    sequence lists and the original file paths are stored on the instance
    as ``name1``/``name2``, ``aln1``/``aln2`` and ``alnFile1``/``alnFile2``.
    """
    # Parse both files before touching the instance, so a failed read
    # leaves no attributes behind.
    first_names, first_seqs = read_fasta(alnFile1)
    second_names, second_seqs = read_fasta(alnFile2)

    self.name1, self.name2 = first_names, second_names
    self.aln1, self.aln2 = first_seqs, second_seqs
    self.alnFile1, self.alnFile2 = alnFile1, alnFile2
def write_alignment(sequence_file, fout):
    """Write the sequences of a FASTA file to ``fout`` as <sequence> elements.

    Every record read from ``sequence_file`` is emitted as a ``<sequence>``
    element holding a ``<taxon idref="..."/>`` reference followed by the
    sequence itself. Records whose taxon name is exactly "out" are omitted.
    """
    taxon_names, seq_aln = read_fasta(sequence_file)
    for label, residues in zip(taxon_names, seq_aln):
        if label == "out":
            continue  # the record named "out" is never written
        fout.write("\t\t<sequence>\n")
        fout.write("".join(("\t\t\t<taxon idref=\"", label, "\"/>\n")))
        fout.write("".join(("\t\t\t", residues, "\n")))
        fout.write("\t\t</sequence>\n")
def sub_merge(self, n1, n2):
    """Sample from both alignments, merge the samples, and return all three.

    ``n1`` and ``n2`` are passed to ``utils/sampling.py`` as the sample
    sizes for ``self.alnFile1`` and ``self.alnFile2``. The two samples are
    merged with Opal, the merged file is post-processed by
    ``utils/stdFAS.py``, and the three resulting alignments are read back.

    Returns:
        (sub_aln1, sub_aln2, sub_merged): the sequence lists of the first
        sample, the second sample and the merged alignment.

    Raises:
        subprocess.CalledProcessError: if any external command fails.

    NOTE(review): scratch files use fixed names in the current working
    directory, so concurrent calls from the same directory will collide.
    """
    import os  # local import: only the cleanup below needs it

    scratch_files = ("temp1.fas", "temp2.fas", "temp.fas", "merged.fas")
    try:
        # randomly sample sequences from aln1 and aln2
        subprocess.check_call(["python", "utils/sampling.py", self.alnFile1, "temp1.fas", str(n1)])
        subprocess.check_call(["python", "utils/sampling.py", self.alnFile2, "temp2.fas", str(n2)])
        # and call opal (or perhaps another merger) to merge them
        subprocess.check_call(["java", "-Xmx256M", "-jar", "opal_2.1.3/Opal.jar",
                               "--in", "temp1.fas", "--in2", "temp2.fas", "--out", "temp.fas"])
        subprocess.check_call(["utils/stdFAS.py", "temp.fas", "merged.fas"])

        # Only the sequences are returned; the name lists are not needed.
        _, sub_aln1 = read_fasta("temp1.fas")
        _, sub_aln2 = read_fasta("temp2.fas")
        _, sub_merged = read_fasta("merged.fas")
    finally:
        # Always clean up, including when a step above failed part-way;
        # files that were never created are simply skipped.
        for path in scratch_files:
            if os.path.exists(path):
                os.remove(path)

    return sub_aln1, sub_aln2, sub_merged
def main():
    """Concatenate the alignments named on the command line into one file.

    Every argument except the last is an input FASTA file; the last
    argument is the output path. Inputs are folded into one mapping with
    ``add_one_aln`` (which returns the running length ``L``), turned into
    name and sequence lists by ``print_concatenated``, and written with
    ``write_fasta``.

    Raises:
        SystemExit: if fewer than one input and one output are given.
    """
    # With no arguments argv[-1] is the script itself, which would be
    # handed to write_fasta as the output path; refuse to run instead.
    if len(argv) < 3:
        raise SystemExit("usage: <input1.fasta> [<input2.fasta> ...] <output.fasta>")

    concated = {}
    L = 0
    for seqfile in argv[1:-1]:
        print(seqfile)  # progress: name each input as it is processed
        newNames, newSeqs = read_fasta(seqfile)
        L = add_one_aln(concated, L, newNames, newSeqs)

    names, seqs = print_concatenated(concated, L)
    write_fasta(argv[-1], names, seqs)
#! /usr/bin/env python
# Print the relative frequency of every non-gap character found in the
# sequences of a FASTA file, one "<char> <frequency>" line per character,
# in sorted character order.
# Usage: <script> <input_fasta>
from sys import argv
from sequence_lib import read_fasta

input_file = argv[1]
names, sequences = read_fasta(input_file)

total = 0   # number of non-gap characters seen
freq = {}   # character -> occurrence count
for seq in sequences:
    for char in seq:
        if char == '-':
            continue  # gaps are excluded from both counts and total
        total += 1
        freq[char] = freq.get(char, 0) + 1

for char in sorted(freq):
    print(char + " " + str(float(freq[char]) / total))
#! /usr/bin/env python from sequence_lib import read_fasta, write_fasta from sys import argv seqfile = argv[1] reducedFile = argv[2] # output identicalSeqsFile = argv[3] # output names,sequences = read_fasta(seqfile) sorted_seqs = sorted((s,i) for i,s in enumerate(sequences)) reduced_names = [names[sorted_seqs[0][1]]] reduced_seqs = [sorted_seqs[0][0]] prev_seq = sorted_seqs[0] L = len(sorted_seqs) i=1 found_identical = False first_write = True with open(identicalSeqsFile,"w") as f: while i<L: if sorted_seqs[i][0] == sorted_seqs[i-1][0]: if not found_identical: if not first_write: f.write("\n") else: first_write = False f.write(names[sorted_seqs[i-1][1]] + " ")
#! /usr/bin/env python
# Print the mean p-distance over all unordered pairs of sequences in a
# FASTA alignment.
# Usage: <script> <alignment_fasta>
from sequence_lib import read_fasta, p_distance
from sys import argv

seq_file = argv[1]
names, aln = read_fasta(seq_file)

L = len(aln)
# With fewer than two sequences there are no pairs and the denominator
# L * (L - 1) is zero; fail with a clear message instead of a traceback.
if L < 2:
    raise SystemExit("need at least two sequences to average pairwise distances: " + seq_file)

d = 0
for i in range(L):
    a1 = aln[i]
    # Index the partner directly rather than copying aln[i + 1:] per row.
    for j in range(i + 1, L):
        d += p_distance(a1, aln[j])

# There are L * (L - 1) / 2 pairs; 2.0 forces true division on Python 2.
print(2.0 * d / (L * (L - 1)))
#! /usr/bin/env python
# Copy a FASTA file, dropping every third site (0-based positions 2, 5,
# 8, ...) from each sequence.
# Usage: <script> <input_fasta> <output_fasta>
from sequence_lib import read_fasta, write_fasta
from sys import argv

infile = argv[1]
outfile = argv[2]

taxa, seqs = read_fasta(infile)

# Keep a site only when its position is not the last of a group of three.
new_seqs = [
    "".join([site for pos, site in enumerate(seq) if pos % 3 != 2])
    for seq in seqs
]

write_fasta(outfile, taxa, new_seqs)
#! /usr/bin/env python
# Usage: python mask_aln.py <file_in> <path_out> <mask_levels>
#
# For each mask level, write a copy of the alignment that keeps only the
# columns whose gap count is at most taxon_count * (1 - level). Output goes
# to <path_out>/<base>_msk<level><ext>; pass "-" as <path_out> to write
# next to the input file.
from sys import argv
import os.path
from sequence_lib import count_gaps, read_fasta, write_fasta

file_in = argv[1]
path_in, file_name = os.path.split(file_in)
path_out = path_in if (argv[2] == '-') else argv[2]
base_name, ext = os.path.splitext(file_name)

taxon_names, seq_aln = read_fasta(file_in)
# presumably one gap count per alignment column; confirm against count_gaps
gap_count = count_gaps(seq_aln)
N = len(gap_count)
taxon_count = len(taxon_names)

for msk_lev in argv[3:]:
    gap_limit = taxon_count * (1 - float(msk_lev))
    chosen_cols = [i for i in range(N) if gap_count[i] <= gap_limit]

    # Build each masked row in one join rather than per-character "+=".
    msk_aln = ["".join(seq_aln[i][j] for j in chosen_cols)
               for i in range(taxon_count)]

    # os.path.join copes with an empty directory part: a bare input name
    # with "-" gives a relative path, where "" + "/" + name pointed at the
    # filesystem root.
    output_file = os.path.join(path_out, base_name + "_msk" + str(msk_lev) + ext)
    write_fasta(output_file, taxon_names, msk_aln)
#! /usr/bin/env python
# Write a copy of an alignment restricted to a random subset of its sites.
# <nsites> distinct column indices are drawn without replacement from the
# length of the first sequence and kept in ascending order.
# Usage: <script> <input_fasta> <output_fasta> <nsites>
from sys import argv
from sequence_lib import read_fasta, write_fasta
from random import sample

inputfile = argv[1]
outputfile = argv[2]
nsites = int(argv[3])

seq_names, seq_aln = read_fasta(inputfile)

# Sorting keeps the sampled columns in their original left-to-right order.
sites = sorted(sample(range(len(seq_aln[0])), nsites))

# Apply the same column selection to every sequence.
new_aln = ["".join([row[col] for col in sites]) for row in seq_aln]

write_fasta(outputfile, seq_names, new_aln)
mapping, idx, remove_gaps=True): for i, taxon in enumerate(taxon_names): seq_len = len([x for x in sequences[i] if x != '-']) if remove_gaps else len(sequences[i]) if taxon not in mapping: mapping[taxon] = [(idx, seq_len)] else: mapping[taxon].append((idx, seq_len)) mapping = {} for idx, filename in enumerate(argv[1:]): taxon_names, sequences = read_fasta(filename) report_sequence_length(taxon_names, sequences, mapping, idx) max_idx = len(argv) - 1 for taxon in mapping: string = taxon arr = mapping[taxon] i = 0 for idx, seq_len in arr: while i < idx: string += " 0" i += 1 string += (" " + str(seq_len)) i += 1 while i < max_idx: