def annotate_ref(msa_fobj, msa_informat, outfile_fobj, split, outformat):
    msa_data = AlignIO.read(msa_fobj, msa_informat)
    outfile_fobj2 = open(outfile_fobj.name + '.2', 'w')
    align1 = msa_data[:split]
    align2 = msa_data[split:]
    AlignIO.write(AlignIO.MultipleSeqAlignment(align1), outfile_fobj, outformat)
    AlignIO.write(AlignIO.MultipleSeqAlignment(align2), outfile_fobj2, outformat)
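# Usage sketch (illustrative; the file names, the split index, and the clustal
# format are assumptions, not from the source). Writes the first two records to
# the given handle and the remainder to '<outfile>.2':
with open('input.aln') as msa_fh, open('first_part.aln', 'w') as out_fh:
    annotate_ref(msa_fh, 'clustal', out_fh, 2, 'clustal')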
def simulate(q, lock):
    while True:
        sim_no, e = q.get()
        if e is None:
            break
        e(seqfile=None, ratefile=None, infofile=None)
        seq_dict = e.get_sequences()
        seq_tuple = sorted(seq_dict.items())
        # generic_dna requires Biopython < 1.78 (Bio.Alphabet was removed later)
        align = AlignIO.MultipleSeqAlignment(
            SeqRecord(Seq(seqstr, generic_dna), id=tax, description='')
            for (tax, seqstr) in seq_tuple)
        out_align = os.path.join(
            out_dir, '%(pre)s_%(sim)d.fasta' % {"pre": args.prefix, "sim": sim_no})
        AlignIO.write(align, out_align, "fasta")
        lock.acquire()
        try:
            print("Process %d reporting: wrote alignment %d" % (os.getpid(), sim_no))
        finally:
            lock.release()
        q.task_done()
def split_alignment(alignment_file, proteins):
    '''
    in: path to a fasta-format alignment file, [(gene, (start, end)), ...]
    out: separate .phyx-format (relaxed phylip) alignment files for each gene,
         based on the provided coordinates
    NB: drops any sequence whose segment has less than 70% non-gap sites
    '''
    align = AlignIO.read(open(alignment_file, 'r'), 'fasta')
    # name output like alignmentfilename_protein.phyx
    ofile_stem = alignment_file.split('/')[-1].split('.')[0]
    ofile_list = []
    for protein, (start, end) in proteins:
        ofile_name = ofile_stem + '_%s.phyx' % (protein)
        ofile_list.append(ofile_name)
        start, end = start - 1, end - 1  # adjust for pythonic coordinates
        # [all rows, start column:end column]; end + 1 for inclusive slicing
        align_segment = align[:, start:end + 1]
        filtered_align_segment = AlignIO.MultipleSeqAlignment([])
        for seq in align_segment:
            # Seq objects are immutable; replace the whole Seq rather than .data
            seq.seq = Seq(str(seq.seq).replace('n', '-').replace('N', '-'))
            # require at least 70% of sites to be non-gap to include the
            # sequence in the segment alignment
            if float(str(seq.seq).count('-')) / float(len(str(seq.seq))) <= 0.30:
                filtered_align_segment.append(seq)
        AlignIO.write(filtered_align_segment, ofile_name, 'phylip-relaxed')
    return ofile_list
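# Usage sketch (illustrative; the alignment path and the 1-based gene
# coordinates are assumptions, not from the source):
segment_files = split_alignment('data/genome_alignment.fasta',
                                [('HA', (1, 1701)), ('NA', (1, 1410))])
print(segment_files)  # e.g. ['genome_alignment_HA.phyx', 'genome_alignment_NA.phyx']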
def reduce_msa_to_seqs_by_name(msa, keep_names_lst):
    new_msa = []
    all_names = [rec.id for rec in list(msa)]
    for name in keep_names_lst:
        new_msa.append(msa[all_names.index(name), :])
    # remove positions that are just gaps after removal of sequences
    new_msa = remove_nonACTGU_sites(AlignIO.MultipleSeqAlignment(new_msa))
    return new_msa
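# Usage sketch (illustrative; 'example.aln' and the ids are assumptions, and
# remove_nonACTGU_sites is assumed to be defined elsewhere in this codebase):
msa = AlignIO.read('example.aln', 'clustal')
reduced = reduce_msa_to_seqs_by_name(msa, ['seq1', 'seq3'])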
def run_muscle(fasta_file, out_file=None, muscle_params='', reorder=True):
    """
    Beware: muscle does not keep the sequence order, and its --stable switch
    is broken.

    :param fasta_file: path to the input fasta file
    :param out_file: optional path for the clustal output
    :param muscle_params: extra command-line parameters for muscle
    :param reorder: restore the input sequence order in the output
    :return: path to the output alignment file
    """
    ml.info('Running muscle.')
    ml.debug(fname())
    if out_file:
        cl_file = out_file
    else:
        cl_fd, cl_file = mkstemp(prefix='rba_', suffix='_07', dir=CONFIG.tmpdir)
        os.close(cl_fd)
    cmd = [
        '{}muscle'.format(CONFIG.muscle_path),
        '-clwstrict',
        '-seqtype', 'rna',
        '-out', cl_file,
        '-in', fasta_file,
        '-quiet'
    ]
    if muscle_params != '':
        # extend argv token by token; joining the quoted tokens into a single
        # argument would hand muscle one malformed parameter
        cmd += shlex.split(muscle_params)
    ml.debug(cmd)
    with TemporaryFile(mode='w+', encoding='utf-8') as tmp:
        r = call(cmd, stdout=tmp, stderr=tmp)
        if r:
            msgfail = 'Call to muscle failed.'
            ml.error(msgfail)
            tmp.seek(0)
            raise exceptions.MuscleException(msgfail, tmp.read())
    if reorder:
        # reorder sequences according to the input file
        with open(fasta_file, 'r') as ff, open(cl_file, 'r+') as oo:
            orig_seqs = [i.id for i in SeqIO.parse(ff, format='fasta')]
            muscle_align = {i.id: i for i in AlignIO.read(oo, format='clustal')}
            reo_alig = []
            for s_name in orig_seqs:
                # muscle truncates names to 32 characters
                reo_alig.append(muscle_align[s_name[:32]])
            alig = AlignIO.MultipleSeqAlignment(reo_alig)
            oo.seek(0)
            AlignIO.write(alig, oo, format='clustal')
            oo.truncate()
    return cl_file
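# Usage sketch (illustrative; the fasta path and the extra parameter are
# assumptions, and CONFIG must point at a muscle binary):
aligned_path = run_muscle('unaligned.fasta', muscle_params='-maxiters 2')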
def test_first_sequence_in_is_first_sequence_out(self):
    alignment = AlignIO.MultipleSeqAlignment([
        SeqRecord(Seq("TTTT")),
        SeqRecord(Seq("AAAA")),
        SeqRecord(Seq("CC-C")),
    ])
    result = get_interval_seqs(alignment)
    expected = ["TTTT", "AAAA", "CCC"]
    self.assertEqual(expected, result)
def catAln(alns):
    # Concatenate a list of alignments (same rows in the same order) column-wise.
    # alphabet handling requires Biopython < 1.78
    alphabet = alns[0][0, :].seq.alphabet
    catSeqs = list()
    for i in range(len(alns[0])):
        catSeq = list()
        for j in range(len(alns)):
            catSeq.append(str(alns[j][i].seq))
        catSeqs.append(''.join(catSeq))
    result = AlignIO.MultipleSeqAlignment(
        SeqIO.SeqRecord(Seq(catSeqs[x], alphabet=alphabet), id=alns[0][x, :].id)
        for x in range(0, len(catSeqs)))
    return result
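# Usage sketch (illustrative; the file names are assumptions). Both alignments
# must contain the same sequences in the same order:
aln_a = AlignIO.read('gene_a.fasta', 'fasta')
aln_b = AlignIO.read('gene_b.fasta', 'fasta')
combined = catAln([aln_a, aln_b])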
def _try_rescue(profile_file):
    # Beware: AlignIO truncates sequence names so they can become non-unique,
    # and then clustalo also fails.
    ml.warning(
        'Trying rescue for profile alignment: if the profile has no gaps, the '
        'sequences appear not to be aligned. Appending a trailing gap to '
        'overcome the issue.')
    a = AlignIO.read(profile_file, format='clustal')
    s = [SeqRecord(Seq(str(i.seq) + '-'), id=i.id) for i in a]
    fa = AlignIO.MultipleSeqAlignment(s)
    fd, temp = mkstemp(prefix='rba_', suffix='_56', dir=CONFIG.tmpdir)
    with os.fdopen(fd, 'w') as fh:
        AlignIO.write(fa, fh, format='fasta')
    return temp
def run_raf(raf_params):
    (rna_file_path, raf_output_file_path, raf_output_file_path_2) = raf_params
    raf_command = "raf predict " + rna_file_path
    (output, _, _) = utils.run_command(raf_command)
    with open(raf_output_file_path, "w+") as raf_output_file:
        raf_output_file.write(output.decode())
    # the last record in RAF's fasta output holds the consensus secondary
    # structure; move it into a column annotation instead
    sta = AlignIO.read(raf_output_file_path, "fasta")
    recs = sta[:-1]
    new_sta = AlignIO.MultipleSeqAlignment(recs)
    new_sta.column_annotations["secondary_structure"] = str(sta[-1].seq)
    AlignIO.write(new_sta, raf_output_file_path, "stockholm")
    AlignIO.write(new_sta, raf_output_file_path_2, "clustal")
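# Usage sketch (illustrative; the paths are assumptions, and utils.run_command
# plus a 'raf' binary on PATH are assumed from the surrounding codebase).
# Writes stockholm to the first output path and clustal to the second:
run_raf(('input.fa', 'raf_out.sto', 'raf_out.aln'))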
def trimaln(aln, target_ids, gaps=0.9):
    # Read the alignment into a biopython structure
    aln = AlignIO.read(StringIO(aln), 'fasta')
    dfaln = pd.DataFrame(aln)
    dfaln.index = [x.id for x in aln]
    # keep only the targets
    aln = AlignIO.MultipleSeqAlignment([x for x in aln if x.id in target_ids])
    nseqs = len(aln)
    # drop columns that are gaps in at least `gaps` of the target sequences
    c = pd.DataFrame(aln).apply(lambda x: sum(x == '-') / nseqs, axis=0)
    dfaln = dfaln.loc[:, c[c < gaps].index]
    # drop rows (sequences) that are mostly gaps, then deduplicate
    e = dfaln.apply(lambda x: sum(x == '-') / dfaln.shape[1], axis=1)
    dfaln = dfaln[e < gaps].drop_duplicates()
    l = ['>%s\n%s\n' % (x[0], ''.join(x[1]).upper().strip())
         for x in dfaln.iterrows()]
    return '\n'.join(l).replace('\n\n', '\n')
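# Usage sketch (illustrative; the path and ids are assumptions). Note that
# trimaln takes the alignment as a fasta-format string, not a path:
aln_str = open('aligned.fasta').read()
trimmed_fasta = trimaln(aln_str, target_ids={'seq1', 'seq2'}, gaps=0.9)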
def format_concatenated_alignment():
    logger = logging.getLogger(__name__)
    strain_names_map = build_strain_names_map()
    tree_alignment = AlignIO.read(
        open(os.path.join(DATA_DIR, "all_alignments"), "r"), FASTA_FILE_TYPE)
    tree_alignment_filtered = AlignIO.MultipleSeqAlignment([])
    for strain_id, strain in zip(range(STRAINS_COUNT), tree_alignment):
        if all(c == '-' for c in strain.seq):
            logger.info("skipping filtered strain %d" % strain_id)
        else:
            logger.info("adding id to strain %d" % strain_id)
            strain.id = "[" + str(strain_id) + "]" + strain_names_map[strain_id]
            strain.description = ''
            tree_alignment_filtered.append(strain)
    AlignIO.write(
        tree_alignment_filtered,
        open(os.path.join(DATA_DIR, "filtered_tree_alignment"), "w"),
        FASTA_FILE_TYPE)
def visual_check(self):
    while True:
        vca = VisualAlleleCheck(suptitle=self.subdir.name, **self.__dict__)
        if self.seq_discarded:
            while self.seq_discarded:
                vca.selected.append(self.seq_discarded.pop())
            vca.change_rect_color()
        vca.show()
        if len(vca.selected) > 0:
            ds = ", ".join([self.seqdat[i].id for i in sorted(vca.selected)])
            msg = "Would you like to discard {} sequence{} ({})?".format(
                len(vca.selected), "s" if len(vca.selected) > 1 else "", ds)
            if ask_user(msg, default="y", quit=True):
                self.seqdat = [
                    self.seqdat[i] for i in range(len(self.seqdat))
                    if i not in vca.selected
                ]
                self.align = AlignIO.MultipleSeqAlignment([
                    self.align[i] for i in range(len(self.align))
                    if i not in vca.selected
                ])
                # remove gaps
                self.align = remove_gap_pos(self.align)
                self.ssr_regions, self.motifs = find_variable_ssrs(
                    self.align, **self.kwargs)
                # add rep_data to seqdat
                self.add_rep_data()
                print("Reconstructing phylogeny ...")
                if len(self.align) > 1:
                    self.tree = construct_tree(self.align, self.ssr_regions,
                                               self.motifs)
                else:
                    self.tree = None
        else:
            msg = "Keep all sequences and write results?"
            if ask_user(msg, default="y", quit=True):
                break
def cut_alignment(input_fasta: str, output_file: str, begin: int, end: int) -> None:
    with open(input_fasta) as handle:
        alignment = AlignIO.read(handle, "fasta")
    length = alignment.get_alignment_length()
    new_length = end - begin
    if new_length < 0:
        raise RuntimeError("End position must be higher than the begin position")
    if begin + new_length > length:
        raise RuntimeError("Alignment is too short")
    new_records = []
    for record in alignment:
        new_records.append(record[begin:end])
    new_alignment = AlignIO.MultipleSeqAlignment(new_records)
    AlignIO.write(new_alignment, output_file, "fasta")
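# Usage sketch (illustrative; the paths are assumptions). Coordinates are
# 0-based and half-open, so this keeps alignment columns 100..199:
cut_alignment('full.fasta', 'region.fasta', begin=100, end=200)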
def annotate_ref(msa_fobj, map_fobj, msa_informat, outfile_fobj, msa_outformat):
    # map file: one "<seq_id>\t<genus> <species>" entry per line
    map_data = dict(
        line.split("\t") for line in map_fobj.read().splitlines() if line)
    msa_data = AlignIO.read(msa_fobj, msa_informat)
    annotated_align = []
    for align in msa_data:
        seq_id = align.id
        try:
            genus, species = map_data[align.id].split(" ", 1)
            # annotation = ''.join([genus[0] + '.', species, "[{}]".format(seq_id)])
            # annotation = "{}[{}]".format(map_data[align.id], seq_id)
            annotation = "{} {}".format(genus, species)
            align.id = annotation
        except KeyError:
            pass
        annotated_align.append(align)
    AlignIO.write(AlignIO.MultipleSeqAlignment(annotated_align), outfile_fobj,
                  msa_outformat)
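# Usage sketch (illustrative; the file names are assumptions). The map file
# holds tab-separated "<seq_id>\t<genus> <species>" entries:
with open('msa.fasta') as msa_fh, open('names.tsv') as map_fh, \
        open('annotated.fasta', 'w') as out_fh:
    annotate_ref(msa_fh, map_fh, 'fasta', out_fh, 'fasta')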
def main(commandline_args):
    comm_args = create_and_parse_argument_options(commandline_args)
    alignment_file = read_align(comm_args.alignment_path)
    sliced_alignments = slice_by_name(alignment_file)
    first_aln = sorted(list(sliced_alignments.keys()))[0]
    slided_scores = {}  # sliding increment -> {alignment index: score}
    for i in range(0, sliced_alignments[first_aln].get_alignment_length(),
                   comm_args.window):
        second_aln = AlignIO.MultipleSeqAlignment([])
        for record in sliced_alignments[sorted(list(sliced_alignments.keys()))[1]]:
            second_aln.append(record)
        # reorder the first alignment group by rotating it by the current
        # window offset
        reordered_aln = sliced_alignments[first_aln][:, -(
            sliced_alignments[first_aln].get_alignment_length() - i):] + \
            sliced_alignments[first_aln][:, :i]
        for record in reordered_aln:
            second_aln.append(record)
        alnindex_score, gapped_sliced_alns, number_of_aligned_positions, gp_mapping = TwinCons.main(
            ['-as', format(second_aln, "fasta"), '-r', '-mx', 'blosum62'])
        out_dict = {}
        for x in alnindex_score.keys():
            out_dict[x] = alnindex_score[x][0]
        slided_scores[i] = out_dict
    for increment in slided_scores:
        print("Increment is " + str(increment))
        alnindex = sorted(slided_scores[increment].keys())
        posdata, negdata = uninterrupted_stretches(alnindex,
                                                   slided_scores[increment],
                                                   comm_args)
        for x in sorted(posdata.keys()):
            print(x, posdata[x])
unknown = False  # whether the alignment contains a sequence with unknown amino acids
proteins = []  # proteins in the filtered alignment
species = set()  # species in the filtered alignment
records = []  # records for the filtered alignment (for later re-alignment)

# Iterate through records
for i, record in enumerate(MSA):
    if 'X' in record.seq.upper():
        unknown = True
    else:
        proteins.append(record.name)
        species.add(record.name.split('.')[0])
        records.append(record)

# Re-assign MSA to remove sequences with unknown amino acids
MSA = AlignIO.MultipleSeqAlignment(records)

# Re-align if the unknown flag is True and more than one sequence is present
if unknown and len(MSA) > 1:
    args = [
        'xvfb-run', '/home/singlemd/miniconda3/envs/ete3/bin/ete3', 'build',
        '-w', 'eggnog41', '-a', f'out/raw/{alignment_id}/unaligned.fa', '-o',
        f'out/raw/{alignment_id}', '--dealign', '--cpu', '4'
    ]
    path_aligned = f'out/raw/{alignment_id}/metaligner_trimmed-trimal01-prottest_default-phyml_default/unaligned.fa.final_tree.fa'
    # Create directory to store ete3 input and output
def test_ambiguous_bases_one_seq_with_repeated_base(self):
    alignment = AlignIO.MultipleSeqAlignment([SeqRecord(Seq("RRAAT"))])
    result = get_interval_seqs(alignment)
    # R is A or G, so the two leading positions expand to four sequences
    expected = {"GAAAT", "AAAAT", "GGAAT", "AGAAT"}
    self.assertEqual(set(result), expected)
def main(args):
    # Load metadata and generate a strain -> gisaid id dictionary.
    md = pd.read_csv(args.metadata, sep="\t")
    print("Entries in metadata: {}".format(len(md)))
    md_gisaid_list = md["gisaid_epi_isl"].to_list()
    gisaid_dict = md.loc[~md["gisaid_epi_isl"].isna(),
                         ["strain", "gisaid_epi_isl"]]
    gisaid_dict = gisaid_dict.set_index("gisaid_epi_isl")
    gisaid_dict = gisaid_dict["strain"].to_dict()

    # Load alignment
    alignment = AlignIO.read(args.alignment, "fasta")
    print("Sequences in alignment: {}".format(len(alignment)))
    alignment_list = [i.name for i in alignment]

    # Load tree
    tree = Tree.get(path=args.tree, schema="newick")
    print("Leaves in tree: {}".format(len(tree.taxon_namespace)))

    # Determine leaves whose names cannot be assigned.
    tree_leaves = [i.label for i in tree.taxon_namespace]
    tree_leaves = [i.replace(" ", "_") for i in tree_leaves]
    leaf_missing_md = np.setdiff1d(tree_leaves, md_gisaid_list)

    # Remove the leaves identified
    print("Leaves in tree but not in metadata: {}".format(len(leaf_missing_md)))
    tree = tree.extract_tree_without_taxa_labels(
        [i.replace("_", " ") for i in leaf_missing_md])
    tree.purge_taxon_namespace()
    print("Leaves in tree after pruning: {}".format(len(tree.taxon_namespace)))

    # Rename leaves to match metadata and alignment
    print("Renaming leaves to match metadata and alignment... ", end="")
    leaves = list()
    for i in tree.taxon_namespace:
        try:
            i.label = gisaid_dict[i.label.replace(" ", "_")]
        except KeyError:
            pass
        leaves.append(i.label)
    print("Done")

    # Remove leaves that aren't in the alignment
    leaf_missing_align = np.setdiff1d(leaves, alignment_list)
    print("Leaves in tree but not in alignment: {}".format(
        len(leaf_missing_align)))
    tree = tree.extract_tree_without_taxa_labels(leaf_missing_align)
    tree.purge_taxon_namespace()
    print("Leaves in tree after pruning: {}".format(len(tree.taxon_namespace)))

    # Update tree_leaves list
    tree_leaves = [i.label for i in tree.taxon_namespace]

    # Filter alignment to tips in the tree
    tree_alignment = list()
    for i in alignment:
        if i.name in tree_leaves:
            tree_alignment.append(i)
    tree_alignment = AlignIO.MultipleSeqAlignment(tree_alignment)

    # Filter metadata to tips in the tree
    tree_md = md.loc[md["strain"].isin([i.name for i in tree_alignment])]

    # Filter metadata and alignment to the query
    query_md = md.loc[md["interest"] == "interest"]
    interests = query_md["strain"].to_list()
    query_alignment = [i for i in alignment if i.name in interests]
    query_alignment = AlignIO.MultipleSeqAlignment(query_alignment)

    # Write files to disk
    tree.write(path=os.path.join(args.outdir, "global.tree"), schema="newick")
    AlignIO.write(tree_alignment, os.path.join(args.outdir, "alignment.fasta"),
                  "fasta")
    tree_md.to_csv(os.path.join(args.outdir, "metadata.csv"), index=False)
    AlignIO.write(query_alignment, os.path.join(args.outdir, "query.fasta"),
                  "fasta")
    query_md.to_csv(os.path.join(args.outdir, "query.csv"), index=False)
parser.add_argument('-o', '--outfile', type=argparse.FileType('w'),
                    help="MSA output file", required=True)
parser.add_argument('-v', '--outformat', default="clustal",
                    help="MSA output format")
parser.add_argument('-l', '--list', type=argparse.FileType('r'), required=True)
args = parser.parse_args()

sel_seqs = args.list.read().split()
msa_data = AlignIO.read(args.msa_file, args.informat)
sel_align = []
found_align = []
for align in msa_data:
    if align.id in sel_seqs:
        sel_align.append(align)
        found_align.append(align.id)
AlignIO.write(AlignIO.MultipleSeqAlignment(sel_align), args.outfile,
              args.outformat)
# report requested ids that were not present in the alignment
for seq_id in sel_seqs:
    if seq_id not in found_align:
        sys.stderr.write("Not found:\t{}\n".format(seq_id))
    i = np.random.choice(range(len(current_species)))
    a = current_species[i]
    a.gene_duplication()
    # event of speciation in which two new species diverge from the previous one
    a.speciation()
    # update the "leaves" in the species tree
    current_species = [x for x in sp_tree.nodes() if sp_tree.out_degree(x) == 0]
    # update the "leaves" in the sequences tree
    leaves = [x for x in seq_tree.nodes() if seq_tree.out_degree(x) == 0]

if len(orthologs) < args.n_ort:
    print('Warning: too few sequences to have %d ortholog groups!' % args.n_ort)

###====================================================================================================
colection = AlignIO.MultipleSeqAlignment([
    SeqRecord(Seq(seq.sequence), id=str(seq)) for seq in sequences.colection
])
AlignIO.write(colection,
              open('%s_all_sequences.%s' % (args.out, args.msa_format), 'w'),
              args.msa_format)
alignment = AlignIO.MultipleSeqAlignment(build_MSA(seq_tree, first_seq))
AlignIO.write(alignment,
              open('%s_current_sequences.%s' % (args.out, args.msa_format), 'w'),
              args.msa_format)
tree = Phylo.BaseTree.Tree(root=build_tree(seq_tree, first_seq), rooted=True)
Phylo.write(tree, '%s_gene_tree.%s' % (args.out, args.tree_format),
            args.tree_format)
cladogram = Phylo.BaseTree.Tree(root=build_tree(sp_tree, first_sp,
def divergence(fastain, patient_id, cutoff):
    # fasta = open('%s' % filename, 'r')
    split_fasta = split(fastain, 1)
    seqs_by_timepoint = split_fasta[0]
    total_seq = split_fasta[1]
    # conseq = consensus.seq[(sites_pos[0]-1):(sites_pos[1]-1)]
    # conseq = Seq(str(consensus).replace('-', 'N'))
    # seq_length = len(consensus)

    # per-timepoint summary statistics
    mean_divergence, median_divergence, divergence_std = [], [], []
    lower_divergence_25, upper_divergence_75 = [], []
    lower_divergence_5, upper_divergence_95 = [], []
    mean_N_divergence, median_N_divergence, N_divergence_std = [], [], []
    lower_N_divergence_25, upper_N_divergence_75 = [], []
    lower_N_divergence_5, upper_N_divergence_95 = [], []
    mean_S_divergence, median_S_divergence, S_divergence_std = [], [], []
    lower_S_divergence_25, upper_S_divergence_75 = [], []
    lower_S_divergence_5, upper_S_divergence_95 = [], []
    dN, dN_med, dN_std = [], [], []
    dN_lower_25, dN_upper_75, dN_lower_5, dN_upper_95 = [], [], [], []
    dS, dS_med, dS_std = [], [], []
    dS_lower_25, dS_upper_75, dS_lower_5, dS_upper_95 = [], [], [], []

    patient = []
    # parts = str.split(fastain, "/")
    # parts2 = str.split(parts[len(parts)-1], "_")
    patient.append(patient_id)
    nonsyn_sites, syn_sites = number_of_N_and_S_sites(fastain, None)
    sorted_timepoints = sorted(seqs_by_timepoint.keys(), key=natural_keys)
    print(sorted_timepoints)
    first_timepoint = AlignIO.MultipleSeqAlignment(
        seqs_by_timepoint[sorted_timepoints[0]])
    consensus = AlignInfo.SummaryInfo(first_timepoint).dumb_consensus(
        threshold=0.01).upper()
    conseq = Seq(str(consensus).replace('X', 'N'))
    if "gag" in fastain:
        prot = "gag"
    else:
        prot = "gp41"
    sampleTimes = [float(t) for t in sorted_timepoints]

    for t in range(0, len(sorted_timepoints)):
        divergence = []
        divergence_N = []
        divergence_S = []
        divergence_dN = []
        divergence_dS = []
        seqs_at_t = seqs_by_timepoint[sorted_timepoints[t]]
        seq_length = len(seqs_at_t[0].seq)
        seq_freq = get_seq_freq(seqs_at_t)
        seqs_at_t_array = np.asarray(seqs_at_t)
        # derived frequency is calculated with respect to the consensus,
        # not as the minor frequency per site
        full_der_freq = []
        total_site_freq = []
        for i in range(seq_length):
            site_a = seqs_at_t_array[:, i]
            anc_freq = 0
            der_freq = 0
            for j in range(0, len(seq_freq)):
                if site_a[j] != '-':
                    if conseq[i].lower() == site_a[j]:
                        anc_freq += seq_freq[j]
                    else:
                        der_freq += seq_freq[j]
            total_seq = sum([der_freq, anc_freq])
            full_der_freq.append(der_freq)
            total_site_freq.append(total_seq)
        for i in range(seq_length):
            diff = 0
            diff_N = 0
            diff_S = 0
            count = total_site_freq[i]
            count1 = 0
            if full_der_freq[i] > cutoff * total_seq:
                for each in seqs_at_t:
                    parts = str.split(each.name, "_")
                    freq = int(parts[2].strip())
                    seq = Seq(str(each.seq).upper().replace('-', 'N'))
                    if str(conseq[i]) != "N" and str(seq[i]) != "N":
                        count1 += freq
                        if conseq[i] != seq[i]:
                            # find the codon containing position i
                            if i % 3 == 0:
                                codon = [i, i + 1, i + 2]
                            elif i % 3 == 1:
                                codon = [i - 1, i, i + 1]
                            else:
                                codon = [i - 2, i - 1, i]
                            consensus_aa = conseq[codon[0]:(codon[2] + 1)].translate()
                            current_aa = seq[codon[0]:(codon[2] + 1)].translate()
                            if 'X' in conseq[codon[0]:(codon[2] + 1)]:
                                break
                            # nonsynonymous if the translated codons differ
                            if str(consensus_aa) != str(current_aa):
                                diff_N += freq
                            else:
                                diff_S += freq
                            diff += freq
                print(count, count1, i, diff, diff_N, diff_S)
            if count > 0:
                divergence.extend([float(diff) / float(count)])
                divergence_N.extend([float(diff_N) / float(count)])
                divergence_S.extend([float(diff_S) / float(count)])
                divergence_dN.extend(
                    [float(diff_N) / float(nonsyn_sites) / float(count)])
                divergence_dS.extend(
                    [float(diff_S) / float(syn_sites) / float(count)])
        if len(divergence) > 1:
            mean_divergence.append(np.mean(divergence))
            median_divergence.append(np.percentile(divergence, 50))
            lower_divergence_25.append(np.percentile(divergence, 25))
            upper_divergence_75.append(np.percentile(divergence, 75))
            lower_divergence_5.append(np.percentile(divergence, 5))
            upper_divergence_95.append(np.percentile(divergence, 95))
            divergence_std.append(np.std(divergence))
            mean_N_divergence.append(np.mean(divergence_N))
            median_N_divergence.append(np.percentile(divergence_N, 50))
            lower_N_divergence_25.append(np.percentile(divergence_N, 25))
            upper_N_divergence_75.append(np.percentile(divergence_N, 75))
            lower_N_divergence_5.append(np.percentile(divergence_N, 5))
            upper_N_divergence_95.append(np.percentile(divergence_N, 95))
            N_divergence_std.append(np.std(divergence_N))
            mean_S_divergence.append(np.mean(divergence_S))
            median_S_divergence.append(np.percentile(divergence_S, 50))
            lower_S_divergence_25.append(np.percentile(divergence_S, 25))
            upper_S_divergence_75.append(np.percentile(divergence_S, 75))
            lower_S_divergence_5.append(np.percentile(divergence_S, 5))
            upper_S_divergence_95.append(np.percentile(divergence_S, 95))
            S_divergence_std.append(np.std(divergence_S))
            dN.append(np.mean(divergence_dN))
            dN_med.append(np.percentile(divergence_dN, 50))
            dN_lower_25.append(np.percentile(divergence_dN, 25))
            dN_upper_75.append(np.percentile(divergence_dN, 75))
            dN_lower_5.append(np.percentile(divergence_dN, 5))
            dN_upper_95.append(np.percentile(divergence_dN, 95))
            dN_std.append(np.std(divergence_dN))
            dS.append(np.mean(divergence_dS))
            dS_med.append(np.percentile(divergence_dS, 50))
            dS_lower_25.append(np.percentile(divergence_dS, 25))
            dS_upper_75.append(np.percentile(divergence_dS, 75))
            dS_lower_5.append(np.percentile(divergence_dS, 5))
            dS_upper_95.append(np.percentile(divergence_dS, 95))
            dS_std.append(np.std(divergence_dS))
            if "gag" in fastain:
                csvfile_gag_b.write(','.join([
                    patient_id, str(sorted_timepoints[t]),
                    str(np.mean(divergence)),
                    str(np.percentile(divergence, 50)),
                    str(np.percentile(divergence, 5)),
                    str(np.percentile(divergence, 95)),
                    str(np.mean(divergence_N)),
                    str(np.percentile(divergence_N, 50)),
                    str(np.percentile(divergence_N, 5)),
                    str(np.percentile(divergence_N, 95)),
                    str(np.mean(divergence_S)),
                    str(np.percentile(divergence_S, 50)),
                    str(np.percentile(divergence_S, 5)),
                    str(np.percentile(divergence_S, 95))
                ]) + "\n")
                csvfile_gag_b.flush()
            elif "gp41" in fastain:
                csvfile_gp41_b.write(','.join([
                    patient_id, str(sorted_timepoints[t]),
                    str(np.mean(divergence)),
                    str(np.percentile(divergence, 50)),
                    str(np.percentile(divergence, 5)),
                    str(np.percentile(divergence, 95)),
                    str(np.mean(divergence_N)),
                    str(np.percentile(divergence_N, 50)),
                    str(np.percentile(divergence_N, 5)),
                    str(np.percentile(divergence_N, 95)),
                    str(np.mean(divergence_S)),
                    str(np.percentile(divergence_S, 50)),
                    str(np.percentile(divergence_S, 5)),
                    str(np.percentile(divergence_S, 95))
                ]) + "\n")
        else:
            print("xxx", patient_id, sorted_timepoints[t])
        print(patient_id, sorted_timepoints[t], len(divergence))
def divergence(fastain, translate, date_part, patient_id, sites):
    seqs_by_timepoint = split(fastain, date_part)

    # per-timepoint summary statistics
    mean_divergence, median_divergence, divergence_std = [], [], []
    lower_divergence_25, upper_divergence_75 = [], []
    lower_divergence_5, upper_divergence_95 = [], []
    mean_N_divergence, median_N_divergence, N_divergence_std = [], [], []
    lower_N_divergence_25, upper_N_divergence_75 = [], []
    lower_N_divergence_5, upper_N_divergence_95 = [], []
    mean_S_divergence, median_S_divergence, S_divergence_std = [], [], []
    lower_S_divergence_25, upper_S_divergence_75 = [], []
    lower_S_divergence_5, upper_S_divergence_95 = [], []
    dN, dN_med, dN_std = [], [], []
    dN_lower_25, dN_upper_75, dN_lower_5, dN_upper_95 = [], [], [], []
    dS, dS_med, dS_std = [], [], []
    dS_lower_25, dS_upper_75, dS_lower_5, dS_upper_95 = [], [], [], []

    patient = []
    # parts = str.split(fastain, "/")
    # parts2 = str.split(parts[len(parts)-1], "_")
    patient.append(patient_id)
    nonsyn_sites, syn_sites = number_of_N_and_S_sites(fastain, None)
    print(nonsyn_sites, syn_sites)
    sorted_timepoints = sorted(seqs_by_timepoint.keys(), key=natural_keys)
    print(sorted_timepoints)
    first_timepoint = AlignIO.MultipleSeqAlignment(
        seqs_by_timepoint[sorted_timepoints[0]])
    consensus = AlignInfo.SummaryInfo(first_timepoint).dumb_consensus(
        threshold=0.01).upper()
    sampleTimes = [float(t) for t in sorted_timepoints]

    for t in range(0, len(sorted_timepoints)):
        divergence = []
        divergence_N = []
        divergence_S = []
        divergence_dN = []
        divergence_dS = []
        seqs_at_t = seqs_by_timepoint[sorted_timepoints[t]]
        for each in seqs_at_t:
            parts = str.split(each.name, "_")
            freq = 1
            diff = 0
            diff_N = 0
            diff_S = 0
            seq = Seq(str(each.seq).upper().replace('-', 'N'))[sites[0]:sites[1]]
            codon_pos_start = 0
            codon_pos_end = 2
            # positions of the first and last A/T/G/C in the sequence;
            # find() returns -1 for absent bases, so exclude those before min()
            starts = [p for p in (str(seq).find(b) for b in 'ATGC') if p > -1]
            start_i = min(starts) if starts else -1
            end_i = max(str(seq).rfind(b) for b in 'ATGC')
            if start_i > -1 and end_i > -1:
                # shift both ends onto codon boundaries
                remainder_1 = start_i % 3
                remainder_2 = end_i % 3
                if remainder_1 != codon_pos_start:
                    start_i = start_i + (3 - remainder_1)
                if remainder_2 != codon_pos_end:
                    end_i = end_i + (2 - remainder_2)
                seq = seq[start_i:end_i + 1]
                gaps = str(seq).count('N')
                seq_length = len(seq)
                aa_length = seq_length // 3
                conseq = Seq(str(consensus).replace('X', 'N'))[sites[0]:sites[1]]
                translated_seq = seq.translate()
                gaps_con = str(conseq).count('N')
                if gaps_con == seq_length:
                    print("all gaps in conseq")
                    break
                else:
                    count = 0
                    if translate:
                        seq = each.seq.translate()
                    for i in range(seq_length):
                        if str(conseq[i]) != "N" and str(seq[i]) != "N":
                            count = count + 1
                            if conseq[i] != seq[i]:
                                # find the codon containing position i
                                if i % 3 == 0:
                                    codon = [i, i + 1, i + 2]
                                elif i % 3 == 1:
                                    codon = [i - 1, i, i + 1]
                                else:
                                    codon = [i - 2, i - 1, i]
                                consensus_aa = conseq[codon[0]:(codon[2] + 1)].translate()
                                current_aa = seq[codon[0]:(codon[2] + 1)].translate()
                                if 'X' in conseq[codon[0]:(codon[2] + 1)]:
                                    break
                                if str(consensus_aa) != str(current_aa):
                                    diff_N += 1
                                else:
                                    diff_S += 1
                                diff += 1
                    divergence.extend([float(diff) / float(count)] * freq)
                    divergence_N.extend([float(diff_N) / float(count)] * freq)
                    divergence_S.extend([float(diff_S) / float(count)] * freq)
                    divergence_dN.extend(
                        [float(diff_N) / float(nonsyn_sites)] * freq)
                    divergence_dS.extend([float(diff_S) / float(syn_sites)] * freq)
        if len(divergence) < 100:
            # too few sequences at this timepoint: record NaNs
            mean_divergence.append(float('nan'))
            median_divergence.append(float('nan'))
            lower_divergence_25.append(float('nan'))
            upper_divergence_75.append(float('nan'))
            lower_divergence_5.append(float('nan'))
            upper_divergence_95.append(float('nan'))
            divergence_std.append(float(1000))
            mean_N_divergence.append(float('nan'))
            median_N_divergence.append(float('nan'))
            lower_N_divergence_25.append(float('nan'))
            upper_N_divergence_75.append(float('nan'))
            lower_N_divergence_5.append(float('nan'))
            upper_N_divergence_95.append(float('nan'))
            N_divergence_std.append(float(1000))
            mean_S_divergence.append(float('nan'))
            median_S_divergence.append(float('nan'))
            lower_S_divergence_25.append(float('nan'))
            upper_S_divergence_75.append(float('nan'))
            lower_S_divergence_5.append(float('nan'))
            upper_S_divergence_95.append(float('nan'))
            S_divergence_std.append(float(1000))
            dN.append(float('nan'))
            dN_med.append(float('nan'))
            dN_lower_25.append(float('nan'))
            dN_upper_75.append(float('nan'))
            dN_lower_5.append(float('nan'))
            dN_upper_95.append(float('nan'))
            dN_std.append(float('nan'))
            dS.append(float('nan'))
            dS_med.append(float('nan'))
            dS_lower_25.append(float('nan'))
            dS_upper_75.append(float('nan'))
            dS_lower_5.append(float('nan'))
            dS_upper_95.append(float('nan'))
            dS_std.append(float(1000))
        else:
            mean_divergence.append(np.mean(divergence))
            median_divergence.append(np.percentile(divergence, 50))
            lower_divergence_25.append(np.percentile(divergence, 25))
            upper_divergence_75.append(np.percentile(divergence, 75))
            lower_divergence_5.append(np.percentile(divergence, 5))
            upper_divergence_95.append(np.percentile(divergence, 95))
            divergence_std.append(np.std(divergence))
            mean_N_divergence.append(np.mean(divergence_N))
            median_N_divergence.append(np.percentile(divergence_N, 50))
            lower_N_divergence_25.append(np.percentile(divergence_N, 25))
            upper_N_divergence_75.append(np.percentile(divergence_N, 75))
            lower_N_divergence_5.append(np.percentile(divergence_N, 5))
            upper_N_divergence_95.append(np.percentile(divergence_N, 95))
            N_divergence_std.append(np.std(divergence_N))
            mean_S_divergence.append(np.mean(divergence_S))
            median_S_divergence.append(np.percentile(divergence_S, 50))
            lower_S_divergence_25.append(np.percentile(divergence_S, 25))
            upper_S_divergence_75.append(np.percentile(divergence_S, 75))
            lower_S_divergence_5.append(np.percentile(divergence_S, 5))
            upper_S_divergence_95.append(np.percentile(divergence_S, 95))
            S_divergence_std.append(np.std(divergence_S))
            dN.append(np.mean(divergence_dN))
            dN_med.append(np.percentile(divergence_dN, 50))
            dN_lower_25.append(np.percentile(divergence_dN, 25))
            dN_upper_75.append(np.percentile(divergence_dN, 75))
            dN_lower_5.append(np.percentile(divergence_dN, 5))
            dN_upper_95.append(np.percentile(divergence_dN, 95))
            dN_std.append(np.std(divergence_dN))
            dS.append(np.mean(divergence_dS))
            dS_med.append(np.percentile(divergence_dS, 50))
            dS_lower_25.append(np.percentile(divergence_dS, 25))
            dS_upper_75.append(np.percentile(divergence_dS, 75))
            dS_lower_5.append(np.percentile(divergence_dS, 5))
            dS_upper_95.append(np.percentile(divergence_dS, 95))
            dS_std.append(np.std(divergence_dS))

    # DataFrame.from_items was removed from pandas; a plain dict preserves
    # insertion order on Python 3.7+
    df = pd.DataFrame(dict([
        ('Times', sampleTimes),
        ('Divergence_median', median_divergence),
        ('Divergence_mean', mean_divergence),
        ('Divergence_N_med', median_N_divergence),
        ('Divergence_N_mean', mean_N_divergence),
        ('Divergence_S_med', median_S_divergence),
        ('Divergence_S_mean', mean_S_divergence),
        ('Divergence_lower_25', lower_divergence_25),
        ('Divergence_upper_75', upper_divergence_75),
        ('Divergence_lower_5', lower_divergence_5),
        ('Divergence_upper_95', upper_divergence_95),
        ('Divergence_std', divergence_std),
        ('Divergence_N_lower_25', lower_N_divergence_25),
        ('Divergence_N_upper_75', upper_N_divergence_75),
        ('Divergence_N_lower_5', lower_N_divergence_5),
        ('Divergence_N_upper_95', upper_N_divergence_95),
        ('Divergence_N_std', N_divergence_std),
        ('Divergence_S_lower_25', lower_S_divergence_25),
        ('Divergence_S_upper_75', upper_S_divergence_75),
        ('Divergence_S_lower_5', lower_S_divergence_5),
        ('Divergence_S_upper_95', upper_S_divergence_95),
        ('Divergence_S_std', S_divergence_std),
        ('Patients', patient * len(sampleTimes))
    ]))
    # csvfilename = filename.replace("filelist_", "divergence_results_by_birth_patristic_sites_" + str(sites_pos[0]) + "_to_" + str(sites_pos[1]) + "_")
    # csvfilename = csvfilename.replace(".txt", ".csv")
    # df.to_csv(csvfilename)
    df = df.dropna()
    if len(df) == 1:
        total_div = ['total', float('nan'), float('nan'), float('nan'),
                     float('nan'), float('nan')]
        N_div = ['N', float('nan'), float('nan'), float('nan'),
                 float('nan'), float('nan')]
        S_div = ['S', float('nan'), float('nan'), float('nan'),
                 float('nan'), float('nan')]
    return df
if __name__ == '__main__':
    # DIRS
    input_dir = '2_alignments'
    output_dir = '3_supermatrix'
    # MATCH IDS INTO SINGLE DICTIONARY
    print(part_names)
    seqdict, part_names = getSeqDict(parts, part_names)
    # CONSTRUCT SUPERMATRIX
    supermatrix, ngaps = getSupermatrix(seqdict, parts, part_names)
    print(len(supermatrix))
    print(ngaps)
    ngaps_psp = float(ngaps) / len(supermatrix)
    # GET PARTITIONS
    partition_text = getPartitions(parts, part_names)
    # OUTPUT
    alignment = AlignIO.MultipleSeqAlignment(supermatrix)
    print('Supermatrix of [{0}] length and [{1}] species generated with [{2}] '
          'gaps per species'.format(alignment.get_alignment_length(),
                                    len(alignment), ngaps_psp))
    outfile = os.path.join(output_dir, 'supermatrix.phy')
    with open(outfile, "w") as f:
        # write out using PhylipWriter in order to extend id_width
        AlignIO.PhylipIO.PhylipWriter(f).write_alignment(alignment, id_width=100)
    # OUTPUT PARTITIONS
    if partition_text:
        outfile = os.path.join(output_dir, 'partitions.txt')
        with open(outfile, 'w') as file:
            file.write(partition_text)
def plot_pairwise_diff(fastain, window_size):
    AA = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M',
          'F', 'P', 'S', 'T', 'W', 'Y', 'V']
    print(len(AA))
    aln = AlignIO.read('%s' % fastain, 'fasta')
    trans_aln = []
    for i in range(len(aln)):
        trans_aln.append(
            SeqIO.SeqRecord(Seq(str(aln[i].seq).replace('-', 'N')).translate()))
    trans_aln = AlignIO.MultipleSeqAlignment(trans_aln)
    seq_length = len(aln[1, :])
    n_windows = seq_length // window_size
    midpoint = []
    pwd = []
    raw_diff = []
    window_no = []
    count = 1
    end = window_size
    sliding_window_size = 100
    for each in range(n_windows):
        start = each * window_size
        end = (each + 1) * window_size
        print(start, end, each + 1)
        # one column of the translated alignment per window
        sub_aln = trans_aln[:, each]
        aa_freq = []
        for a in AA:
            align_array = np.array(sub_aln, str)
            aa_freq.append(str(align_array).count(a))
        total_aa_freq = sum(aa_freq)
        # expected pairwise difference from the amino-acid frequencies
        diff = 0.0
        for i in range(0, len(aa_freq)):
            freq_i = float(aa_freq[i]) / float(total_aa_freq)
            print(AA[i], freq_i)
            for j in range((i + 1), len(aa_freq)):
                freq_j = float(aa_freq[j]) / float(total_aa_freq - 1)
                diff += freq_i * freq_j
        pwd.append(diff)
        window_no.append(each + 1)
    return {"codon_no": window_no, "pwd": pwd}
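# Usage sketch (illustrative; the path and window size are assumptions). The
# returned dict holds the window numbers and the expected pairwise amino-acid
# difference per window:
pwd_by_window = plot_pairwise_diff('coding_alignment.fasta', window_size=3)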
def cut_fasta(input_file):
    '''
    Input: name of a file with an alignment in fasta format.
    Finds positions without gaps in the reference sequence (the first in the
    alignment), removes from the alignment the columns that are gaps in the
    reference sequence, and saves the new alignment to
    '<input_file_name>_cut.fasta'.
    '''
    alignment = AlignIO.read(open(input_file), "fasta")  # alignment object
    temp_seq = alignment[0].seq  # template - the reference sequence
    positions = []  # [start, end] blocks without gaps in the reference
    pos_st = 0  # start position of a block without gaps in the reference
    pos_end = 0  # end position of a block without gaps in the reference
    count = 0  # length of the current block
    prev = ''  # previous nucleotide
    k = 0  # if k == 1, we are inside a block of gaps
    # search for blocks without gaps
    for nuc in temp_seq:
        if nuc == '-' and prev != '-':
            pos_end = count
            if pos_end != 0:
                positions.append([pos_st, pos_end])
            k = 1
        if nuc != '-' and prev == '-':
            pos_st = count
            k = 0
        count += 1
        prev = nuc
    if k == 0:
        positions.append([pos_st, len(temp_seq)])
    print(positions)
    # if there are no gaps in the reference sequence
    if len(positions) == 0:
        alignment1 = alignment
    # otherwise concatenate the regions without gaps
    else:
        alignment1 = alignment[:, positions[0][0]:(positions[0][1])]
        for i in range(1, len(positions)):
            alignment1 = alignment1 + alignment[:, positions[i][0]:(positions[i][1])]
    # if more than 10% of a sequence is gaps, the sequence is deleted
    alignment_l = []
    for rec in alignment1:
        count_gap = rec.seq.count('-')
        if count_gap / len(rec.seq) < 0.10:
            alignment_l.append(rec)
    alignment_new = AlignIO.MultipleSeqAlignment(alignment_l)
    print('Number of sequences in alignment {}'.format(len(alignment_new)))
    # print("Alignment length {0}".format(alignment_new.get_alignment_length()))
    out_file = str(os.path.splitext(input_file)[0] + '_cut.fasta')
    AlignIO.write(alignment_new, open(out_file, 'w'), "fasta")
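# Usage sketch (illustrative; the input path is an assumption). Writes the
# column-reduced alignment to 'my_alignment_cut.fasta':
cut_fasta('my_alignment.fasta')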
# Iterate through all seqs in the alignment, splitting contigs where necessary.
SeqsForOutput = []
for seq in AlignedSeqs:
    if seq.id in ContigsFound:
        if args.trim_overhangs:
            TrimmedSeq = "-" * FirstRefStart + \
                str(seq.seq)[FirstRefStart:LastRefEnd + 1] + \
                "-" * (AlignmentLength - LastRefEnd - 1)
            assert len(TrimmedSeq) == AlignmentLength, \
                "Internal malfunction of overhang trimming"
            seq.seq = Seq(TrimmedSeq)
        SeqsForOutput += list(
            split_parts(seq, args.min_contig_size, args.split_gap_size))
    else:
        SeqsForOutput.append(seq)

# It's possible that after splitting contigs and imposing a minimum length
# threshold, there are no contigs left. Exit with status 3 - shiver's reserved
# non-zero exit status to indicate a lack of HIV data.
NumContigs = len(SeqsForOutput) - NumRefSeqs
if NumContigs == 0:
    print("After splitting contigs at gaps of length at least",
          args.split_gap_size, "and discarding contigs of length less than " +
          str(args.min_contig_size) + ", no contigs were left. Quitting.",
          file=sys.stderr)
    exit(3)

# Remove pure-gap columns and print the output.
OutputAlignment = AlignIO.MultipleSeqAlignment(SeqsForOutput)
OutputAlignment = RemoveBlankColumns(OutputAlignment)
AlignIO.write(OutputAlignment, sys.stdout, 'fasta')
        i += 1
        break

positions = list()
if os.path.exists(args.pos):
    with open(args.pos, 'r') as pos_file:
        positions = [
            return_range(pos)
            for pos in pos_file.readline().strip().split(',')
        ]
else:
    positions = [return_range(pos) for pos in args.pos.split(',')]
# flatten list
positions = [pos for element in positions for pos in element]
# prepare alignment positions
aln_pos = [ref_pos.get(rpos, None) for rpos in positions]
# extract the records
e_records = []
for record in aln:
    seq = ''.join([record.seq[apos] for apos in aln_pos])
    e_records.append(SeqRecord(Seq(seq), id=record.id, description=''))
aln_extract = AlignIO.MultipleSeqAlignment(e_records)
with open(args.out, 'w') as out_file:
    AlignIO.write(aln_extract, out_file, args.out_fmt)
def test_ambiguous_bases_one_seq(self):
    alignment = AlignIO.MultipleSeqAlignment([SeqRecord(Seq("RWAAT"))])
    result = get_expanded_sequences(alignment)
    # R is A or G and W is A or T, so the two ambiguous positions expand to
    # four sequences
    expected = {"GAAAT", "AAAAT", "GTAAT", "ATAAT"}
    self.assertEqual(set(result), expected)