def translate(s):
    """Translate an in-frame DNA string into an amino-acid string.

    The input is assumed to start in frame. Trailing bases that do not
    complete a codon are dropped before translating.
    """
    whole_codons = int(len(s) / 3)
    in_frame = s[:3 * whole_codons]
    coding_dna = Seq(in_frame, Gapped(IUPAC.ambiguous_dna))
    protein = coding_dna.translate()
    return str(protein)
from Bio.Align import AlignInfo
from Bio.SubsMat import FreqTable

# Create the command line to run ClustalW.
# This assumes clustalw is somewhere on your PATH; otherwise pass the full
# path of the executable via cmd="...".
cline = ClustalwCommandline(infile='opuntia.fasta', outfile='test.aln')

# Actually perform the alignment (through a shell everywhere except Windows).
return_code = subprocess.call(str(cline), shell=(sys.platform != "win32"))
assert return_code == 0, "Calling ClustalW failed"

# Parse the output.
alignment = AlignIO.read("test.aln", "clustal",
                         alphabet=Gapped(IUPAC.unambiguous_dna))

# Each print() below takes exactly one argument, so the output is the same
# on Python 2 (where print is a statement) and on Python 3.
print(alignment)
print('first description: %s' % (alignment[0].description,))
print('first sequence: %s' % (alignment[0].seq,))

# Get the length of the alignment.
print('length %s' % (alignment.get_alignment_length(),))

print(alignment)

# Print out interesting information about the alignment.
summary_align = AlignInfo.SummaryInfo(alignment)
consensus = summary_align.dumb_consensus()
def __init__(self):
    """Initialise the base Alignment with a '-'-gapped unambiguous DNA alphabet."""
    gapped_dna = Gapped(IUPAC.unambiguous_dna, '-')
    Alignment.__init__(self, gapped_dna)
from Bio.Alphabet import Gapped, SingleLetterAlphabet
from Bio.Seq import Seq

# Characters collected as "spaces"; three of them get a dedicated name below.
SPACES = ["-", ".", " ", "~"]
SPACE, MSF_SPACE, MSF_TERMINAL_SPACE = SPACES[0], SPACES[1], SPACES[3]

# Generic single-letter alphabet that uses SPACE as its gap character.
GAPPED_ALPHABET = Gapped(SingleLetterAlphabet(), SPACE)


class EmptySeq(Seq):
    """Seq built from the single SPACE character that reports a length of zero.

    Concatenation is delegated to an ordinary ``Seq`` copy of this object.
    """

    def __init__(self):
        super(EmptySeq, self).__init__(SPACE, GAPPED_ALPHABET)

    def __len__(self):
        # Always report zero, whatever the underlying data holds.
        return 0

    def __asseq(self):
        """Return a plain ``Seq`` with the same text and alphabet."""
        text = str(self)
        return Seq(text, self.alphabet)

    def __add__(self, other):
        plain = self.__asseq()
        return plain.__add__(other)

    def __radd__(self, other):
        plain = self.__asseq()
        return plain.__radd__(other)


# Shared module-level instance.
EMPTY_SEQ = EmptySeq()
#!/usr/bin/env python
from Bio import AlignIO
from Bio.Alphabet import IUPAC, Gapped
import sys

# This script takes a FASTA alignment and converts it to a
# PHYLIP sequential alignment.

# Check for correct arguments.
if len(sys.argv) != 3:
    print("Usage: FastaToPhylip.py <inputfile> <outputfile>")
    # NOTE(review): a usage error exits with status 0; kept as-is so callers
    # that rely on the exit status see no change.
    sys.exit(0)

input_name = sys.argv[1]
output_name = sys.argv[2]

# Context managers close both handles even when parsing or writing fails.
# The output is opened only after the input was parsed successfully, so a
# bad input file no longer truncates an existing output file.
with open(input_name, 'r') as input_file:
    alignment = AlignIO.read(input_file, 'fasta',
                             alphabet=Gapped(IUPAC.ambiguous_dna, '-'))

with open(output_name, 'w') as output_file:
    AlignIO.write(alignment, output_file, 'phylip-sequential')
from Bio.Align.Applications import ClustalwCommandline
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.SubsMat import FreqTable

# Build the ClustalW invocation. The executable is expected on PATH; pass
# cmd="..." with the full path to ClustalwCommandline if it is not.
cline = ClustalwCommandline(infile="opuntia.fasta", outfile="test.aln")

# Run the aligner; a shell is used on every platform except Windows.
use_shell = sys.platform != "win32"
return_code = subprocess.call(str(cline), shell=use_shell)
assert return_code == 0, "Calling ClustalW failed"

# Load the alignment that ClustalW wrote.
gapped_dna = Gapped(IUPAC.unambiguous_dna)
alignment = AlignIO.read("test.aln", "clustal", alphabet=gapped_dna)

# Show the alignment, its first record and its length, then the alignment
# once more.
first_record = alignment[0]
print(alignment)
print("first description: {0}".format(first_record.description))
print("first sequence: {0}".format(first_record.seq))
print("length {0:d}".format(alignment.get_alignment_length()))
print(alignment)

# Summarise the alignment and derive a simple consensus from it.
summary_align = AlignInfo.SummaryInfo(alignment)
consensus = summary_align.dumb_consensus()
def trim_alignment(self, method='edges', remove_probe=None, bases=None,
                   consensus=True, window_size=20, threshold=0.5):
    """Trim ``self.alignment`` and store the result in ``self.trimmed_alignment``.

    Parameters:
        method: one of ``'edges'``, ``'running'``, ``'running-probe'``,
            ``'static'`` or ``'notrim'``. ``'notrim'`` reuses the untrimmed
            alignment unchanged.
        remove_probe: when true (and ``self.ploc`` is set) the span
            ``self.ploc[0]:self.ploc[1]`` is cut out of every sequence.
        bases: number of bases kept on each side of the mid-point for the
            ``'static'`` method.
        consensus: when true, also compute the summary and consensus of the
            trimmed alignment via ``self._alignment_summary``.
        window_size, threshold: forwarded to ``self.running_average``.

    ``self.trimmed_alignment`` is set to ``None`` when a record cannot be
    trimmed; a message naming the alignment is then printed.
    """
    if method == 'edges':
        # find edges of the alignment
        start = self._find_ends(forward=True)
        end = self._find_ends(forward=False)
    elif method == 'running':
        start, end = self.running_average(window_size, threshold)
    elif method == 'running-probe':
        # get position of probe; k keeps the last index when no record is
        # named 'probe'
        for k, v in enumerate(self.alignment):
            if v.name == 'probe':
                break
        start, end = self.running_average(window_size, threshold, k, True)
    if method == 'notrim':
        self.trimmed_alignment = self.alignment
    else:
        # create a new alignment object to hold our alignment
        self.trimmed_alignment = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
        for sequence in self.alignment:
            if method in ('edges', 'running', 'running-probe') \
                    and not remove_probe:
                # Alignment only offers add_sequence(name, str), which would
                # discard the record slices, so the sliced records are
                # appended to the private _records list directly.
                if start >= 0 and end:
                    self.trimmed_alignment._records.append(
                        sequence[start:end])
                else:
                    self.trimmed_alignment = None
                    break
            elif method == 'static' and not remove_probe and bases:
                # get middle of alignment and trim out from that - there's a
                # weakness here in that we are not actually locating the
                # probe region, we're just locating the middle of the
                # alignment
                # Floor division keeps the mid-point an int on Python 3.
                mid_point = len(sequence) // 2
                if self._base_checker(bases, sequence, mid_point):
                    self.trimmed_alignment._records.append(
                        sequence[mid_point - bases:mid_point + bases])
                else:
                    # Stop here, as the edge-based branch does: the next
                    # record would otherwise dereference None._records.
                    self.trimmed_alignment = None
                    break
            elif method == 'static' and not remove_probe and bases \
                    and self.ploc:
                # NOTE(review): unreachable - the previous branch already
                # matches every case this condition covers. Kept so the
                # intended probe-location trimming is not lost; confirm
                # which of the two branches should take precedence.
                if self._base_checker(bases, sequence, self.ploc):
                    self.trimmed_alignment._records.append(
                        sequence[self.ploc[0] - bases:self.ploc[1] + bases])
                else:
                    self.trimmed_alignment = None
                    break
            elif remove_probe and self.ploc:
                # we have to drop to sequence level to add sequence slices
                # where we basically slice around the probes location
                temp = sequence.seq[:self.ploc[0]] + \
                    sequence.seq[self.ploc[1]:]
                self.trimmed_alignment._records.append(
                    self._record_formatter(temp))
            elif method == 'static' and remove_probe and bases and self.ploc:
                # NOTE(review): unreachable - shadowed by the
                # ``remove_probe and self.ploc`` branch above; confirm the
                # intended order.
                if self._base_checker(bases, sequence, self.ploc):
                    temp = sequence.seq[self.ploc[0] - bases:self.ploc[0]] + \
                        sequence.seq[self.ploc[1]:self.ploc[1] + bases]
                    self.trimmed_alignment._records.append(
                        self._record_formatter(temp))
                else:
                    self.trimmed_alignment = None
                    break
    # build a dumb consensus
    if consensus and self.trimmed_alignment:
        self.trimmed_alignment_summary, self.trimmed_alignment_consensus = \
            self._alignment_summary(self.trimmed_alignment)
    if not self.trimmed_alignment:
        # Single-argument print(): same output on Python 2 and Python 3.
        print("\tAlignment {0} dropped due to trimming".format(
            self.alignment._records[0].description.split('|')[1]))
def _is_wrong_polymorphic_site(column_bases, iupac_bases):
    """Return True when one alignment column must be recorded as polymorphic.

    ``column_bases`` holds the characters of a single column and
    ``iupac_bases`` the ambiguity codes to recognise. The decision depends
    on the number of distinct characters in the column:

    * 1 state: never recorded;
    * 2 states: recorded unless the rarest state is a gap or any ambiguity
      code is present;
    * 3 states: recorded unless the rarest state is a gap and exactly one
      ambiguity code is present;
    * any other count: always recorded.
    """
    counter = Counter(column_bases)
    n_states = len(counter)
    if n_states == 1:
        return False
    if n_states not in (2, 3):
        return True
    sorted_bases = counter.most_common()
    # NOTE(review): only the least frequent state is tested for being a gap
    # (this is what the original loop computed); presumably "any state is a
    # gap" was intended - confirm before changing, it alters the results.
    rarest_is_gap = sorted_bases[-1][0] == "-"
    n_iupac = sum(1 for base, _count in sorted_bases if base in iupac_bases)
    if n_states == 2:
        return not rarest_is_gap and n_iupac == 0
    return not rarest_is_gap or n_iupac != 1


def rm_wrong_polymorphism_sites(seq_directory, outgroup_path, window_size = 20, Max_p_sites = 4):
    """Hand densely polymorphic columns of every s6 alignment to trimal.

    For each file in the ``s6_trimal/`` counterpart of ``seq_directory`` the
    alignment is read, records whose id is listed by
    ``input_outgroup(outgroup_path)`` are left out, and every window produced
    by ``window(range(length), window_size)`` is scanned. When a window holds
    more than ``Max_p_sites`` recorded columns, those columns are collected.
    The first and last ten columns are always added. The collected columns
    are passed to ``trimal -selectcols`` and the result is written to
    ``s7_well_trimal//s1_rm_polymorphism_sites/``; a file with no collected
    column is copied unchanged.

    Commands are run through ``os.system``.
    NOTE(review): file names containing spaces or shell metacharacters will
    break these commands.
    """
    ### define iupac
    iupac_bases = frozenset("mrwsykMRWSYKvhdbVHDB")
    ### input files are from s6
    genes_result_s6 = seq_directory.replace("s1_Gene/", "s6_trimal/")
    ### output directory for s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")
    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)
    output_directory = genes_result_s7 + "/s1_rm_polymorphism_sites/"
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory)

    ### iterate each gene
    for file_name in os.listdir(genes_result_s6):
        if file_name == ".DS_Store":
            continue
        output_directory_file = output_directory + file_name
        fasta_name = genes_result_s6 + file_name
        ### read each alignment
        for sequence in glob(fasta_name):
            print("sequence: " + sequence)
            alignment = AlignIO.read(sequence, 'fasta')
            ### generate a new alignment without outgroups.
            align = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
            for record in alignment:
                if record.id not in outgroups:
                    align.add_sequence(str(record.id), str(record.seq))
            print(align)

            total_wrong_poly_sites = []
            ### change alignment to an array (rows = records, columns = sites).
            align_array = np.array([list(rec) for rec in align])
            total_length = align.get_alignment_length()

            ### scan each window for polymorphic columns.
            for each in window(range(total_length), window_size):
                column_position = [
                    column for column in each
                    if _is_wrong_polymorphic_site(align_array[:, column],
                                                  iupac_bases)
                ]
                ### keep the columns of windows that exceed the threshold.
                if len(column_position) > float(Max_p_sites):
                    print(column_position)
                    total_wrong_poly_sites.extend(column_position)

            ### always add the first and last ten columns; clamped so that an
            ### alignment shorter than ten columns yields no negative index.
            edge = min(10, total_length)
            total_wrong_poly_sites.extend(range(edge))
            total_wrong_poly_sites.extend(range(total_length - edge,
                                                total_length))

            ### unique, sorted positions as plain Python ints: NumPy 2 renders
            ### its own scalars as "np.int64(3)", which would corrupt the
            ### column list handed to trimal.
            unique_wrong_sites = np.unique(total_wrong_poly_sites).tolist()
            print(len(unique_wrong_sites))

            if len(unique_wrong_sites) > 0:
                ### braces are backslash-escaped for the shell.
                cmd_selected_col = (
                    "\\{ "
                    + ",".join(str(site) for site in unique_wrong_sites)
                    + " \\}"
                )
                print(cmd_selected_col)
                cmd = "trimal -in " + fasta_name + " -out " + output_directory_file + " -selectcols " + cmd_selected_col
                print(cmd)
                os.system(cmd)
            else:
                ### nothing selected: copy the gene to the new folder.
                cmd_2 = "cp " + fasta_name + " " + output_directory_file
                print(cmd_2)
                os.system(cmd_2)
True >>> _match_ambiguous_dna('A', 'T') False >>> _match_ambiguous_dna('A', 'A') True """ x = x.upper() y = y.upper() xset = set(ambiguous_dna_values.get(x, x)) yset = set(ambiguous_dna_values.get(y, y)) if not xset.intersection(yset): return False return True DNA_ALPHABET = alphabet = Gapped(ambiguous_dna, '-') DNA_ALPHABET.match = lambda x, y: _match_ambiguous_dna(x, y) FLAGS = MavisNamespace(LQ='LOWQUAL') READ_PAIR_TYPE = MavisNamespace(RR='RR', LL='LL', RL='RL', LR='LR') CALL_METHOD = MavisNamespace(CONTIG='contig', SPLIT='split reads', FLANK='flanking reads', SPAN='spanning reads', INPUT='input') """:class:`MavisNamespace`: holds controlled vocabulary for allowed call methods - ``CONTIG``: a contig was assembled and aligned across the breakpoints - ``SPLIT``: the event was called by :term:`split read` - ``FLANK``: the event was called by :term:`flanking read pair` - ``SPAN``: the event was called by :term:`spanning read` """
def snp_count(in_ace, out_file, snp_dict, tags, win_len, max_del, stars):
    """Genotype individuals at SNPs loci.

    Every contig of the ACE file ``in_ace`` is rebuilt as an alignment of its
    consensus plus its clipped, padded reads (reads whose name contains
    "pseudo" are skipped). For each position listed in ``snp_dict`` under the
    contig name, the nucleotide of every read is counted per tag and written
    to ``out_file`` as tab-separated rows.

    Parameters:
        in_ace: path of the ACE file to read.
        out_file: path of the tab-separated report to write.
        snp_dict: mapping from contig name ("Contig_<n>") to positions;
            contigs that are missing from it are skipped.
        tags: strings searched for in the read names. The first one found
            names the group of the read; reads matching none are counted
            under "XX_noTag".
        win_len: window length; reads with more than
            ``(win_len - 1) // 2 - 3`` gap characters in the window around a
            position are ignored.
        max_del: maximum number of "*"-versus-letter mismatches tolerated
            between a read window and the reference window.
        stars: when equal to True, positions are first converted with
            ``correct_position``.

    Errors raised while parsing the ACE file now propagate instead of being
    reported as the normal end of the file.
    """
    # Floor division keeps the integer result of Python 2 on Python 3.
    win_buffer = (win_len - 1) // 2
    nucleotides = "ACGTN*-"
    with open(in_ace, 'r') as ace_handle, open(out_file, "w") as output_file:
        output_file.write("Contig_nb\tPos\ttag_name\tA\tC\tG\tT\tN\t*\t-\n")
        for contig in Ace.parse(ace_handle):
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            align.add_sequence(contig.name, contig.sequence)
            for readn, read in enumerate(contig.reads):
                # Keep the intersection of the quality and the alignment
                # clipping ranges. The original compared clipe2 with itself,
                # so the alignment clipping end was never applied.
                clipst = max(read.qa.qual_clipping_start,
                             read.qa.align_clipping_start)
                clipe = min(read.qa.qual_clipping_end,
                            read.qa.align_clipping_end)
                start = contig.af[readn].padded_start
                seq = cut_ends(read.rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, len(contig.sequence))
                if "pseudo" not in read.rd.name:
                    align.add_sequence(read.rd.name, seq)
            sequences = read_fasta(align.format("fasta"))
            contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
            print("Treating %s" % contig_name)
            try:
                positions = snp_dict[contig_name]
            except KeyError:
                # No SNP known for this contig.
                continue
            d = {}
            for pos in positions:
                if stars == True:
                    pos_ok = correct_position(pos, sequences[0][1])
                else:
                    pos_ok = pos
                left = max(pos_ok - 5, 0)
                right = pos_ok + 1 + 5  # takes into account the middle nucleotide
                ref_window = sequences[0][1][left:right]
                # One zeroed counter per group and per nucleotide symbol.
                d.setdefault(pos, {})
                for group in ["XX_noTag"] + list(tags):
                    d[pos].setdefault(group, {})
                    for nuc in nucleotides:
                        d[pos][group].setdefault(nuc, 0)
                for fasta in sequences:
                    window = fasta[1][left:right]
                    if window.count("-") > win_buffer - 3:
                        continue  # Need at least 3 nucleotides on each side
                    # for/else: falls back to "XX_noTag" when no tag matches,
                    # including when ``tags`` is empty.
                    for tag in tags:
                        if tag in fasta[0]:
                            t = tag
                            break
                    else:
                        t = "XX_noTag"
                    del_count = 0
                    if len(ref_window) == len(window):
                        for ref_char, read_char in zip(ref_window, window):
                            if ref_char.isalpha() and read_char == "*" or \
                                    read_char.isalpha() and ref_char == "*":
                                del_count += 1
                    if del_count > max_del:
                        continue
                    # NOTE(review): a symbol outside "ACGTN*-" raises
                    # KeyError here, as it did before.
                    n = fasta[1][pos_ok - 1].upper()
                    d[pos][t][n] += 1
            for p in sorted(d):
                for t in sorted(d[p]):
                    output_file.write(contig_name + "\t" + str(p) + "\t" + str(t))
                    for n in nucleotides:
                        output_file.write("\t" + str(d[p][t][n]))
                    output_file.write("\n")
    print("***All contigs treated***")
from __future__ import division, print_function

from Bio.Alphabet import Gapped
from Bio.Alphabet.IUPAC import extended_dna, extended_protein

# Characters grouped under the name GAPS; presumably the symbols treated as
# gaps elsewhere in the module -- TODO confirm against the callers.
GAPS = '_.-='

# Gapped wrappers around Biopython's extended IUPAC protein and DNA alphabets.
# No gap character is passed, so Gapped's own default applies.
AminoAlphabet = Gapped(extended_protein)
DNAAlphabet = Gapped(extended_dna)
inFile = open(brat_in, 'r') alignLength = align.get_alignment_length() genome = [0]*alignLength for index, line in enumerate(inFile): if index > 0: line = line.strip() wordList = line.split() start = int(wordList[0]) stop = int(wordList[1]) genome[start:stop + 1] = [x+1 for x in genome[start:stop + 1]] recoFreeAlign = align[:, 0:1] for i in range(1, len(genome)): if genome[i] == 0: recoFreeAlign = recoFreeAlign + align[:, i:i+1] return recoFreeAlign # Get command line arguments brat_in, fasta_in = get_arguments(sys.argv[1:]) if brat_in is None or fasta_in is None: usage() sys.exit(2) # Read in BratNextGen File and FASTA alignment align = AlignIO.read(fasta_in, "fasta", alphabet = Gapped(IUPAC.ambiguous_dna, '-')) noRecoAlign = remove_reco(brat_in, align) # output alignment without recombination outName = os.path.splitext(fasta_in)[0] + "noReco.fasta" AlignIO.write(noRecoAlign, outName, "fasta")
def __init__(self, reference_path, patient=None):
    """Load the reference JSON files of ``reference_path`` into ``self.reference_df``.

    Only files whose name contains "reference" are read. When ``patient`` is
    given, only names containing ``_<patient>.`` are kept, and ``self.region``
    is built from the resulting frame.
    """
    self.reference_df = pd.DataFrame()

    # First collect the file names; when one patient was requested, keep
    # only the names that belong to that patient.
    patient_marker = f'_{patient}.' if patient else None
    reference_list = [
        file_name for file_name in os.listdir(reference_path)
        if 'reference' in file_name
        and (patient_marker is None or patient_marker in file_name)
    ]

    # Gather the data from the JSON files.
    for name in reference_list:
        with open(os.path.join(reference_path, name)) as handle:
            json_file = json.load(handle)

        # Drop the columns that are not needed.
        frame = pd.DataFrame(data=json_file).drop(
            columns=['name', 'description'])

        # Split the combined "features" column into separate columns.
        without_features = frame.drop(columns=['features'])
        expanded_features = frame.features.apply(pd.Series)
        frame = pd.concat([without_features, expanded_features], axis=1)

        # Convert the location to a plain list.
        frame.location = frame.location.apply(pd.Series)

        # Cut the region out of the full sequence.
        frame['region_seq'] = frame.apply(
            lambda row: row.seq[row.location[0]:row.location[1]].strip(),
            axis=1)

        # Rename for consistency.
        frame = frame.rename(columns={
            'region_seq': 'sequence',
            'seq': 'full_reference'
        })

        frame['translated'] = frame.sequence.apply(lambda dna: Seq(
            dna, Gapped(IUPAC.unambiguous_dna)).ungap().translate())

        self.reference_df = pd.concat([self.reference_df, frame],
                                      ignore_index=True)

    # Keep only the columns that are needed.
    wanted_columns = ['sequence', 'name', 'translated', 'id']
    self.reference_df = self.reference_df[wanted_columns]

    # The remaining columns are filled in by hand (they are used in the tree).
    self.reference_df['days'] = 0
    self.reference_df['frequency'] = 100
    self.reference_df['nreads'] = 1

    # When this was done for a single patient, expose its regions right away.
    if patient:
        self.region = Region(self.reference_df)
def AceIterator(source):
    """Return SeqRecord objects from an ACE file.

    This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
    by iterating over the file in a single pass, we are forced to ignore any
    WA, CT, RT or WR footer tags.

    Ace files include the base quality for each position, which are taken
    to be PHRED style scores. Just as if you had read in a FASTQ or QUAL file
    using PHRED scores using Bio.SeqIO, these are stored in the SeqRecord's
    letter_annotations dictionary under the "phred_quality" key.

    >>> from Bio import SeqIO
    >>> with open("Ace/consed_sample.ace") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s %s... %i" % (record.id, record.seq[:10], len(record)))
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 agccccgggc... 1475
    90

    However, ACE files do not include a base quality for any gaps in the
    consensus sequence, and these are represented in Biopython with a quality
    of zero. Using zero is perhaps misleading as there may be very strong
    evidence to support the gap in the consensus. Previous versions of
    Biopython therefore used None instead, but this complicated usage, and
    prevented output of the gapped sequence as FASTQ format.

    >>> from Bio import SeqIO
    >>> with open("Ace/contig1.ace") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s ...%s..." % (record.id, record.seq[85:95]))
    ...         print(record.letter_annotations["phred_quality"][85:95])
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 ...AGAGG-ATGC...
    [57, 57, 54, 57, 57, 0, 57, 72, 72, 72]
    90
    Contig2 ...GAATTACTAT...
    [68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
    90

    """
    for ace_contig in Ace.parse(source):
        # Turn the ACE contig record into a SeqRecord...
        consensus_seq_str = ace_contig.sequence

        # Treat the consensus as DNA unless it contains a U. A sequence with
        # both U and T is odd (possibly an error), so it only gets the
        # generic nucleotide alphabet.
        if "U" not in consensus_seq_str:
            alpha = generic_dna
        elif "T" in consensus_seq_str:
            alpha = generic_nucleotide
        else:
            alpha = generic_rna

        if "*" in consensus_seq_str:
            # For consistency with most other file formats, any * gaps are
            # mapped to - gaps.
            assert "-" not in consensus_seq_str
            dashed = consensus_seq_str.replace("*", "-")
            consensus_seq = Seq(dashed, Gapped(alpha, gap_char="-"))
        else:
            consensus_seq = Seq(consensus_seq_str, alpha)

        # TODO? - Base segments (BS lines) which indicates which read
        # phrap has chosen to be the consensus at a particular position.
        # Perhaps as SeqFeature objects?

        # TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
        # Perhaps as SeqFeature objects?

        seq_record = SeqRecord(consensus_seq,
                               id=ace_contig.name,
                               name=ace_contig.name)

        # Consensus base quality (BQ lines). Gaps (originally * characters)
        # have no quality entry in the file, so each gap gets a quality of
        # zero and does not consume an entry of ace_contig.quality.
        quals = []
        consumed = 0
        for base in consensus_seq:
            if base == "-":
                quals.append(0)
                continue
            quals.append(ace_contig.quality[consumed])
            consumed += 1
        assert consumed == len(ace_contig.quality)
        seq_record.letter_annotations["phred_quality"] = quals

        yield seq_record
def pairwise(in_ace, out_file):
    """Calculate pairwise differentiation indexes.

    Every contig of the ACE file ``in_ace`` is rebuilt as an alignment of its
    consensus plus its clipped, padded reads (reads whose name contains
    "pseudo" are skipped). Each read is compared with the consensus in
    consecutive windows; its index is the number of differences divided by
    the compared length. The mean index of the reads that show at least one
    difference is written to ``out_file`` as ``<contig name>\\t<mean>``, or
    "NA" when no read qualifies.

    Errors raised while parsing the ACE file now propagate instead of being
    reported as the normal end of the file.
    """
    window_len = 8  # PARAMETER
    max_diff = 3  # PARAMETER
    min_len_seq = 100  # PARAMETER
    with open(in_ace, 'r') as ace_handle, open(out_file, "w") as output_file:
        for contig in Ace.parse(ace_handle):
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            align.add_sequence(contig.name, contig.sequence)
            for readn, read in enumerate(contig.reads):
                clipst = read.qa.qual_clipping_start
                clipe = read.qa.qual_clipping_end
                start = contig.af[readn].padded_start
                seq = cut_ends(read.rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, len(contig.sequence))
                if "pseudo" not in read.rd.name:
                    align.add_sequence(read.rd.name, seq)
            sequences = read_fasta(align.format("fasta"))
            contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
            print("Treating %s" % contig_name)
            reference = sequences[0][1]
            len_contig = len(reference)
            number_indexes = 0
            total_indexes = 0
            for seq in sequences[1:]:
                # Start comparing after the leading padding of the read.
                leading_pad = re.match("-+", seq[1])
                start = len(leading_pad.group(0)) if leading_pad else 0
                len_seq = 0
                count = 0
                for window in range(start, len_contig, window_len):
                    nuc_contig = reference[window:window + window_len]
                    nuc_seq = seq[1][window:window + window_len]
                    if "-" in nuc_seq:
                        len_seq += len(nuc_seq.replace("-", ""))
                    else:
                        diff = count_diff(nuc_contig, nuc_seq, max_diff)
                        # NOTE(review): windows flagged by count_diff add
                        # neither differences nor length; confirm that the
                        # length was not meant to be counted regardless.
                        if diff[1] == False:
                            count += diff[0]
                            len_seq += window_len
                # Count the "*" of the sequence string itself. Calling
                # count() on the (name, sequence) pair compared whole
                # elements with "*" and therefore always returned 0.
                len_seq -= seq[1].count("*")
                if len_seq >= min_len_seq and count > 0:
                    number_indexes += 1
                    total_indexes += float(count) / len_seq
            try:
                mean_index = float(total_indexes) / number_indexes
            except ZeroDivisionError:
                mean_index = "NA"
            output_file.write(contig_name + "\t" + str(mean_index) + "\n")
    print("***All contigs treated***")
def build_hsp():
    """Build the two-row (query, match) alignment for the current hit.

    All inputs come from the enclosing scope: the query/match sequences, ids,
    descriptions and tag dictionaries, plus the header, alignment and global
    tags. The returned MultipleSeqAlignment carries the header and alignment
    tags in its private ``_annotations`` dictionary.

    Raises:
        ValueError: when neither query nor match tags are available.
        AssertionError: when the extracted query and match regions differ in
            length (debugging details are printed first).
    """
    if not query_tags and not match_tags:
        raise ValueError("No data for query %r, match %r"
                         % (query_id, match_id))
    assert query_tags, query_tags
    assert match_tags, match_tags
    q = "?"  # Just for printing len(q) in debug below
    m = "?"  # Just for printing len(m) in debug below
    tool = global_tags.get("tool", "").upper()
    try:
        q = _extract_alignment_region(query_seq, query_tags)
        if tool in ["TFASTX"] and len(match_seq) == len(q):
            m = match_seq
            # Quick hack until I can work out how -, * and / characters
            # and the apparent mix of aa and bp coordinates works.
        else:
            m = _extract_alignment_region(match_seq, match_tags)
        assert len(q) == len(m)
    except AssertionError:
        print("Darn... amino acids vs nucleotide coordinates?")
        print(tool)
        print(query_seq)
        print(query_tags)
        print("%s %i" % (q, len(q)))
        print(match_seq)
        print(match_tags)
        print("%s %i" % (m, len(m)))
        print(handle.name)
        raise

    assert alphabet is not None
    alignment = MultipleSeqAlignment([], alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property:
    alignment._annotations = {}

    # Want to record both the query header tags, and the alignment tags.
    # dict.update() works on Python 2 and 3, unlike the former .iteritems().
    alignment._annotations.update(header_tags)
    alignment._annotations.update(align_tags)

    def add_row(seq_str, tags, row_id, row_name, row_descr):
        """Append one record to the alignment, then refine its alphabet."""
        record = SeqRecord(
            Seq(seq_str, alphabet),
            id=row_id,
            name=row_name,
            description=row_descr,
            annotations={"original_length": int(tags["sq_len"])})
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(tags["al_start"])
        record._al_stop = int(tags["al_stop"])
        # Append before touching the alphabet: the record is created with
        # the alignment's own alphabet and only specialised afterwards.
        alignment.append(record)

        # TODO - What if a specific alphabet has been requested?
        # TODO - Use an IUPAC alphabet?
        # TODO - Can FASTA output RNA?
        # This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in tags:
            if tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in seq_str:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    # Query
    add_row(q, query_tags, query_id, "query", query_descr)
    # Match
    add_row(m, match_tags, match_id, "match", match_descr)

    return alignment
def run(self, consensusThreshold):
    """Mask the general-alignment consensus positions in every clade alignment.

    The gap consensus of ``self.generalAlignment`` is computed with
    ``consensusThreshold``; every position whose consensus is neither '-'
    nor 'X' is marked for masking. Each ``*.fas`` file found in
    ``self.pathToCladesAlignments`` is then read, and the marked positions
    are replaced by 'X' in all of its sequences. A position is left alone
    when the clade's own consensus is a gap there. The masked alignment is
    written next to ``self.outPutPath``.

    Returns:
        list of the paths of the written FASTA files.
    """
    from Bio import AlignIO, SeqIO
    from Bio.Align import AlignInfo
    from Bio.Alphabet import IUPAC, Gapped

    listing = os.listdir(self.pathToCladesAlignments)
    consensus = {}
    consensusThres = consensusThreshold
    generalAlignment = AlignIO.parse(
        self.generalAlignment, "fasta",
        alphabet=Gapped(IUPAC.ExtendedIUPACProtein(), "-"))
    positionsToMask = []
    for genAlignment in generalAlignment:
        sumGen = AlignInfo.SummaryInfo(genAlignment)
        genConsensus = sumGen.gap_consensus(consensusThres)
        for index, residue in enumerate(genConsensus):
            if residue == '-' or residue == 'X':
                continue
            positionsToMask.append(index)
        # NOTE(review): the matrix is computed but never used; the call is
        # kept so that any error it raises still surfaces.
        pssmGen = sumGen.pos_specific_score_matrix(genConsensus)
        lengthGenAl = len(genAlignment)
    # Single-argument print(): same output on Python 2 and Python 3.
    print(positionsToMask)
    print(listing)

    resultAlignFiles = []
    for item in listing:
        if not item.endswith(".fas"):
            continue
        alignments = AlignIO.parse(
            os.path.join(self.pathToCladesAlignments, item), "fasta",
            alphabet=Gapped(IUPAC.ExtendedIUPACProtein(), "-"))
        # outPutPath is used as a plain prefix, exactly as before.
        outName = self.outPutPath + item + \
            "_noPKSsignal_Thres%d.faa" % (consensusThres * 100, )
        for alignment in alignments:
            summ = AlignInfo.SummaryInfo(alignment)
            consensus[item] = summ.gap_consensus(consensusThres)
            # Positions to overwrite, decided once against the consensus
            # computed before any masking.
            maskable = [posToMask for posToMask in positionsToMask
                        if consensus[item][posToMask] != '-']
            # One mutable copy per record instead of one per position.
            for alignElement in alignment:
                mutSeq = alignElement.seq.tomutable()
                for posToMask in maskable:
                    mutSeq[posToMask] = 'X'
                alignElement.seq = mutSeq.toseq()
            SeqIO.write(alignment, outName, "fasta")
            resultAlignFiles.append(outName)
            # Recompute and show the consensus of the masked alignment.
            summ = AlignInfo.SummaryInfo(alignment)
            consensus[item] = summ.gap_consensus(consensusThres)
            print("%s %s" % (item, consensus[item]))
    return resultAlignFiles
def _is_outgroup_polymorphic_site(column_bases, iupac_bases):
    """Return True when one alignment column must be recorded as polymorphic.

    ``column_bases`` holds the characters of a single column and
    ``iupac_bases`` the ambiguity codes to recognise. The decision depends
    on the number of distinct characters in the column:

    * 1 state: never recorded;
    * 2 states: recorded unless the rarest state is a gap or any ambiguity
      code is present;
    * 3 states: recorded unless the rarest state is a gap and exactly one
      ambiguity code is present;
    * any other count: always recorded.
    """
    counter = Counter(column_bases)
    n_states = len(counter)
    if n_states == 1:
        return False
    if n_states not in (2, 3):
        return True
    sorted_bases = counter.most_common()
    # NOTE(review): only the least frequent state is tested for being a gap
    # (this is what the original loop computed); presumably "any state is a
    # gap" was intended - confirm before changing, it alters the results.
    rarest_is_gap = sorted_bases[-1][0] == "-"
    n_iupac = sum(1 for base, _count in sorted_bases if base in iupac_bases)
    if n_states == 2:
        return not rarest_is_gap and n_iupac == 0
    return not rarest_is_gap or n_iupac != 1


def replace_outgroup_with_gap(seq_directory, outgroup_path, window_size = 20, Max_p_sites_o = 8):
    """Blank densely polymorphic columns in the outgroup sequences.

    Every alignment in ``s7_well_trimal//s1_rm_polymorphism_sites/`` is
    scanned with ``window(range(length), window_size)``. When a window holds
    more than ``Max_p_sites_o`` recorded columns, those columns are
    collected. In records whose id is listed by
    ``input_outgroup(outgroup_path)`` the collected columns are replaced by
    "-"; all other records are copied unchanged. The result is written as
    FASTA to ``s2_rm_polymorphism_in_outgroups/``.

    NOTE(review): the columns are classified over all records of the
    alignment, not over the outgroup records only - confirm this is intended.
    """
    ### define iupac
    iupac_bases = frozenset("mrwsykMRWSYKvhdbVHDB")
    ### input directory from s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")
    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)
    output_directory_1 = genes_result_s7 + "/s1_rm_polymorphism_sites/"
    output_directory_2 = output_directory_1.replace("/s1_rm_polymorphism_sites/", "/s2_rm_polymorphism_in_outgroups/")
    if not os.path.isdir(output_directory_2):
        os.makedirs(output_directory_2)

    ### iterate each gene
    for file_name in os.listdir(output_directory_1):
        if file_name == ".DS_Store":
            continue
        output_directory_file = output_directory_2 + file_name
        fasta_name = output_directory_1 + file_name
        ### read each alignment
        for sequence in glob(fasta_name):
            print("sequence: " + sequence)
            alignment = AlignIO.read(sequence, 'fasta')

            ### change alignment to an array (rows = records, columns = sites).
            total_wrong_poly_sites_outgroup = []
            align_array_outgroup = np.array([list(rec) for rec in alignment])
            total_length = alignment.get_alignment_length()

            for each in window(range(total_length), window_size):
                column_position_outgroup = [
                    column for column in each
                    if _is_outgroup_polymorphic_site(
                        align_array_outgroup[:, column], iupac_bases)
                ]
                ### keep the columns of windows that exceed the threshold.
                if len(column_position_outgroup) > float(Max_p_sites_o):
                    print(column_position_outgroup)
                    total_wrong_poly_sites_outgroup.extend(
                        column_position_outgroup)

            ### plain Python ints print the same under every NumPy version.
            unique_wrong_sites_outgroup = np.unique(
                total_wrong_poly_sites_outgroup).tolist()
            print(unique_wrong_sites_outgroup)
            print("outgroup")

            ### set membership is O(1) per base; the list lookup was O(sites).
            masked_sites = set(unique_wrong_sites_outgroup)
            align_2 = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
            for record in alignment:
                if record.id in outgroups:
                    print(record.seq)
                    seq_text = str(record.seq)
                    new_seq = "".join(
                        "-" if i in masked_sites else seq_text[i]
                        for i in range(total_length)
                    )
                    align_2.add_sequence(str(record.id), new_seq)
                else:
                    align_2.add_sequence(str(record.id), str(record.seq))
            print(align_2)
            AlignIO.write(align_2, output_directory_file, "fasta")
# University of Florida
parser = argparse.ArgumentParser()
parser.add_argument("-i", help="input Phylip formatted file")
parser.add_argument("-o", help="output filename")
# choices= rejects an unknown alphabet up front; it used to be ignored
# silently, leaving an empty output file behind.
parser.add_argument("-a", help="Alphabet: dna or aa, default=dna",
                    default="dna", choices=("dna", "aa"))
args = parser.parse_args()
infile = args.i
outfile = args.o
alphabet = args.a

# Stop as soon as a file cannot be opened. The script used to print a
# message and carry on, only to fail later with a NameError.
try:
    IN = open(infile, 'r')
except IOError:
    raise SystemExit("Can't open file %s" % infile)
try:
    # NOTE(review): append mode is kept from the original; writing to an
    # existing file adds a second NEXUS block to it - confirm this is wanted.
    OUT = open(outfile, 'a')
except IOError:
    IN.close()
    raise SystemExit("Can't open file %s" % outfile)

# Base alphabet used for each accepted value of -a.
base_alphabets = {"dna": IUPAC.ambiguous_dna, "aa": IUPAC.protein}

# Both handles are closed on exit, including when the conversion fails.
with IN, OUT:
    alignment = AlignIO.read(IN, "phylip-relaxed",
                             alphabet=Gapped(base_alphabets[alphabet]))
    AlignIO.write([alignment], OUT, "nexus")
#!/usr/bin/env python
"""Example of generating a substitution matrix from an alignment.
"""
# standard library
from __future__ import print_function

# Biopython
from Bio import SubsMat
from Bio import AlignIO
from Bio.Alphabet import IUPAC, Gapped
from Bio.Align import AlignInfo

# Load the ClustalW protein alignment and wrap it in a summary object.
gapped_protein = Gapped(IUPAC.protein)
c_align = AlignIO.read('protein.aln', 'clustal', alphabet=gapped_protein)
summary_align = AlignInfo.SummaryInfo(c_align)

# Residues left out of the replacement dictionary: every amino acid that
# is not charged polar.
skipped_residues = list("GAVLIMPFWSTNQYC")

# Replacement dictionary, then the accepted replacement matrix built from it.
replace_info = summary_align.replacement_dictionary(skipped_residues)
my_arm = SubsMat.SeqMat(replace_info)
print(replace_info)

# Log-odds matrix derived from the accepted replacement matrix.
my_lom = SubsMat.make_log_odds_matrix(my_arm)
print('log_odds_mat: %s' % my_lom)
def write_fasta(chromosome, RGID, refID):
    """Write the joined ``chromosome`` pieces as one FASTA record.

    The record id is ``pilon_<RGID>``, its description ``RGA_to_<refID>``,
    and the output file is ``<RGID>_RGA_pilon.fasta``.
    """
    target_path = RGID + "_RGA_pilon.fasta"
    sample_id = "pilon_" + RGID
    sequence = Seq("".join(chromosome), Gapped(IUPAC.ambiguous_dna, '-'))
    record = SeqRecord(sequence, id=sample_id,
                       description="RGA_to_" + refID)
    SeqIO.write(record, target_path, "fasta")