def load_csv_file(file, delimiter=";"):
    """Load a "Primer" CSV file.

    The first row must be a header naming exactly the seven expected
    columns, in any order: ``id``, ``forwardPrimer``, ``reversePrimer``,
    ``fPDNA``, ``rPDNA``, ``ampliconMinLength`` and ``ampliconMaxLength``.

    Data rows whose column count differs from the header are skipped with
    a warning, and so are primer pairs rejected by
    ``check_primer_pair_integrity``.

    @param file: path of the CSV file to read
    @param delimiter: column separator, ";" by default
    @returns: dict mapping each primer-pair id to its PrimerPair instance
    @raises ValueError: if the header is missing, has the wrong number of
        columns, names an unknown column, or names a column twice; also
        if an amplicon length cannot be parsed as an integer
    """
    expected = ("id", "forwardPrimer", "reversePrimer", "fPDNA", "rPDNA",
                "ampliconMinLength", "ampliconMaxLength")
    header_len = len(expected)
    primer_dict = {}

    with open(file, newline='') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=delimiter)

        # An empty file has no header row at all; report it the same way
        # as any other malformed header instead of leaking StopIteration.
        headers = next(csvreader, None)
        if headers is None or len(headers) != header_len:
            raise ValueError("Wrong header")

        pos = {}
        for index, name in enumerate(headers):
            if name not in expected:
                raise ValueError("Unknown header " + name)
            # A repeated name would leave some other column unmapped, and
            # the reader would then silently take its values from the
            # wrong column.
            if name in pos:
                raise ValueError("Duplicate header " + name)
            pos[name] = index

        # The header is line 1, so the first data row is line 2.
        for line_no, row in enumerate(csvreader, start=2):
            if len(row) != header_len:
                logging.warning("Wrong primer pair in line " + str(line_no))
                continue

            fprimer = SeqRecord(
                Seq(row[pos["fPDNA"]], IUPAC.IUPACAmbiguousDNA()))
            fprimer.id = row[pos["forwardPrimer"]]

            rprimer = SeqRecord(
                Seq(row[pos["rPDNA"]], IUPAC.IUPACAmbiguousDNA()))
            # TODO: the reverse primer is currently always
            # reverse-complemented (the original guard was `if (True)`).
            rprimer = rprimer.reverse_complement()
            rprimer.id = row[pos["reversePrimer"]]

            pair_id = row[pos["id"]]
            primer_pair = PrimerPair(pair_id, fprimer, rprimer,
                                     int(row[pos["ampliconMinLength"]]),
                                     int(row[pos["ampliconMaxLength"]]))
            if check_primer_pair_integrity(primer_pair):
                primer_dict[pair_id] = primer_pair
            else:
                logging.warning("Skipping primer pair " + primer_pair.id
                                + ", bad sequence")
    return primer_dict
def stage_one_trimming(alignment, window_size, proportion, threshold, min_len):
    """
    ---------------------------------------------------------------------
    MODIFIED FUNCTION FROM PHYLUCE: generic_align.py
    ---------------------------------------------------------------------
    First stage alignment trimming to find and trim edges of a given
    alignment.  Calls running_average to determine the start and end
    trimming positions for the entire alignment block.

    Every record in ``alignment`` has its ``seq.alphabet`` reset to
    IUPAC ambiguous DNA as a side effect.

    Returns a new MultipleSeqAlignment holding each record sliced to
    ``[start:end]``, or None as soon as any record cannot be kept: the
    trim positions are unusable, the slice consists only of "-" or only
    of "?", or the slice is shorter than ``min_len``.
    """
    start, end = running_average(alignment, window_size, proportion, threshold)
    s1_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-?"))
    for sequence in alignment:
        sequence.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
        if not (start >= 0 and end):
            s1_trimmed = None
            break
        trim = sequence[start:end]
        symbols = set(trim)
        # Both comparisons must be set-to-set.  The previous code compared
        # against the *list* ['?'], which is never equal to a set, so
        # rows made up entirely of missing data were never rejected.
        only_gaps = symbols == set(['-'])
        only_missing = symbols == set(['?'])
        if only_gaps or only_missing or len(trim) < min_len:
            s1_trimmed = None
            break
        s1_trimmed.append(trim)
    return s1_trimmed
def genome_to_seqrecord(phage_genome):
    """Creates a SeqRecord object from a pdm_utils Genome object.

    The record's name, features, description and annotations are filled
    from the genome; its id is only set when the genome has a non-empty
    accession.

    :param phage_genome: A pdm_utils Genome object.
    :type phage_genome: Genome
    :returns: A BioPython SeqRecord object
    :rtype: SeqRecord
    :raises AssertionError: if ``phage_genome`` is None.
    :raises AttributeError: re-raised (after printing a hint) when the
        record cannot be built from ``phage_genome.seq``.
    """
    # Identity check: `!= None` would dispatch to a user-defined __ne__.
    assert phage_genome is not None,\
        "Genome object passed is None and not initialized"

    try:
        record = SeqRecord(phage_genome.seq)
        record.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
    except AttributeError:
        print("Genome object failed to be converted to SeqRecord.",
              "Genome valid attribute 'seq' is required to",
              "convert to SeqRecord object.")
        raise

    record.name = phage_genome.name
    if phage_genome.accession != "":
        record.id = phage_genome.accession
    record.features = get_seqrecord_features(phage_genome)
    record.description = get_seqrecord_description(phage_genome)
    record.annotations = get_seqrecord_annotations(phage_genome)
    return record
def find_gapped_columns(align, cfg):
    """Drop alignment columns whose gap proportion exceeds the cfg setting.

    Each column is evaluated on its own: the proportion of "N"/"n" and
    "-" characters among all sequences is compared with
    ``cfg["max_gap_proportion"]``, and columns strictly above that value
    are removed.

    Returns ``align`` itself, untouched, when no column is removed.
    Otherwise returns a *list* of new SeqRecord objects (IUPAC ambiguous
    DNA alphabet, same name and id, empty description) holding only the
    retained columns.
    """
    max_gap_proportion = cfg["max_gap_proportion"]
    nbr_sequences = len(align)
    len_align = align.get_alignment_length()

    # A set keeps the membership test below O(1); with a list the
    # filtering step was quadratic in the alignment length.
    columns_to_remove = set()
    for index in range(len_align):
        column = str(align[:, index]).replace("n", "N")
        gap_freq = (column.count("N") + column.count("-")) / nbr_sequences
        if gap_freq > max_gap_proportion:
            columns_to_remove.add(index)

    if not columns_to_remove:
        return align

    idxs = [x for x in range(len_align) if x not in columns_to_remove]
    trimmed_records = []
    for rec in align:
        residues = list(rec.seq)
        new_seq = "".join([residues[i] for i in idxs])
        trimmed_records.append(
            SeqRecord(
                Seq(new_seq, IUPAC.IUPACAmbiguousDNA()),
                name=rec.name,
                id=rec.id,
                description="",
            ))
    return trimmed_records
def main():
    """Generate mutated sequence fragments from a FASTA file.

    Parses the command line, then for each input record keeps drawing a
    mutation rate (Gaussian), a sub-sequence (via subselect_sequence) and
    a mutated copy of it (via mutate) until ``--num_fragments`` fragments
    exist in total, and finally writes all of them to ``--output`` as
    FASTA.

    The fragment list is shared across records, so once the target count
    is reached later records contribute nothing.
    """
    logging.basicConfig()
    parser = argparse.ArgumentParser()
    parser.add_argument('--fasta', dest='fasta_file', metavar='STRING',
                        required=True, type=str)
    parser.add_argument('--num_fragments', dest='num_fragments',
                        metavar='int', required=True, type=int)
    parser.add_argument('--mean_frag_size', dest='frag_size_mu',
                        metavar='int', required=True, type=int)
    parser.add_argument('--frag_size_std', dest='frag_size_sigma',
                        metavar='int', required=True, type=int)
    parser.add_argument('--mean_mutation_rate', dest='mutation_rate_mu',
                        metavar='float', required=True, type=float)
    parser.add_argument('--mutation_rate_std', dest='mutation_rate_sigma',
                        metavar='float', required=True, type=float)
    parser.add_argument('--output', dest='output_file', metavar='string',
                        required=True, type=str)
    args = parser.parse_args()

    generated_seqs = []
    # The context manager closes the output file even when parsing or
    # writing raises; the handle used to leak in that case.
    with open(args.output_file, 'w') as outhandle:
        for record in SeqIO.parse(args.fasta_file, 'fasta'):
            base_id = record.id
            base_seq = str(record.seq)
            while len(generated_seqs) < args.num_fragments:
                # NOTE(review): a failure that repeats on every attempt
                # makes this loop spin forever, printing the error each
                # time.  Left as is because retry-on-failure looks
                # deliberate; confirm.
                try:
                    mutation_rate = rnd.gauss(args.mutation_rate_mu,
                                              args.mutation_rate_sigma)
                    subsequence = subselect_sequence(base_seq,
                                                     args.frag_size_mu,
                                                     args.frag_size_sigma)
                    mutated_subsequence = mutate(subsequence, mutation_rate)
                    new_id = '%s__mut_%.2f__len_%i' % (base_id,
                                                       mutation_rate,
                                                       len(subsequence))
                    generated_seqs.append(
                        SeqRecord(Seq(mutated_subsequence,
                                      IUPAC.IUPACAmbiguousDNA()),
                                  id=new_id, name=new_id, description=''))
                except Exception as e:
                    print(e)
        SeqIO.write(generated_seqs, outhandle, 'fasta')
def get_sine_forward(sine_fname):
    """Return the one SINE sequence in a FASTA file, as a plain string.

    Only the orientation given in the file is returned.  The file must
    hold exactly one record; the unpacking below raises ValueError
    otherwise.
    """
    parsed = SeqIO.parse(sine_fname, "fasta",
                         alphabet=IUPAC.IUPACAmbiguousDNA())
    # TODO: If we return it as dumb string, why did we bother about the alphabet?
    # TODO: The reference SINEs do contain a couple ambiguous chars - N, Y.
    (only_record,) = parsed
    return str(only_record.seq)
def get_sines(sine_fname):
    """Yield every SINE in a FASTA file, then its reverse complement.

    Sequences are yielded as plain strings, in file order.  After both
    strings of a record have been consumed, the pair is also printed.
    """
    for record in SeqIO.parse(sine_fname, "fasta"):
        forward = Seq(str(record.seq), IUPAC.IUPACAmbiguousDNA())
        yield str(forward)
        reverse = forward.reverse_complement()
        yield str(reverse)
        print(forward, reverse, '''\n ======================''')
def set_primer_seqs(self, fwd_sequence, rev_sequence):
    """Set the primer sequences.

    Wrap both sequences as IUPAC ambiguous-DNA Seq objects, log them,
    and store them on the instance as a (forward, reverse) tuple.

    Parameters
    ----------
    fwd_sequence : string
        forward primer sequence - ambiguities allowed.

    rev_sequence : string
        reverse primer sequence - ambiguities allowed.
    """
    # Build both Seq objects before logging, so that a failing
    # construction leaves no log line behind.
    pair = tuple(
        Seq(raw, IUPAC.IUPACAmbiguousDNA())
        for raw in (fwd_sequence, rev_sequence)
    )
    for message in ("Setting foward primer to " + fwd_sequence,
                    "Setting reverse primer to " + rev_sequence):
        self.logger.info(message)
    self._primer_pair = pair
def gapCdsToProteins(proteinAlignment, extraDnaSeqs=None):
    """Thread DNA sequences onto a protein alignment, codon by codon.

    Intended to replace proteinToCodonAlignment().

    For every record of ``proteinAlignment`` the DNA sequence is fetched
    via patric_api.getSequenceOfFeatures (entries in ``extraDnaSeqs``
    override fetched ones with the same id).  Each gap "-" in the protein
    row becomes "---"; every other position consumes the next three DNA
    characters.  The result is written as FASTA text into a buffer and
    read back with AlignIO.read.

    Raises Exception when the set of DNA ids differs from the set of
    protein ids.
    """
    protSeqDict = {}
    for seqRecord in proteinAlignment:
        protSeqDict[seqRecord.id] = seqRecord
    dnaFasta = patric_api.getSequenceOfFeatures(protSeqDict.keys(), 'dna')
    #if Debug:
    #     LOG.write("dnaFasta sample: %s\n"%dnaFasta[:100])

    dnaSeqDict = SeqIO.to_dict(
        SeqIO.parse(StringIO(dnaFasta),
                    "fasta",
                    alphabet=IUPAC.IUPACAmbiguousDNA()))
    # Caller-supplied DNA takes precedence over what was fetched.
    for seqId in protSeqDict:
        if extraDnaSeqs and seqId in extraDnaSeqs:
            dnaSeqDict[seqId] = extraDnaSeqs[seqId]
            if Debug:
                LOG.write("appending extra DNA seq %s\n" % seqId)
    if set(dnaSeqDict.keys()) != set(protSeqDict.keys()):
        raise Exception(
            "Protein and DNA sets differ:\nProteins: %s\nDNA: %s\n" %
            (", ".join(sorted(protSeqDict)), ", ".join(sorted(dnaSeqDict))))
    dnaAlignFasta = StringIO()
    prot_align_len = proteinAlignment.get_alignment_length()
    for seqId in dnaSeqDict:
        dnaSeq = dnaSeqDict[seqId].seq
        if len(dnaSeq) < 3 * prot_align_len:
            # this is to handle cases where protein exists but DNA does not
            # NOTE(review): the pad is (prot_align_len - len(dnaSeq))
            # triplets, i.e. protein columns minus DNA *characters*; this
            # is empty whenever the DNA has at least prot_align_len
            # characters.  Confirm this is the intended amount.
            dnaSeq += '---' * (prot_align_len - len(dnaSeq))
        protSeq = protSeqDict[seqId].seq
        dnaAlignFasta.write(">" + seqId + "\n")
        dnaSeqPos = 0
        for protPos in range(0, len(protSeq)):
            if protSeq[protPos] == '-':
                codon = '---'
            else:
                # TODO: in future use a codon table to check correct matching
                codon = str(dnaSeq[dnaSeqPos:dnaSeqPos + 3])
                dnaSeqPos += 3
            dnaAlignFasta.write(codon)
        # Relies on the loop variable outliving the loop; it is unbound
        # (NameError) if protSeq is empty.
        protPos += 1  # should now be equal to prot_align_len
        if Debug:
            LOG.write(
                seqId +
                " protPos={0}, dnaSeqPos={1}, orig_DNA_len={2}, orig_prot_len={3}\n"
                .format(protPos, dnaSeqPos, len(dnaSeq), len(protSeq)))
        # Pad rows shorter than the alignment; this message is written
        # regardless of Debug.
        if protPos < prot_align_len:
            dnaAlignFasta.write(''.join("---" * (prot_align_len - protPos)))
            LOG.write(
                "padding short seq {0}, of {1} pos out to {2}, orig_DNA_len={3}, orig_prot_len={4}\n"
                .format(seqId, protPos, prot_align_len, len(dnaSeq),
                        len(protSeq)))
        dnaAlignFasta.write("\n")
    dnaAlignFasta_text = dnaAlignFasta.getvalue()
    retval = AlignIO.read(StringIO(dnaAlignFasta_text),
                          'fasta')
    return retval
def write_fasta(sequences, filename, field=""):
    """Write read-pair sequences to ``filename`` in FASTA format.

    ``field`` selects what is written for each entry of ``sequences``:

    * "original": the two raw reads (``seqs``), empty description;
    * "trimmed", or any entry whose ``merged_seq`` is None: the two
      trimmed reads (``trimmed_seqs``), described as "(unmerged)" when
      ``field`` is "merged" and as "(<field>)" otherwise;
    * "merged": the merged read, with the two names joined by ", " as
      id and "(merged)" as description.

    Entries matching none of these are skipped.
    """
    def _record(sequence, identifier, description):
        return SeqRecord(Seq(sequence, IUPAC.IUPACAmbiguousDNA()),
                         id=identifier,
                         description=description)

    records = []
    for sq in sequences:
        if field == "original":
            for k in (0, 1):
                records.append(_record(sq["seqs"][k], sq["names"][k], ""))
            continue
        if field == "trimmed" or sq["merged_seq"] is None:
            for k in (0, 1):
                label = "unmerged" if field == "merged" else field
                records.append(_record(sq["trimmed_seqs"][k],
                                       sq["names"][k],
                                       "({})".format(label)))
            continue
        if field == "merged":
            records.append(_record(sq["merged_seq"],
                                   ", ".join(sq["names"]),
                                   "(merged)"))
    SeqIO.write(records, filename, "fasta")
def to_seq_record(self):
    """Convert the Gene to a SeqRecord.

    Features, annotations and references are pulled from the related
    collections (``.all()`` on each) and converted with their own
    ``to_seq_feature`` / ``to_ann`` / ``to_ref`` methods.
    """
    # features, one SeqFeature per related feature object
    seq_features = []
    for feature in self.features.all():
        seq_features.append(feature.to_seq_feature())

    # each annotation writes itself into the shared dictionary
    annotations = {}
    for annotation in self.annotations.all():
        annotation.to_ann(annotations)
    references = []
    for reference in self.references.all():
        references.append(reference.to_ref())
    annotations['references'] = references

    sequence = Seq(self.sequence, IUPAC.IUPACAmbiguousDNA())
    return SeqRecord(seq=sequence,
                     name=self.name,
                     description=self.description,
                     features=seq_features,
                     annotations=annotations)
def translate(self, align, offset):
    """Translate an alignment in the reading frame selected by 'offset'.

    Each sequence is upper-cased, gaps "-" are replaced by "N", and the
    slice from ``offset`` up to the last complete codon is translated.
    Returns a new alignment of the translated records (same name and
    id, empty description).
    """
    # Largest multiple of three that fits after skipping `offset` columns.
    usable = align.get_alignment_length() - offset
    end = (usable // 3) * 3 + offset

    t_align = MulAlign([], Gapped(IUPAC.ExtendedIUPACProtein(), "N"))
    for rec in align:
        nucleotides = str(rec.seq).upper()
        nucleotides = nucleotides.replace("-", "N").replace("n", "N")
        in_frame = Seq(nucleotides, IUPAC.IUPACAmbiguousDNA())[offset:end]
        t_align.append(
            SeqRecord(in_frame.translate(),
                      name=rec.name,
                      id=rec.id,
                      description=""))
    return t_align
def test_reverse_complements(self):
    """Test double reverse complement preserves the sequence."""
    rna_letters = "".join(sorted(ambiguous_rna_values))
    dna_letters = "".join(sorted(ambiguous_dna_values))
    cases = [
        Seq.Seq(rna_letters),
        Seq.Seq(dna_letters),
        Seq.Seq(rna_letters, Alphabet.generic_rna),
        Seq.Seq(dna_letters, Alphabet.generic_dna),
        # "X" is stripped for the strict IUPAC alphabets
        Seq.Seq(rna_letters.replace("X", ""), IUPAC.IUPACAmbiguousRNA()),
        Seq.Seq(dna_letters.replace("X", ""), IUPAC.IUPACAmbiguousDNA()),
        Seq.Seq("AWGAARCKG"),  # Note no U or T
    ]
    for sequence in cases:
        round_trip = sequence.reverse_complement().reverse_complement()
        self.assertEqual(str(sequence), str(round_trip))
def cds_to_seqrecord(cds):
    """Create a SeqRecord for a single gene from a Cds object.

    The record wraps ``cds.seq`` (tagged as IUPAC ambiguous DNA), is
    named after ``cds.id``, and gets ``cds.locus_tag`` as its id when
    that is non-empty.  ``cds.set_seqfeature()`` is called so the record
    can carry ``cds.seqfeature`` as its only feature.

    :param cds: Cds object providing seq, id, locus_tag and seqfeature.
    :returns: The populated SeqRecord.
    :raises AttributeError: re-raised (after printing a hint) when the
        record cannot be built from ``cds.seq``.
    """
    try:
        record = SeqRecord(cds.seq)
        record.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
    except AttributeError:
        print("Genome object failed to be converted to SeqRecord\n."
              "Genome valid attribute 'seq' is required to "
              "convert to SeqRecord object.")
        # Without re-raising, execution fell through to the next line
        # and failed there on an unbound `record`, hiding the real cause.
        raise

    record.name = cds.id
    if cds.locus_tag != "":
        record.id = cds.locus_tag

    cds.set_seqfeature()
    record.features = [cds.seqfeature]
    record.description = f"Single gene {cds.id}"
    record.annotations = get_cds_seqrecord_annotations(cds)
    return record
def stage_one_trimming(self, alignment, window_size, proportion, threshold,
                       min_len, replace_ends=False):
    """
    First stage (of 3) alignment trimming to find and trim edges of a
    given alignment.  Calls running_average to determine the start and
    end trimming positions for the entire alignment block.

    Every record in ``alignment`` has its ``seq.alphabet`` reset to
    IUPAC ambiguous DNA as a side effect.

    When ``replace_ends`` is true, each trimmed sequence is passed
    through ``_replace_ends`` and rebuilt with ``_record_formatter``
    before being added.

    Returns the trimmed MultipleSeqAlignment, or None as soon as any
    record cannot be kept: the trim positions are unusable, the slice
    consists only of "-" or only of "?", or the slice is shorter than
    ``min_len``.
    """
    # get the trim positions that we determine begin and end "good"
    # alignments
    start, end = self.running_average(alignment, window_size, proportion,
                                      threshold)
    # create a new alignment object to hold our alignment
    s1_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-?"))
    for sequence in alignment:
        # ensure correct sequence alphabet or we'll get a conflict when
        # we try to generate a consensus
        sequence.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
        if not (start >= 0 and end):
            s1_trimmed = None
            break
        trim = sequence[start:end]
        symbols = set(trim)
        # ensure we don't just add a taxon with only gaps/missing data
        # and that alignments are >= min_len.  Both comparisons must be
        # set-to-set: the previous code compared against the *list*
        # ['?'], which never equals a set, so all-"?" rows slipped
        # through.
        only_gaps = symbols == set(['-'])
        only_missing = symbols == set(['?'])
        if only_gaps or only_missing or len(trim) < min_len:
            s1_trimmed = None
            break
        if not replace_ends:
            s1_trimmed.append(trim)
        else:
            # replace end gaps with missing data character ?
            # called on third iteration of trimming
            repl = self._replace_ends(str(trim.seq))
            s1_trimmed.append(self._record_formatter(repl, sequence.id))
    return s1_trimmed
def _extract_clusters(self, tag, qual='ugene_name'):
    """Extract annotated clusters whose qualifier matches ``tag``.

    Loads ``self.genomes_files`` into a SeqView and scans every feature
    of every record.  A feature is selected when it carries the
    qualifier ``qual`` and the space-joined qualifier values match the
    regular expression ``tag`` (``re.match``, i.e. anchored at the
    start).  The extracted sub-record takes the qualifier text as id and
    name and the parent record's description, and is passed to
    ``self._process_features``.

    Returns a dict mapping cluster id to extracted record; later
    matches with the same id overwrite earlier ones.
    """
    tagre = re.compile(tag)
    clusters = {}
    records = SeqView()
    records.load(self.genomes_files)
    for record in records:
        for f in record.features:
            if qual not in f.qualifiers:
                continue
            q = ' '.join(f.qualifiers[qual])
            if not tagre.match(q):
                continue
            c = f.extract(record)
            c.id = c.name = q
            c.description = record.description
            # Fall back to ambiguous DNA only when the alphabet is
            # neither a nucleotide nor a protein alphabet.  The previous
            # test (`is not A or is not B`, against the classes
            # themselves) was always true and overwrote every alphabet.
            if not isinstance(c.seq.alphabet,
                              (NucleotideAlphabet, ProteinAlphabet)):
                c.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
            self._process_features(c)
            clusters[c.id] = c
    return clusters
def save_record_fasta(record_df, save_to_dir):
    """Write one FASTA file per gene from a table of sequence records.

    Rows of ``record_df`` are grouped by the ``HGNC`` column.  For each
    gene ``g`` the rows are written to
    ``save_to_dir / g / "<g>_sequences.fasta"``, creating the gene
    directory when needed.  Each row becomes a SeqRecord built from its
    ``Sequence``, ``entrez_id``, ``HGNC`` and ``Description`` values.

    ``save_to_dir`` must support the ``/`` operator (presumably a
    pathlib.Path -- confirm against callers) and must already exist.

    Returns the records of the last gene processed, or an empty list if
    the table has no groups (this case used to fail on an unbound local
    variable).
    """
    records_grouped = record_df.groupby('HGNC')
    seq_list = []
    for g in list(records_grouped.groups.keys()):
        out_dir = save_to_dir / g
        # exist_ok avoids the check-then-create race of exists()/mkdir().
        out_dir.mkdir(exist_ok=True)
        out_file = out_dir / '{}_sequences.fasta'.format(g)

        seq_list = [
            SeqRecord(Seq(r.Sequence, IUPAC.IUPACAmbiguousDNA()),
                      id=r.entrez_id,
                      name=r.HGNC,
                      description=r.Description)
            for _, r in records_grouped.get_group(g).iterrows()
        ]
        SeqIO.write(seq_list, out_file, 'fasta')
    return seq_list
def ageSequence(rec, outfile, freq, end_length, seed, logger=None):
    """Append a randomly mutated copy of ``rec`` to ``outfile`` (FASTA).

    ``round(len(rec.seq) * freq)`` positions are drawn by shuffling all
    indexes with ``random`` seeded by ``seed``; each retained position is
    handed to substitute_base with the alphabet A/T/C/G.

    When ``end_length`` is given (not None and not 0), positions in
    ``range(end_length, seqlen - end_length)`` -- the middle of the
    sequence -- are protected, so only the two ends can change.

    Raises ValueError when ``end_length`` leaves no more than one base
    in the middle, and AssertionError when no logger is supplied or the
    mutated sequence changed length.
    """
    assert logger is not None, "must use logging"
    logger.info("frequncy of mutation: %f", freq)
    alph = ["A", "T", "C", "G"]
    seqlen = len(rec.seq)

    # `== 0` rather than `is 0`: identity against a literal is
    # implementation-defined and a SyntaxWarning on Python 3.8+.
    if end_length is None or end_length == 0:
        ignore_region = set()
    elif not seqlen - (2 * end_length) > 1:
        raise ValueError("Edge width cannot be greater than half the " +
                         "length of the sequence ")
    else:
        ignore_region = set(range(end_length, seqlen - end_length))

    newseqlist = list(rec.seq)
    seq_len = len(newseqlist)

    # Shuffle-then-slice (not random.sample) is kept deliberately so that
    # a given seed keeps selecting the same positions as before.
    random.seed(seed)
    idxs = list(range(0, seq_len))
    random.shuffle(idxs)
    subst_idxs = idxs[0:int(round(seq_len * freq))]

    # ignore the indexes in the regions we are leaving unchanged
    executed_subst_idxs = [x for x in subst_idxs if x not in ignore_region]
    for i in executed_subst_idxs:
        substitute_base(strlist=newseqlist, position=i, alph=alph)
    logger.info("Changed %d of %d bases", len(executed_subst_idxs), seq_len)

    newrec = SeqRecord(
        id=rec.id,
        # description="riboSim mutation frequency" + str(freq),
        seq=Seq("".join(newseqlist), IUPAC.IUPACAmbiguousDNA()))
    with open(outfile, "a") as o:
        SeqIO.write(newrec, o, "fasta")
    assert len(newseqlist) == len(rec.seq), \
        "something bad happened! unequal lengths of input and output sequences"
def search_sines2(sine_f, r1_f, to_check = {0,1,2}, step_print = 10000, nlines = 100000):
    """Print longest-common-substring statistics between SINEs and reads.

    SINE records whose position in ``sine_f`` (FASTA) is in ``to_check``
    are collected together with their reverse complements.  For each of
    them, every read from ``r1_f`` is compared with
    difflib.SequenceMatcher.find_longest_match, and the match length is
    tallied in a Counter.  The tally is printed every ``step_print``
    reads and at ``nlines`` reads, where scanning stops.

    Nothing is returned; all results go to stdout.

    NOTE(review): the default ``to_check`` is a shared mutable set; it is
    only read here.
    """
    sine_set = []
    # One tally shared by all SINEs: it is never reset between them.
    stats = collections.Counter()
    for (i,sine_record) in enumerate(SeqIO.parse(sine_f, "fasta")):
        if (i in to_check):
            cur_seq = Seq(str(sine_record.seq), IUPAC.IUPACAmbiguousDNA())
            cur_seq_rc = cur_seq.reverse_complement()
            sine_set.append(str(cur_seq))
            sine_set.append(str(cur_seq_rc))
            print(cur_seq, cur_seq_rc, '''\n ======================''')
    for sine in sine_set:
        # seq1 is fixed to the SINE; each read is set as seq2 below.
        matcher = difflib.SequenceMatcher(isjunk=None, a=sine)
        total = 0
        # cnt is printed below but never incremented in this function.
        cnt = 0
        start_time = time()
        print('''sequences for sine = ''')
        # NOTE(review): if r1_f is a one-shot iterator (e.g. an open
        # file), it is exhausted after the first SINE -- confirm.
        for cur_seq in r1_f:
            total += 1
            matcher.set_seq2(cur_seq)
            res = matcher.find_longest_match(0, len(sine), 0, len(cur_seq))
            # third field of the match result: the match length
            d = res[2]
            stats[d] += 1
            if (total % step_print == 0 or total == nlines):
                print('''distances for first''', total, '''segments \n''')
                print('''========================''')
                print('''time elapsed''', (time() - start_time)/60.0, '''minutes''')
                for k in sorted(stats):
                    print('longest common =', k, 'num matches =', stats[k], '''/''',cnt)
            if (total == nlines):
                break
def setUp(self):
    """Build the shared list of test sequences.

    Covers plain, generic and IUPAC alphabets for DNA and RNA, with and
    without ambiguity codes, plus two MutableSeq instances.
    """
    rna_letters = "".join(ambiguous_rna_values)
    dna_letters = "".join(ambiguous_dna_values)

    sequences = [
        Seq.Seq("TCAAAAGGATGCATCATG", IUPAC.unambiguous_dna),
        Seq.Seq("ATGAAACTG"),
        Seq.Seq("ATGAARCTG"),
        Seq.Seq("AWGAARCKG"),  # Note no U or T
    ]
    # every ambiguity letter, under progressively stricter alphabets
    sequences += [
        Seq.Seq(rna_letters),
        Seq.Seq(dna_letters),
        Seq.Seq(rna_letters, Alphabet.generic_rna),
        Seq.Seq(dna_letters, Alphabet.generic_dna),
        Seq.Seq(rna_letters, IUPAC.IUPACAmbiguousRNA()),
        Seq.Seq(dna_letters, IUPAC.IUPACAmbiguousDNA()),
    ]
    sequences += [
        Seq.Seq("AWGAARCKG", Alphabet.generic_dna),
        Seq.Seq("AUGAAACUG", Alphabet.generic_rna),
        Seq.Seq("ATGAAACTG", IUPAC.unambiguous_dna),
        Seq.Seq("ATGAAACTGWN", IUPAC.ambiguous_dna),
        Seq.Seq("AUGAAACUG", Alphabet.generic_rna),
        Seq.Seq("AUGAAACUG", IUPAC.unambiguous_rna),
        Seq.Seq("AUGAAACUGWN", IUPAC.ambiguous_rna),
        Seq.Seq("ATGAAACTG", Alphabet.generic_nucleotide),
        Seq.MutableSeq("ATGAAACTG", Alphabet.generic_dna),
        Seq.MutableSeq("AUGaaaCUG", IUPAC.unambiguous_rna),
    ]
    self.test_seqs = sequences
def concat_genome(input_dir, ext, outpath, verbose=False):
    """Concatenate the first record of several FASTA files into one entry.

    Files matching the glob pattern ``input_dir + ext`` are processed in
    sorted order.  The first sequence of each is appended to a growing
    string, which is finally written to ``outpath`` as a single FASTA
    record with id "concatenated_genome".

    Returns 0 on success, and 1 when no file matches or when writing
    raises (the exception is printed only if ``verbose``).
    """
    pattern = str(input_dir + ext)
    fastas = sorted(glob.glob(pattern))
    if not fastas:
        if verbose:
            print("No files found!")
        return 1

    if verbose:
        print(str("combining the following files matching extension " +
                  "{0}:{1}".format(ext, " ".join(fastas))))

    new_seq = ""
    for filen in fastas:
        print("Adding %s to combined sequence" % filen)
        with open(filen, 'r') as i_file:
            # only the first record of each file is used
            seq_rec = list(SeqIO.parse(i_file, 'fasta'))[0]
        new_seq = new_seq + str(seq_rec.seq)
        if verbose:
            print(str("Len of sequence:{0}\nLen of concatenated " +
                      "sequence:{1}").format(len(seq_rec), len(new_seq)))

    try:
        combined = SeqRecord(seq=Seq(new_seq, IUPAC.IUPACAmbiguousDNA()),
                             id="concatenated_genome")
        with open(outpath, 'w') as o_file:
            success = SeqIO.write(combined, o_file, 'fasta')
            if success:
                print("wrote out concatenated file!")
                return 0
    except Exception as e:
        if verbose:
            print(e)
        return 1
def cds_to_seqrecord(cds, parent_genome):
    """Creates a SeqRecord object from a Cds and its parent Genome.

    The record wraps ``cds.translation``, is named after ``cds.id``, and
    gets ``cds.locus_tag`` as its id when that is non-empty.
    ``cds.set_seqfeature()`` is called so the record can carry
    ``cds.seqfeature`` as its only feature.

    :param cds: A populated Cds object.
    :type cds: Cds
    :param parent_genome: Populated parent Genome object of the Cds object.
    :returns: Filled Biopython SeqRecord object.
    :rtype: SeqRecord
    """
    record = SeqRecord(cds.translation)
    # The record holds the CDS *translation*, i.e. amino acids, so it is
    # tagged with a protein alphabet; it was previously mislabelled as
    # ambiguous DNA.
    record.seq.alphabet = IUPAC.IUPACProtein()

    record.name = cds.id
    if cds.locus_tag != "":
        record.id = cds.locus_tag

    cds.set_seqfeature()
    record.features = [cds.seqfeature]
    record.description = (
        f"{cds.description} "
        f"[{parent_genome.host_genus} phage {cds.genome_id}]")
    record.annotations = get_cds_seqrecord_annotations(cds, parent_genome)
    return record
print "%s={%s} --> {%s}=%s" % \ (ambig_char, values, compl_values, ambiguous_rna_complement[ambig_char]) assert set(compl_values) == set( ambiguous_rna_values[ambiguous_rna_complement[ambig_char]]) print print "Reverse complements:" for sequence in [ Seq.Seq("".join(sorted(ambiguous_rna_values))), Seq.Seq("".join(sorted(ambiguous_dna_values))), Seq.Seq("".join(sorted(ambiguous_rna_values)), Alphabet.generic_rna), Seq.Seq("".join(sorted(ambiguous_dna_values)), Alphabet.generic_dna), Seq.Seq("".join(sorted(ambiguous_rna_values)).replace("X", ""), IUPAC.IUPACAmbiguousRNA()), Seq.Seq("".join(sorted(ambiguous_dna_values)).replace("X", ""), IUPAC.IUPACAmbiguousDNA()), Seq.Seq("AWGAARCKG") ]: # Note no U or T print "%s -> %s" \ % (repr(sequence), repr(Seq.reverse_complement(sequence))) assert str(sequence) \ == str(Seq.reverse_complement(Seq.reverse_complement(sequence))), \ "Dobule reverse complement didn't preserve the sequence!" print ########################################################################### test_seqs = [ s, t, u,
outputName = sys.argv[3] refDict = {} strainDict = {} chrListRef = [] #read in fasta #build dictionary with chr as key and sequence as value refGenome = open(refGenomeName, 'r') for seq_record in SeqIO.parse(refGenome, "fasta"): refDict[seq_record.id] = seq_record.seq idStr = str(seq_record.id) chrListRef.append(idStr) seqStr = str(seq_record.seq) strainDict[idStr] = MutableSeq(seqStr, IUPAC.IUPACAmbiguousDNA()) outIndelName = outputName + ".indel" outIndelFile = open(outIndelName, 'w') vcf = open(strainVCF, 'r') lines = vcf.readlines() counter = 0 for line in lines: currentLine = line.strip('\n') if re.match('^#', currentLine): #Looks for lines that start with # header = currentLine #print "this is just the header" else: SNPline = currentLine.split() #Split based on tabs chr = SNPline[0]
def proteinToCodonAlignment(proteinAlignment, extraDnaSeqs=None):
    """Build a codon alignment that mirrors a protein alignment.

    The DNA of every protein in ``proteinAlignment`` is fetched via
    patric_api.getSequenceOfFeatures (entries in ``extraDnaSeqs``
    override fetched ones with the same id), ordered like the protein
    rows, and handed to codonalign.build with ``max_score=1000``.
    Non-empty protein annotations are copied onto the matching codon
    record.

    Raises Exception when the set of DNA ids differs from the set of
    protein ids.  Zero-length DNA only produces a warning in LOG.
    """
    proteinsById = {}
    for proteinRecord in proteinAlignment:
        proteinsById[proteinRecord.id] = proteinRecord

    fetchedFasta = patric_api.getSequenceOfFeatures(proteinsById.keys(), 'dna')
    #if Debug:
    #     LOG.write("dnaFasta sample: %s\n"%dnaFasta[:100])
    dnaById = SeqIO.to_dict(
        SeqIO.parse(StringIO(fetchedFasta),
                    "fasta",
                    alphabet=IUPAC.IUPACAmbiguousDNA()))

    # caller-supplied DNA takes precedence over what was fetched
    if extraDnaSeqs:
        for seqId in proteinsById:
            if seqId in extraDnaSeqs:
                dnaById[seqId] = extraDnaSeqs[seqId]
                if Debug:
                    LOG.write("appending extra DNA seq %s\n" % seqId)

    if set(dnaById.keys()) != set(proteinsById.keys()):
        raise Exception(
            "Protein and DNA sets differ:\nProteins: %s\nDNA: %s\n" %
            (", ".join(sorted(proteinsById)), ", ".join(sorted(dnaById))))

    for seqId in dnaById:
        if len(dnaById[seqId].seq) == 0:
            #del(dnaSeqDict[seqId])
            LOG.write("warning: seqId %s length of dna was zero\n" % seqId)

    # DNA records in the same order as the protein rows
    orderedDna = [dnaById[proteinRecord.id]
                  for proteinRecord in proteinAlignment]
    if Debug:
        LOG.write("dna seqs has %d seqs\n" % (len(orderedDna)))
        #LOG.write("DNA seq ids: %s\n"%(", ".join(sorted(dnaSeqDict))))
        #LOG.write("pro seq ids: %s\n"%(", ".join(sorted(protSeqDict))))
        #LOG.write("first two aligned DNA seqs:\n")
        #SeqIO.write(dnaSeqRecords[:2], LOG, "fasta")
        #LOG.flush()

    # Disabled code kept for reference (was an inert string literal):
    # # now check length of protein vs dna sequences, extend dna if needed to make match in numbers of codons
    # for i, protRec in enumerate(proteinAlignment):
    #     protSeq = str(protRec.seq)
    #     protSeq.replace('-','')
    #     protLen = len(protSeq)
    #     if len(dnaSeqs[i].seq) < protLen*3:
    #         shortfall = (protLen*3) - len(dnaSeqs[i].seq)
    #         if Debug:
    #             LOG.write("DNA seq for %s is too short for protein, shortfall = %d\n"%(protRec.id, shortfall))
    #         # extend on both ends to be safe
    #         dnaSeqs[i].seq = "N"*shortfall + dnaSeqs[i].seq + "N"*shortfall

    #with warnings.catch_warnings():
        #warnings.simplefilter('ignore', BiopythonWarning)
        #try:
    #ambiguous_nucleotide_values = {'K': 'GT', 'M': 'AC', 'N': 'ACGT', 'S': 'CG', 'R': 'AG', 'W': 'AT', 'Y': 'CT'}
    #ambiguous_protein_values = {'X': 'ACDEFGHIKLMNOPQRSTVWY', 'J': 'IL', 'B': 'DN', 'Z': 'EQ'}
    #ambiguous_codon_table = CodonTable.AmbiguousCodonTable(CodonTable.ambiguous_dna_by_name["Standard"], IUPAC.IUPACAmbiguousDNA(), ambiguous_nucleotide_values, IUPAC.protein, ambiguous_protein_values)
    #returnValue = codonalign.build(pro_align=proteinAlignment, nucl_seqs=dnaSeqRecords, codon_table=ambiguous_codon_table, max_score=1000)
    codonAlignment = codonalign.build(pro_align=proteinAlignment,
                                      nucl_seqs=orderedDna,
                                      max_score=1000)
    for codonRecord in codonAlignment:
        sourceAnnotations = proteinsById[codonRecord.id].annotations
        if sourceAnnotations:
            codonRecord.annotations = sourceAnnotations.copy()
    #except Exception as e:
    #    LOG.write("problem in codonalign, skipping\n%s\n"%str(e))
    #    raise(e)
    return codonAlignment
print "RNA Ambiguity mapping:", sorted_dict(ambiguous_rna_values) print "RNA Complement mapping:", sorted_dict(ambiguous_rna_complement) for ambig_char, values in sorted(ambiguous_rna_values.iteritems()): compl_values = complement(values).replace("T","U") # need to help as no alphabet print("%s={%s} --> {%s}=%s" % \ (ambig_char, values, compl_values, ambiguous_rna_complement[ambig_char])) assert set(compl_values) == set(ambiguous_rna_values[ambiguous_rna_complement[ambig_char]]) print print("Reverse complements:") for sequence in [Seq.Seq("".join(sorted(ambiguous_rna_values))), Seq.Seq("".join(sorted(ambiguous_dna_values))), Seq.Seq("".join(sorted(ambiguous_rna_values)), Alphabet.generic_rna), Seq.Seq("".join(sorted(ambiguous_dna_values)), Alphabet.generic_dna), Seq.Seq("".join(sorted(ambiguous_rna_values)).replace("X",""), IUPAC.IUPACAmbiguousRNA()), Seq.Seq("".join(sorted(ambiguous_dna_values)).replace("X",""), IUPAC.IUPACAmbiguousDNA()), Seq.Seq("AWGAARCKG")]: # Note no U or T print("%s -> %s" \ % (repr(sequence), repr(Seq.reverse_complement(sequence)))) assert str(sequence) \ == str(Seq.reverse_complement(Seq.reverse_complement(sequence))), \ "Dobule reverse complement didn't preserve the sequence!" print ########################################################################### test_seqs = [s,t,u, Seq.Seq("ATGAAACTG"), "ATGAAACtg", #TODO - Fix ambiguous translation #Seq.Seq("ATGAARCTG"),
) from Bio.Data.CodonTable import TranslationError, standard_dna_table from Bio.Seq import MutableSeq test_seqs = [ Seq.Seq("TCAAAAGGATGCATCATG", IUPAC.unambiguous_dna), Seq.Seq("T", IUPAC.ambiguous_dna), Seq.Seq("ATGAAACTG"), Seq.Seq("ATGAARCTG"), Seq.Seq("AWGAARCKG"), # Note no U or T Seq.Seq("".join(ambiguous_rna_values)), Seq.Seq("".join(ambiguous_dna_values)), Seq.Seq("".join(ambiguous_rna_values), Alphabet.generic_rna), Seq.Seq("".join(ambiguous_dna_values), Alphabet.generic_dna), Seq.Seq("".join(ambiguous_rna_values), IUPAC.IUPACAmbiguousRNA()), Seq.Seq("".join(ambiguous_dna_values), IUPAC.IUPACAmbiguousDNA()), Seq.Seq("AWGAARCKG", Alphabet.generic_dna), Seq.Seq("AUGAAACUG", Alphabet.generic_rna), Seq.Seq("ATGAAACTG", IUPAC.unambiguous_dna), Seq.Seq("ATGAAA-CTG", Alphabet.generic_dna), Seq.Seq("ATGAAACTGWN", IUPAC.ambiguous_dna), Seq.Seq("AUGAAACUG", Alphabet.generic_rna), Seq.Seq("AUGAAA==CUG", Alphabet.generic_rna), Seq.Seq("AUGAAACUG", IUPAC.unambiguous_rna), Seq.Seq("AUGAAACUGWN", IUPAC.ambiguous_rna), Seq.Seq("ATGAAACTG", Alphabet.generic_nucleotide), Seq.Seq("AUGAAACTG", Alphabet.generic_nucleotide), # U and T Seq.MutableSeq("ATGAAACTG", Alphabet.generic_dna), Seq.MutableSeq("AUGaaaCUG", IUPAC.unambiguous_rna), Seq.Seq("ACTGTCGTCT", Alphabet.generic_protein), ]
def get_biop_motif(self, cluster_num, motif_num, option='sites'):
    """Export the specified motif to a Biopython motif object.

    Parameters:
    - cluster_num: bicluster number
    - motif_num: motif number
    - option: how to build the motif
        - 'sites': write the motif sites (with lower-cased flanks) as
          FASTA and read them back as a JASPAR 'sites' file
        - 'pfm': scale the PSSM rows by the number of sites, round, and
          read the result back as a JASPAR 'pfm' file

    Returns the object produced by ``motifs.read``.

    Raises ValueError for any other ``option`` (this used to surface as
    an UnboundLocalError at the return statement).
    """
    rowid = self.__get_motif_id(cluster_num, motif_num)
    site_table = self.tables['meme_motif_sites']
    mot_sites = site_table[site_table.motif_info_id == rowid]

    output = StringIO()

    # An earlier approach -- emitting an incomplete MEME file and parsing
    # it with motifs.read(output, 'meme') -- was abandoned because
    # Bio.motifs cannot parse such a file.

    if option == 'pfm':
        ## create a jaspar 'pfm' file from the pssm
        motif_pssm_rows = self.tables['motif_pssm_rows']
        pssm = motif_pssm_rows[(motif_pssm_rows.iteration == self.iteration) &
                               (motif_pssm_rows.motif_info_id == rowid)]
        # axis is passed by keyword: the positional form was removed in
        # pandas 2.0.
        pssm = pssm.drop(['motif_info_id', 'iteration', 'row'], axis=1)
        counts = np.round(pssm * mot_sites.shape[0]).transpose()
        counts.to_string(output, header=False, index=False)
        output.seek(0)
        mot = motifs.read(output, 'pfm')

    elif option == 'sites':
        ## create a jaspar 'sites' file
        seqs = {}
        for i in mot_sites.index.values:
            # .loc replaces the removed .ix indexer; `i` is an index
            # label, so label-based lookup is the equivalent.
            site = mot_sites.loc[i]
            name = site.seq_name
            flank_left = site.flank_left
            flank_left = Seq(flank_left if flank_left is not None else "",
                             IUPAC.IUPACAmbiguousDNA()).lower()
            seq = Seq(site.seq, IUPAC.IUPACAmbiguousDNA())
            flank_right = site.flank_right
            flank_right = Seq(flank_right if flank_right is not None else "",
                              IUPAC.IUPACAmbiguousDNA()).lower()
            full_seq = flank_left + seq + flank_right
            seqs[i] = SeqRecord(full_seq, id=name)

        SeqIO.write(seqs.values(), output, 'fasta')
        output.seek(0)
        mot = motifs.read(output, 'sites')

    else:
        output.close()
        raise ValueError(
            "option must be 'sites' or 'pfm', got %r" % (option,))

    output.close()
    ## Note Bio.motifs.weblogo() uses the weblogo server (slow? requires connection.)
    #kwargs = dict(color_scheme='classic')
    #mot.weblogo('file.png', color_scheme='color_classic') ## note, can use format='PDF'
    #img = mpimg.imread('file.png')
    #imgplot = plt.imshow( img )
    #plt.show()
    return mot
def search_sines(sine_f, r1_f, override = 0, upper_mut_dist = 30, step_print = 10000, nlines = 500000, sine_l = 80):
    """Fuzzy-search reads for SINE prefixes and print match statistics.

    The first ``sine_l`` characters of every SINE in ``sine_f`` (FASTA)
    and their reverse complements are joined with "|" into one pattern,
    compiled with ``tre`` and searched in each read of ``r1_f`` with at
    most ``upper_mut_dist`` errors.

    ``override`` swaps the pattern for a control:
      * 1  -- a random sequence of length ``sine_l`` and a second one
              built from the mirrored base indexes (3 - index);
      * >1 -- prefixes taken from the first three reads of ``r1_f``
              (for values above 2, after first consuming
              ``override - 1`` + 1 reads from ``r1_f``).

    Results are published through the module-level globals
    ``bar_codes``, ``detailed_stats`` and
    ``distances_from_combined_regexp``, which are reset on every call;
    progress is printed every ``step_print`` reads and at ``nlines``
    reads, where scanning stops.  Nothing is returned.
    """
    print ('override =',override)
    sine_set = []
    stats = collections.Counter()
    # Module-level result holders, rebound (reset) on every call.
    global bar_codes
    bar_codes = {}
    global detailed_stats
    detailed_stats = collections.Counter()
    global distances_from_combined_regexp
    distances_from_combined_regexp = {}
    # Not used anywhere below in this function.
    matcher = difflib.SequenceMatcher()
    for sine_record in SeqIO.parse(sine_f, "fasta"):
        cur_seq = Seq(str(sine_record.seq)[:sine_l], IUPAC.IUPACAmbiguousDNA())
        cur_seq_rc = cur_seq.reverse_complement()
        sine_set.append(str(cur_seq))
        sine_set.append(str(cur_seq_rc))
        print(cur_seq, cur_seq_rc, '''\n ======================''')
    complete_regexp = '''|'''.join(sine_set)
    p = tre.compile(complete_regexp, tre.EXTENDED)
    if override == 1:
        bases = ['A','C','G','T']
        ind_list = [random.randrange(4) for i in range(sine_l)]
        r_sine = ''.join( [bases[ind_list[i]] for i in range(sine_l)] )
        # 3 - index maps A<->T and C<->G; the order is not reversed.
        r_sine_rc = ''.join( [bases[3-ind_list[i]] for i in range(sine_l)] )
        sine_set = [r_sine, r_sine_rc]
        complete_regexp = '''|'''.join(sine_set)
        p = tre.compile(complete_regexp, tre.EXTENDED)
    # Also specifies the shift range
    if override > 1:
        if override > 2:
            d = override - 1 #random.randrange(2, override)
            print('skipping ',d)
            # Consumes reads from r1_f up to and including index d.
            for (i,cur_seq) in enumerate(r1_f):
                if i == d:
                    break
        sine_set = []
        # Use the next three reads as the pattern source.
        for (i,s) in enumerate(r1_f):
            cur_seq = Seq(s[:sine_l], IUPAC.IUPACAmbiguousDNA())
            cur_seq_rc = cur_seq.reverse_complement()
            sine_set.append(str(cur_seq))
            sine_set.append(str(cur_seq_rc))
            if i == 2:
                break
        complete_regexp = '''|'''.join(sine_set)
        p = tre.compile(complete_regexp, tre.EXTENDED)
    total = 0
    cnt = 0
    start_time = time()
    print('''sequences = ''')
    # Not used below: the bar-code window is the literal 40 instead.
    bar_code_len = 60
    for cur_seq in r1_f:
        total += 1
        m = p.search(cur_seq, tre.Fuzzyness(maxerr = upper_mut_dist))
        if m:
            res = m.group(0)
            d = m.cost
            # Filter out strings that were cut out. Approximate by max-length matches
            # 10 is arbitrary, not very small
            if (m.groups()[0][1] < len(cur_seq) - 10) and (m.groups()[0][0] > 40):
                # print(m.groups(), len(cur_seq))
                cnt += 1
                stats[d] += 1
                # the 40 characters immediately before the match start
                bar_code = cur_seq[m.groups()[0][0] - 40 : m.groups()[0][0]]
                if bar_code in bar_codes:
                    bar_codes[bar_code] += 1
                else:
                    bar_codes[bar_code] = 1
                detailed_stats[res] += 1
                distances_from_combined_regexp[res] = d
        if (total % step_print == 0 or total == nlines):
            print('''distances for first''', total, '''segments \n''')
            print('''========================''')
            print('''time elapsed''', (time() - start_time)/60.0, '''minutes''')
            for k in sorted(stats):
                print('edit distance =', k, 'matches =', stats[k], '''/''',cnt)
        if (total == nlines):
            break
def assemble_alleles(fasta, cfg, **kargs):
    """
    Main function for STEP7: Allele fragments are assembled in order to
    recover the full length transcript.

    fasta   path of the fasta alignment to process.
    cfg     mapping with the settings described below, plus
            "input_format", "output_suffix", "output_folder",
            "taxa_without_alleles" and "dna_model".
    kargs   accepted for compatibility, not used.

    interp.
        overlap_threshold (int)
            Parameter that plays two roles in the analysis:
            1. in order to investigate two fragments from the same origin
               they cannot overlap by more than this value.
            2. in order to assemble two fragments f0, f1, there must be a
               fragment t from a different taxon that overlaps each of the
               two sequences by overlap_threshold / 2. f0 and f1 are
               assembled if they are both in the overlap region among the
               closest sequences from t.
        kmer (int)
            The number of non ambiguous consecutive bases to mark the
            beginning and end of a sequence. Must be below 50.
        overlap_proportion (float)
            When testing for homology of two non (low) overlapping
            sequences f0, f1 from taxon t0, we search the set of sequences
            from the other taxa that largely overlap both f0 and f1. The
            number of taxa to be searched is
            (number seq that overlap) * overlap_proportion
        max_alleles_distance (float)
            indicates the max distance allowed between 2 alleles.
        mistmatch_cost (int)
            Penalty cost for mismatch in the overlapping flanking regions
            of putative alleles.
        overlap_cost (int)
            Penalty cost for overlap in the flanking regions of putative
            alleles.

    @returns: tuple (info message, detail message). When
              'overlap_threshold' is not smaller than the alignment
              length, (error message, None) is returned instead.
    @raises:  ValueError for an unrealistic 'kmer' or an unknown
              'dna_model'; Exception when the alignment cannot be read.

    The resulting alignment is written to
    <basename><output_suffix>.fasta and moved into the output folder
    unless a file of that name is already there.
    """
    in_format = cfg["input_format"]
    out_suffix = cfg["output_suffix"]
    out_folder = cfg["output_folder"]
    taxa_no_allele = cfg["taxa_without_alleles"] or []
    kmer = cfg["kmer"]
    overlap_ratio = cfg["overlap_proportion"]
    overlap_threshold = cfg["overlap_threshold"]
    max_allele_dist = cfg["max_alleles_distance"]
    overlap_cost = cfg["overlap_cost"]
    mistmatch_cost = cfg["mistmatch_cost"]
    basename = fasta.split(in_format)[0]

    # Explicit checks instead of `assert`: asserts vanish under `python -O`.
    if not kmer < 50:
        raise ValueError("'kmer' > 50, unrealistic parameter value")
    if "dna_model" not in cfg or cfg["dna_model"] not in (0, 1, 2, 3, 4, 5):
        raise ValueError(
            "Wrong parameters: 'dna_model' must be in [0, 1, 2, 3, 4, 5]")

    try:
        align = AlignIO.read(fasta, "fasta")
    except Exception as err:
        message = ("[Error] Problem opening fasta alignment {}\n"
                   " Are all sequences the same length?").format(fasta)
        raise Exception(message) from err

    align_length = len(align)
    if align_length < 3:
        s_i = "STEP7 Done with fasta: {}".format(fasta)
        s_d = ("No transformation performed on fasta {0} \n"
               " which only contains {1} sequences").format(
                   fasta, align_length)
        return (s_i, s_d)

    length = align.get_alignment_length()
    if not overlap_threshold < length:
        exception = ("[Error]: for fasta {} 'overlap_threshold'"
                     " exceeds the length of the alignment").format(fasta)
        return (exception, None)

    def _write_output(records):
        # Write the records next to the input and move the file into the
        # output folder unless a file of that name already exists there.
        out_name = basename + out_suffix + ".fasta"
        SeqIO.write(records, out_name, "fasta")
        # shutil.move() drops the file under its base name, so that is the
        # path that has to be tested.
        destination = os.path.join(out_folder, os.path.basename(out_name))
        if not os.path.isfile(destination):
            shutil.move(out_name, out_folder)

    # Calculate distance matrix
    calculate_distance_matrix(fasta, {
        "input_suffix": in_format,
        "dna_model": cfg["dna_model"],
    })
    distance_matrix = basename + "_for_dnadist.fasta.distmat"
    dist = distance_matrix_parser(distance_matrix)
    dist_dict = {tuple(sorted([x[0], x[1]])): x[2]["distance"] for x in dist}
    # Names that appear in the distance matrix (only used for membership).
    non_empty_seqs = set(chain.from_iterable(dist_dict))

    # Find records with alleles that have not been excluded in the settings
    rec_alleles = [
        rec for rec in align
        if rec.name.split("|")[0] not in taxa_no_allele
        and rec.name in non_empty_seqs
    ]
    align_dict = {
        rec.name: str(rec.seq) for rec in align if rec.name in non_empty_seqs
    }
    taxa = {rec.name: str(rec.seq) for rec in rec_alleles}

    # For each sequence we recode 0 for '-', 1 otherwise,
    # into a dictionary {name_number: profile}
    taxa_profiles = sequence_profiler(taxa)

    # Find beginning and end for each sequence.
    # If no beginning/end is found do not include the sequence.
    start_end_dict = {}
    for rec, sequence in taxa.items():
        start_end = find_start_end(sequence, kmer)
        if start_end is not None:
            start_end_dict[rec] = start_end

    # Nbr overlapping sequences used for calculation
    overlapping_seq = int(overlap_ratio * len(taxa))

    # Dictionary that contains the taxa (i.e. the transcriptomes) as keys
    # and the names of all sequences corresponding to the key as value.
    taxa_contigs = {name.split("|")[0]: [] for name in taxa}
    for rec in taxa:
        if rec in start_end_dict:
            taxa_contigs[rec.split("|")[0]].append(rec)

    # Dict with the sequences as keys and as values the list of all the
    # sequences overlapping them. Used for finding the seqs from
    # a different taxon that overlap the sequences from a given taxon.
    dict_overlap_seq = {
        seq: [
            x for x in taxa
            if x != seq and get_overlap_score(
                tuple(sorted([x, seq])), taxa_profiles) > overlap_threshold
        ]
        for seq in taxa
    }

    # Obtain a dictionary that keeps for each allele pair the longest and
    # for the sequences that are not paired assigns the value 'None'.
    # Every taxon is handled once; taxa left without any usable contig
    # are skipped.
    alleles_dict = {}
    para_dict = {}
    for contigs in taxa_contigs.values():
        if len(contigs) > 1:
            allele_pairs, paralogs = assertain_allele(
                contigs,
                dist_dict,
                taxa_profiles,
                dict_overlap_seq,
                max_allele_dist,
                overlapping_seq,
            )
            para_dict.update(paralogs)
            for k, v in allele_pairs:
                alleles_dict[k] = v
        elif contigs:
            alleles_dict[contigs[0]] = None

    # Obtain a new dictionary of taxon as key and sequences without alleles
    # as values. These are the sequences that are not alleles so they could
    # be assembled into full length transcripts.
    taxa_contigs_no_alleles = {
        taxon: [seq for seq in contigs if seq in alleles_dict]
        for taxon, contigs in taxa_contigs.items()
    }

    out_seqs = []
    used_seq_in_out = []

    def _keep_unassembled(name):
        # Output a sequence that takes part in no assembly: either its
        # allele pair or, lacking one, the sequence itself.
        alleles = alleles_dict[name]
        if alleles and len(alleles) > 1:
            out_seqs.extend(
                (x, align_dict[x].upper().replace("N", "-")) for x in alleles)
            used_seq_in_out.extend(alleles)
        else:
            out_seqs.append(
                (name, align_dict[name].upper().replace("N", "-")))
            used_seq_in_out.append(name)

    # Dict that for each taxon finds the seq pairs with overlap below the
    # threshold and that have a common sequence that overlaps both
    # sequences as the best match as obtained from the distance analysis.
    taxa_with_pair = {}
    for taxon, seqs in taxa_contigs_no_alleles.items():
        # Case 1. there is only one sequence, so no pair can be inferred.
        if len(seqs) == 1:
            _keep_unassembled(seqs[0])
            continue

        # Case 2. there are several sequences, try to find ways to
        # concatenate them.
        pairs = []
        for p in combinations(seqs, 2):
            overlap = get_overlap_score(p, taxa_profiles)
            start_0, end_0 = start_end_dict[p[0]]
            start_1, end_1 = start_end_dict[p[1]]
            s_p = sorted([(p[0], start_0), (p[1], start_1)],
                         key=lambda x: x[1])
            sorted_pair = (s_p[0], s_p[1])
            if not overlap:
                seq_coverage = (end_0 - start_0) + (end_1 - start_1)
            else:
                # Span from the leftmost start to the rightmost end.
                seq_coverage = max(end_0, end_1) - min(start_0, start_1)
            # If there is little overlap between two seqs of the same taxon
            # and they share the closest overlapping sequence, create
            # an edge between the two sequences.
            if overlap <= overlap_threshold and closest_match(
                    p,
                    taxon,
                    align_dict,
                    taxa_profiles,
                    dist_dict,
                    overlap_threshold,
                    overlap_ratio,
                    alleles_dict,
            ):
                pairs.append((
                    sorted_pair,
                    {
                        "overlap": overlap,
                        "sequence_coverage": seq_coverage,
                        "start": min(start_0, start_1),
                    },
                ))
        if pairs:
            taxa_with_pair[taxon] = sorted(pairs, key=lambda x: x[1]["start"])
        else:
            # No pair could be calculated
            for item in seqs:
                _keep_unassembled(item)

    # Case there are no sequences that can be paired:
    # Return the original alignment.
    if not taxa_with_pair:
        _write_output([rec for rec in align if rec.name in non_empty_seqs])
        s_i = "STEP7 Done with fasta: {}".format(fasta)
        s_d = "No full size allele could be reconstructed for fasta: {}".format(
            fasta)
        return (s_i, s_d)

    def _terminal_nodes(graph):
        # Nodes without incoming edges and nodes without outgoing edges.
        sources = [k for k, v in graph.in_degree(graph.nodes()) if v == 0]
        sinks = [k for k, v in graph.out_degree(graph.nodes()) if v == 0]
        return sources, sinks

    def _scored_paths(graph, sources, sinks):
        # Map every simple source -> sink path (as a tuple) to its score:
        # path length minus the weighted penalties.
        # NOTE(review): element 0 of count_mismatches() is weighted with
        # overlap_cost and element 1 with mistmatch_cost, as in the
        # original -- confirm the order against count_mismatches.
        scores = {}
        for source, sink in product(sources, sinks):
            for path in nx.all_simple_paths(graph, source=source,
                                            target=sink):
                path_length = get_path_length(path, start_end_dict)
                penalties = count_mismatches(path, align)
                scores[tuple(path)] = (path_length
                                       - overlap_cost * penalties[0]
                                       - mistmatch_cost * penalties[1])
        return scores

    def _remove_node(graph, path_scores, node):
        # Drop a node and every stored path running through it.
        graph.remove_node(node)
        for stale in [path for path in path_scores if node in path]:
            del path_scores[stale]

    # Case some sequences are paired:
    # 1. Generate all possible paths between nodes that have indegrees of
    #    zero and nodes with outdegree of zero.
    # 2. The paths are scored according to their length in term of number
    #    of nucleotides minus the number of weighted mismatches, minus
    #    the number of weighted overlap.
    # 3. The best path is selected.
    # 4. Then new paths are recalculated by removing the selected terminal
    #    nodes and recalculating the paths. Adding them to the path pool
    #    for selection.
    # 5. Continue until all nodes have been used at least once.
    for edges in taxa_with_pair.values():
        G = nx.DiGraph()
        # Nodes are (sequence name, start) tuples.
        G.add_edges_from((pair[0], pair[1], attrs) for pair, attrs in edges)

        # Initialize the search procedure with the first path
        in_nodes, out_nodes = _terminal_nodes(G)
        possible_path_dict = _scored_paths(G, in_nodes, out_nodes)
        # sorted()[-1] rather than max(): on ties the last path wins.
        best_path = sorted(possible_path_dict.items(),
                           key=lambda x: x[1])[-1]
        all_best_paths = [best_path]
        used_seqs = set()

        # Remove the sequences of the best path from the graph only
        # if there is a paralog sequence that is overlapping them.
        for node in best_path[0]:
            used_seqs.add(node[0])
            if node[0] in para_dict:
                _remove_node(G, possible_path_dict, node)

        remaining_seqs = list(G.nodes())
        while len(remaining_seqs) > 1:
            # Retrieve 5' and 3' sequences.
            in_nodes, out_nodes = _terminal_nodes(G)
            if in_nodes == out_nodes:
                break
            # Find all paths that originate in one in_node and end in one
            # out_node
            possible_path_dict.update(_scored_paths(G, in_nodes, out_nodes))
            # Select the path with the best score:
            # if the putative path is entirely included in a previously
            # selected path, it is rejected and the loop ends
            candidates = sorted(possible_path_dict.items(),
                                key=lambda x: x[1], reverse=True)
            if not candidates:
                break
            candidate = candidates[0]
            if any(set(candidate[0]) <= set(previous[0])
                   for previous in all_best_paths):
                break
            best_path = candidate
            all_best_paths.append(best_path)

            path_nodes = list(best_path[0])
            used_seqs.update(node[0] for node in path_nodes)
            # 1- Remove all sequences in the best path that have a paralog
            # 2- Remove all sequences in the path if only made out of
            #    sequences lacking a paralog
            with_paralog = [n for n in path_nodes if n[0] in para_dict]
            for node in (with_paralog or path_nodes):
                _remove_node(G, possible_path_dict, node)

            # A list, not the live NodeView: the view cannot be indexed
            # by position below.
            remaining_seqs = list(G.nodes())

        if remaining_seqs and remaining_seqs[0][0] not in used_seqs:
            all_best_paths.append(((remaining_seqs[0], ), None))

        all_best_paths_seq = [[node[0] for node in best[0]]
                              for best in all_best_paths]
        final_paths = []
        for path in all_best_paths_seq:
            final_paths.append(path)
            alleles_path = []
            # Switch that ensures that there is at least one seq with
            # alleles
            switch = False
            for p in path:
                if alleles_dict[p] is None and len(path) > 1:
                    alleles_path.append(p)
                elif alleles_dict[p] is not None:
                    alleles_path.extend(
                        x for x in alleles_dict[p] if x != p)
                    switch = True
            if switch:
                final_paths.append(alleles_path)
        for path in final_paths:
            used_seq_in_out.extend(path)
            out_seqs.append(make_consensus(path, align))

    # Add sequences that have not been included in any path
    already_out = {x[0] for x in out_seqs}
    for item in set(align_dict).difference(used_seq_in_out):
        if item not in already_out:
            # NOTE(review): unlike the other outputs these sequences are
            # not upper-cased before replacing "N" -- confirm intended.
            out_seqs.append((item, align_dict[item].replace("N", "-")))
            already_out.add(item)

    final_records = []
    for old_name, seq in sorted(set(out_seqs), key=lambda x: x[0]):
        name = rename(old_name)
        final_records.append(
            SeqRecord(Seq(seq, IUPAC.IUPACAmbiguousDNA()),
                      name=name,
                      id=name,
                      description=""))
    _write_output(final_records)
    s_i = "STEP7 Done with fasta: {}".format(fasta)
    s_d = "Alignment size was reduced from {0} to {1} sequences".format(
        len(align), len(final_records))
    return (s_i, s_d)