def standardCassette(PromoterName, TerminatorName, orfName, orfSeq):
    """Build the three records that make up a standard expression cassette.

    The promoter record is requested from ``fetchNeighbor`` as the 600
    "upstream" positions of the gene named ``PromoterName``; the terminator
    record as the 250 "downstream" positions of the gene named
    ``TerminatorName``.  The ORF record is built directly from ``orfSeq``.

    :param PromoterName: gene name passed to ``fetchGene`` for the promoter.
    :param TerminatorName: gene name passed to ``fetchGene`` for the terminator.
    :param orfName: id assigned to the ORF record.
    :param orfSeq: ORF sequence as a string.
    :return: tuple ``(PromoterRec, orfRecord, TerminatorRec)``; the promoter
        id gets the suffix ``"ps"`` and the terminator id the suffix ``"ts"``.
    """
    # first, the promoter
    print(
        "I'm going to build a standard cassette in which promoter is 600nt, terminator 250nt."
    )
    print("First, which PROMOTER do you want to use, e.g., TDH3")
    PromoterGeneRec = fetchGene(PromoterName)
    PromoterRec = fetchNeighbor(PromoterGeneRec, "upstream", 600)
    PromoterRec.id = PromoterRec.id + "ps"

    # second, the terminator
    print("Which TERMINATOR do you want to use, e.g., ADH1")
    TerminatorGeneRec = fetchGene(TerminatorName)
    TerminatorRec = fetchNeighbor(TerminatorGeneRec, "downstream", 250)
    TerminatorRec.id = TerminatorRec.id + "ts"

    # and last, the gene
    print("What is the name of your gene, e.g., KlGapDH")
    print("What's the sequence")
    orfRecord = SeqRecord(Seq(orfSeq, SingleLetterAlphabet()), id=orfName)

    return PromoterRec, orfRecord, TerminatorRec
def editExisting(name, option, promoter=None, terminator=None, NewGeneName="", NewGeneSeq=""):
    """Assemble the fragments for editing an existing locus and stitch them.

    The homology arms are the ``HomologyLength`` positions that
    ``fetchNeighbor`` returns upstream and downstream of the gene ``name``.

    :param name: name of the existing gene, passed to ``fetchGene``.
    :param option: which edit to build:
        1 - arms only (nothing placed between them);
        2 - arms around a new sequence ``NewGeneSeq`` with id ``NewGeneName``;
        3 - arms around a standard cassette built by ``standardCassette``;
        4, 5 - not implemented.
    :param promoter: promoter gene name (option 3 only).
    :param terminator: terminator gene name (option 3 only).
    :param NewGeneName: id of the inserted record (options 2 and 3).
    :param NewGeneSeq: sequence string of the inserted record (options 2 and 3).
    :return: whatever ``stitch`` returns for the fragment list.
    :raises NotImplementedError: for options 4 and 5.
    :raises ValueError: for any other unknown option.
    """
    OrigGeneRecord = fetchGene(name)
    UpHomRec = fetchNeighbor(OrigGeneRecord, "upstream", HomologyLength)
    DownHomRec = fetchNeighbor(OrigGeneRecord, "downstream", HomologyLength)

    if option == 1:
        fragments = [UpHomRec, DownHomRec]
    elif option == 2:
        InsertRec = SeqRecord(Seq(NewGeneSeq, SingleLetterAlphabet()),
                              id=NewGeneName)
        fragments = [UpHomRec, InsertRec, DownHomRec]
    elif option == 3:
        PromoterRec, orfRecord, TerminatorRec = standardCassette(
            promoter, terminator, NewGeneName, NewGeneSeq)
        fragments = [
            UpHomRec, PromoterRec, orfRecord, TerminatorRec, DownHomRec
        ]
    elif option in (4, 5):
        # These branches were empty placeholders; previously they fell through
        # to stitch() with `fragments` unbound and died with UnboundLocalError.
        raise NotImplementedError(
            "editExisting option {0} is not implemented".format(option))
    else:
        raise ValueError(
            "unknown editExisting option: {0!r}".format(option))

    return stitch(fragments)
def create_new_sequence_record(seq_record):
    """Return a shortened copy of ``seq_record`` or ``None`` if it is rejected.

    The new id comes from ``mod_read_id`` and the new sequence from
    ``reduce_seq_length``.  The record is rejected (``None`` is returned) when
    ``reduce_seq_length`` returns ``None`` or when any of the first 30 PHRED
    scores is below 30.  Name, description, dbxrefs, features and annotations
    are carried over from the input record; only the first 30 quality scores
    are kept.
    """
    replacement_id = mod_read_id(seq_record)
    replacement_seq = reduce_seq_length(seq_record)
    if replacement_seq is None:
        print("Warning: Sequence less than 30 characters")
        return None

    # Only the leading 30 quality values are inspected and kept.
    leading_scores = list(seq_record.letter_annotations["phred_quality"])[0:30]
    if any(score < 30 for score in leading_scores):
        return None

    rebuilt = SeqRecord(
        Seq(str(replacement_seq), SingleLetterAlphabet()),
        id=replacement_id,
        name=seq_record.name,
        description=seq_record.description,
        dbxrefs=seq_record.dbxrefs,
        features=seq_record.features,
        annotations=seq_record.annotations,
    )
    rebuilt.letter_annotations["phred_quality"] = leading_scores
    return rebuilt
def _clean_DNA_seq(self, record):
    '''
    Upper-case the record's sequence and replace every character that is
    not G, A, T or C with N (this covers X as well as any other symbol).

    :param record: Biopython SeqRecord object
    :return: Biopython Seq object containing only G, A, T, C and N
    '''
    upper_cased = str(record.seq).upper()
    masked = re.sub('[^GATC]', 'N', upper_cased)
    return Seq.Seq(masked, SingleLetterAlphabet())
def make_fpa_fasta(genenamestr, genenamelist):
    """Split the records of one gene group by flag and write three FASTA files.

    Records whose description contains one of the ``|F|`` / ``|ORF|`` style
    markers go to ``extdata/<genenamestr>_F.fasta``; records with one of the
    ``|P|`` style markers go to ``extdata/<genenamestr>_P.fasta``; the full
    input list goes to ``extdata/<genenamestr>.fasta``.

    When ``genenamestr`` contains "TRBC" the flagged-F records are first
    grouped with ``rdict`` and each group is merged into one joined record
    labelled "C-REGION".

    :param genenamestr: group name used in the output file names.
    :param genenamelist: list of SeqRecord objects; nothing is done when empty.
    """
    PSDO = []
    FUNC = []
    if genenamelist:
        for allele in genenamelist:
            # presumably F/ORF = functional or open reading frame and
            # P = pseudogene -- TODO confirm against the data source
            if any(c in allele.description
                   for c in ('|F|', '|(F)|', '|[F]|', '|ORF|', '|(ORF)|',
                             '|[ORF]|')):
                FUNC.append(allele)
            if any(c in allele.description for c in ('|P|', '|(P)|', '|[P]|')):
                PSDO.append(allele)
        if "TRBC" in genenamestr:
            # group the records by the text between the first and second "|"
            # of the id
            cdict = dict()
            for exons in FUNC:
                all_name = exons.id[exons.id.find("|") +
                                    1:nth_occur(exons.id, "|", 2)]
                rdict(all_name, exons, cdict)
            FUNC = []
            # NOTE(review): exn_order is created once and never cleared, so an
            # entry from a previous group survives if the current group has no
            # record with that number -- confirm this is intended.
            exn_order = dict()
            # Python 2 only: dict.values() is indexed as a list here.
            for n in xrange(len(cdict.values())):
                for seqs in cdict.values(
                )[n]:  #arrange exons according to their order
                    # single digit just before the 5th "|" of the description
                    i = int(
                        seqs.description[nth_occur(seqs.description, "|", 5) -
                                         1])
                    exn_order[i] = seqs
                #make joined sequence and new description
                ntseq = []
                ntpos = []
                ntnt = []
                for j in sorted(exn_order.keys()):
                    ntseq.append(str(exn_order[j].seq))
                    # description field between the 5th and 6th "|"
                    ntpos.append(exn_order[j].description[
                        nth_occur(exn_order[j].description, "|", 5) +
                        1:nth_occur(exn_order[j].description, "|", 6)])
                    # description field between the 6th and 7th "|", minus its
                    # last two characters
                    ntnt.append(exn_order[j].description[
                        nth_occur(exn_order[j].description, "|", 6) +
                        1:nth_occur(exn_order[j].description, "|", 7) - 2])
                # the first character of the joined sequence is dropped
                seq = "".join(ntseq)[1:]
                position = ";".join(ntpos)
                region = "C-REGION"
                nt = "+".join(ntnt).replace(" ", "")
                # id, name, dbxrefs and the description prefix (up to the 4th
                # "|") are taken from the first record of the group
                fle = SeqRecord(
                    Seq(seq, SingleLetterAlphabet()),
                    id=cdict.values()[n][0].id,
                    name=cdict.values()[n][0].name,
                    description="|".join([
                        cdict.values()[n][0].description[:nth_occur(
                            cdict.values()[n][0].description, "|", 4)], region,
                        position, nt, " | | | | | | | |"
                    ]),
                    dbxrefs=cdict.values()[n][0].dbxrefs)
                FUNC.append(fle)
        SeqIO.write(FUNC, "extdata/%s_F.fasta" % genenamestr, "fasta")
        SeqIO.write(PSDO, "extdata/%s_P.fasta" % genenamestr, "fasta")
        SeqIO.write(genenamelist, "extdata/%s.fasta" % genenamestr, "fasta")
def variableCassette(geneList, seqList, toVary="", variants=None, variantSeq=None):
    """Stitch a base cassette plus one partial construct per variant.

    Every entry of ``geneList`` is paired by position with ``seqList`` and
    turned into a SeqRecord whose id is its 1-based position and whose name is
    the gene name.  For each variant a deep copy of the base record list is
    made and the record at 1-based position ``toVary`` is replaced.

    :param geneList: fragment names, in order.
    :param seqList: fragment sequences (strings), parallel to ``geneList``.
    :param toVary: 1-based position of the fragment to replace; ``""`` when
        no variants are supplied.
    :param variants: names of the replacement fragments (default: none).
    :param variantSeq: sequences of the replacement fragments, parallel to
        ``variants`` (default: none).
    :return: list of one-element lists.  The first holds ``stitch`` of the
        full base cassette; each following one holds ``stitch`` of the slice
        ``[toVary - 2:toVary]`` of the corresponding variant list.
    :raises IndexError: if ``seqList`` / ``variantSeq`` is shorter than its
        name list.
    """
    # None instead of [] as default: avoids the shared-mutable-default trap.
    variants = [] if variants is None else variants
    variantSeq = [] if variantSeq is None else variantSeq

    if toVary != "":
        toVary = int(toVary)

    records = []
    for index, gene in enumerate(geneList):
        rec = SeqRecord(Seq(seqList[index], SingleLetterAlphabet()),
                        id=str(index + 1))
        rec.name = gene
        records.append(rec)

    variantRecords = [records]
    for index, variant in enumerate(variants):
        rec = SeqRecord(Seq(variantSeq[index], SingleLetterAlphabet()),
                        id=str(index + 1))
        rec.name = variant
        # Deep copy so that swapping a fragment never touches the base list.
        swapped = copy.deepcopy(records)
        swapped[toVary - 1] = rec
        variantRecords.append(swapped)

    answer = [[stitch(variantRecords[0])]]
    for variantSet in variantRecords[1:]:
        # NOTE(review): for toVary == 1 this slice is [-1:1], which is empty
        # for lists longer than one element -- confirm the intended behaviour.
        answer.append([stitch(variantSet[toVary - 2:toVary])])
    return answer
def concatenate_fasta(args):
    """Concatenate all records of each matching FASTA file into one record.

    For every file matched by the glob pattern ``args.input`` the records are
    added together in file order; the merged record gets the file path as id
    and an empty description.  All merged records are written to
    ``args.output`` in a single call, so a path-valued ``args.output`` ends up
    with one record per input file instead of only the last one.

    :param args: object with ``input`` (glob pattern) and ``output`` (path or
        writable handle) attributes.
    """
    matched_files = glob.glob(args.input)
    merged_records = []
    for fasta_file in matched_files:
        concat = Seq.Seq("", SingleLetterAlphabet())
        parsed = 0
        for s in SeqIO.parse(fasta_file, 'fasta'):
            # Seq + SeqRecord yields a SeqRecord, so `concat` becomes a record
            # as soon as the first entry has been added.
            concat += s
            parsed += 1
        print(fasta_file)
        if not parsed:
            # Nothing parsed: `concat` is still a bare Seq and cannot be
            # written as a record.
            continue
        concat.id = fasta_file
        concat.description = ""
        merged_records.append(concat)

    if matched_files:
        # One write for everything; writing inside the loop overwrote the
        # output for each input file when args.output is a file name.
        SeqIO.write(merged_records, args.output, 'fasta')
def _cleanAli2(recordNuc, omit, fileName, stage):
    """Thread nucleotide codons onto the protein alignment in 'tAligned.fas'.

    Each aligned protein record is matched to the first nucleotide record
    whose id is a substring of the protein id; that nucleotide sequence is cut
    into 3-letter pieces with ``_spliter``.  Walking the aligned protein
    sequence, ``-`` becomes ``---``, ``Z`` becomes ``NNN`` (and consumes one
    codon position) and any other residue becomes the next codon.

    :param recordNuc: iterable of nucleotide SeqRecord objects.
    :param omit: unused; kept for interface compatibility.
    :param fileName: path of the FASTA file to write.
    :param stage: when equal to ``"mapper"``, leading ``N`` of the first codon
        and trailing ``N`` of the last codon are stripped before threading.
    :raises IndexError: if no nucleotide record matches a protein record or
        the protein has more residues than there are codons.

    Side effects: writes ``fileName`` and deletes 'translated.fas' and
    'tAligned.fas' from the working directory.
    """
    # Passing the path lets SeqIO open and close the file itself; the previous
    # explicit handle was never closed.
    records = list(SeqIO.parse('tAligned.fas', 'fasta'))

    for rec in records:
        nucData = [x.seq for x in recordNuc if x.id in rec.id]
        nucSeqData = _spliter(nucData[0], 3)
        if stage == "mapper":
            nucSeqData[0] = nucSeqData[0].lstrip("N")
            nucSeqData[-1] = nucSeqData[-1].rstrip("N")

        # Collect pieces and join once instead of repeated Seq concatenation.
        pieces = []
        pos = 0
        for amino in rec.seq:
            if amino == '-':
                pieces.append("---")
            elif amino == "Z":
                pieces.append("NNN")
                pos += 1
            else:
                pieces.append(str(nucSeqData[pos]))
                pos += 1
        rec.seq = Seq("".join(pieces), SingleLetterAlphabet())

    with open(fileName, 'w') as fp:
        SeqIO.write(records, fp, "fasta")

    os.remove('translated.fas')
    os.remove('tAligned.fas')
def record_from_indices(record, indices):
    '''Create a new record from the letters of ``record`` at ``indices``.

    The new record's id, name and description are all set to ``record.id``.
    Every per-letter annotation of the input (``phred_quality`` and any other
    key) is subset with the same indices.

    :param record: source SeqRecord.
    :param indices: iterable of integer positions into ``record``.
    :return: new SeqRecord holding the selected letters, in the given order.
    '''
    # Materialise once so a one-shot iterator can be reused per annotation.
    indices = list(indices)
    new_seq = Seq(''.join([record[i] for i in indices]),
                  SingleLetterAlphabet())
    new_record = SeqRecord(new_seq, record.id)
    new_record.description, new_record.name = record.id, record.id
    # Subset whatever annotations exist; previously only 'phred_quality' was
    # looked up, which raised KeyError for records annotated with other keys.
    for key, per_letter in record.letter_annotations.items():
        new_record.letter_annotations[key] = [per_letter[i] for i in indices]
    return new_record
def convert_a2m(ali):
    """Strip lower-case letters and dots from an alignment given as text.

    ``ali`` is read as a FASTA alignment; in every record all characters
    matching ``[a-z.]`` are removed from the sequence.  The records are
    modified in place, wrapped in a ``MultipleSeqAlignment`` and returned as
    FASTA-formatted text.
    """
    buffer_ = cStringIO.StringIO(ali)
    alignment = AlignIO.read(buffer_, 'fasta')
    buffer_.close()

    stripped_records = []
    for entry in alignment:
        kept_columns = re.sub(r'[a-z.]', '', str(entry.seq))
        entry.seq = Seq(kept_columns, SingleLetterAlphabet())
        stripped_records.append(entry)

    return MultipleSeqAlignment(stripped_records).format('fasta')
def add_sequences(input_df, seqrecords):
    """Append one SeqRecord per row of ``input_df`` to ``seqrecords`` in place.

    Each row must expose ``Species``, ``GeneID``, ``ExonID``,
    ``NucleotideSequence``, ``ExonRegionStart``, ``ExonRegionEnd`` and
    ``Strand``.  Record id and name are ``"<Species>:<GeneID>"``.
    """
    description_layout = '{}:{} {} na:na:na:{}:{}:{}'
    for row in input_df.itertuples():
        sequence = Seq(row.NucleotideSequence, SingleLetterAlphabet())
        name = '{}:{}'.format(row.Species, row.GeneID)
        description = description_layout.format(
            row.Species, row.GeneID, row.ExonID, row.ExonRegionStart,
            row.ExonRegionEnd, row.Strand)
        record = SeqRecord(sequence,
                           id=name,
                           name=name,
                           description=description)
        seqrecords.append(record)
def _cleanAli(recordNuc, omit, fileName):
    """Thread nucleotide codons onto 'tAligned.fas' and trim to a common length.

    Each aligned protein record is matched to the first nucleotide record
    whose id is a substring of the protein id; that sequence is cut into
    3-letter pieces with ``_spliter``.  Walking the aligned protein sequence,
    ``-`` becomes ``---``, ``Z`` becomes ``NNN`` (consuming one codon
    position) and any other residue becomes the next codon; the first and the
    last codon have ``N`` stripped from both ends.  The threaded sequence is
    stripped of leading/trailing ``N`` and finally every record is cut to the
    length returned by ``manage_seqLength``.

    :param recordNuc: iterable of nucleotide SeqRecord objects.
    :param omit: unused; kept for interface compatibility.
    :param fileName: path of the FASTA file to write.
    :raises IndexError: if no nucleotide record matches a protein record or
        the protein has more residues than there are codons.

    Side effects: writes ``fileName`` and deletes 'translated.fas' and
    'tAligned.fas' from the working directory.
    """
    # Passing the path lets SeqIO open and close the file itself; the previous
    # explicit handle was never closed.
    records = list(SeqIO.parse('tAligned.fas', 'fasta'))

    for rec in records:
        nucData = [x.seq for x in recordNuc if x.id in rec.id]
        nucSeqData = _spliter(nucData[0], 3)
        last_codon = len(nucSeqData) - 1

        # Collect pieces and join once instead of repeated Seq concatenation.
        pieces = []
        pos = 0
        for amino in rec.seq:
            if amino == '-':
                pieces.append("---")
                continue
            if amino == "Z":
                pieces.append("NNN")
            elif pos == 0 or pos == last_codon:
                pieces.append(str(nucSeqData[pos].strip("N")))
            else:
                pieces.append(str(nucSeqData[pos]))
            pos += 1
        rec.seq = Seq("".join(pieces).strip("N"), SingleLetterAlphabet())

    optimal_length = manage_seqLength([len(rec.seq) for rec in records])
    for rec in records:
        rec.seq = rec.seq[:optimal_length]

    with open(fileName, 'w') as fp:
        SeqIO.write(records, fp, "fasta")

    os.remove('translated.fas')
    os.remove('tAligned.fas')
def editEmpty(name, sequence, cutname, promoter=None, terminator=None):
    """Build an insertion construct for a cut site listed in cutsites.xlsx.

    The row of ``cutsites.xlsx`` whose ``name`` equals ``cutname`` gives the
    chromosome file (``chrom. loc.`` + ".fasta" under ``chromosomes``) and the
    cut-site sequence.  The cut site is searched on the chromosome and, if
    absent, on its reverse complement.  Homology arms of ``HomologyLength``
    are taken immediately before the site and after ``start + 34``.

    :param name: id of the inserted ORF record.
    :param sequence: ORF sequence as a string.
    :param cutname: value of the ``name`` column identifying the cut site.
    :param promoter: gene name for the promoter; ``None`` inserts the bare ORF.
    :param terminator: gene name for the terminator; required when
        ``promoter`` is given.
    :return: whatever ``stitch`` returns for the fragment list.
    :raises ValueError: if the cut-site sequence is found on neither strand,
        or if ``promoter`` is given without ``terminator``.
    """
    df = pd.read_excel(os.path.join(PROJECT_ROOT, "cutsites.xlsx"))
    cutFrame = df.set_index('name')
    location = cutFrame.loc[cutname, 'chrom. loc.'] + ".fasta"
    cutSequence = cutFrame.loc[cutname, 'sequence']

    # Path components are joined individually so this also works where the
    # separator is not a backslash.
    ChromosomeSeq = SeqIO.read(
        os.path.join(PROJECT_ROOT, "chromosomes", location), "fasta").seq

    StartIndex = ChromosomeSeq.find(cutSequence)
    if StartIndex == -1:
        ChromosomeSeq = ChromosomeSeq.reverse_complement()
        StartIndex = ChromosomeSeq.find(cutSequence)
    if StartIndex == -1:
        # Previously -1 was used as an index and produced wrong arms silently.
        raise ValueError(
            "cut site {0!r} not found in {1}".format(cutname, location))

    # NOTE(review): 34 is presumably the length of the cut-site region --
    # confirm against cutsites.xlsx.
    EndIndex = StartIndex + 34
    UpSeq = ChromosomeSeq[StartIndex - HomologyLength:StartIndex]
    DownSeq = ChromosomeSeq[EndIndex:EndIndex + HomologyLength]
    UpHomRec = SeqRecord(UpSeq, id=cutname)
    DownHomRec = SeqRecord(DownSeq, id=cutname)
    orfRecord = SeqRecord(Seq(sequence, SingleLetterAlphabet()), id=name)

    if promoter is None:
        fragments = [UpHomRec, orfRecord, DownHomRec]
    else:
        if terminator is None:
            raise ValueError(
                "a terminator gene name is required when a promoter is given")
        PromoterGeneRec = fetchGene(promoter)
        PromoterRec = fetchNeighbor(PromoterGeneRec, "upstream", 600)
        PromoterRec.id = PromoterRec.id + "ps"
        # The terminator used to be fetched from the *promoter* gene with
        # "upstream", 600 (copy-paste slip); it now uses the terminator gene
        # and the same "downstream", 250 window as standardCassette.
        TerminatorGeneRec = fetchGene(terminator)
        TerminatorRec = fetchNeighbor(TerminatorGeneRec, "downstream", 250)
        TerminatorRec.id = TerminatorRec.id + "ts"
        fragments = [
            UpHomRec, PromoterRec, orfRecord, TerminatorRec, DownHomRec
        ]
    return stitch(fragments)
def test_genemap_fasta_gene1(self):
    """make_genemap on the gene1 fixture yields a 'gene1' entry whose first
    record matches the expected first record (sequence, id, name and
    description)."""

    def make_record(bases, label):
        return SeqRecord(seq=Seq(bases, SingleLetterAlphabet()),
                         id=label,
                         name=label,
                         description=label,
                         dbxrefs=[])

    # Records are compared field by field because the sequence objects do not
    # provide a usable __eq__.
    def seqs_equal(seq1, seq2):
        same_sequence = str(seq1.seq) == str(seq2.seq)
        same_labels = (seq1.id == seq2.id and seq1.name == seq2.name
                       and seq1.description == seq2.description)
        return same_sequence and same_labels

    expected = {
        'gene1': [
            make_record('AAAA', 'sample1__Brandomstuff__gene1__1'),
            make_record('CCCC', 'sample2__Arandomstuff__gene1__1'),
        ]
    }
    result = dict(sequence_split.make_genemap(self.gene1, '__', 2, 'fasta'))

    for key, expected_records in expected.items():
        self.assertTrue(key in result.keys())
        # Only the first record of each gene is compared.
        self.assertTrue(seqs_equal(expected_records[0], result[key][0]))
def capitalize_seqs(input_fasta, output_fasta, filetype='fasta'):
    """Upper-case every sequence in a file and write the result to a new file.

    :param input_fasta: Filepath to the input sequence file.
    :param output_fasta: Filepath to the output sequence file.
    :param filetype: The file format used for both reading and writing.
        Either 'fasta' or 'fastq'.
    :return: Filepath to the output file (``output_fasta``).
    """
    capitalized_output_file = BufferedSeqWriter(output_fasta, filetype)
    # Parse with the requested format (it used to be hard-coded to "fasta")
    # and hand SeqIO the path so it opens and closes the file itself.
    for sequence in SeqIO.parse(input_fasta, filetype):
        sequence.seq = Seq(str(sequence.seq).upper(), SingleLetterAlphabet())
        capitalized_output_file.write(sequence)
    capitalized_output_file.flush()
    return output_fasta
def main():
    """Convert each input FASTA alignment to Stockholm text on ``args.outfile``.

    For every file in ``args.infiles`` the alignment is read with a gapped
    single-letter alphabet, formatted as Stockholm, and written with a
    ``#=GF ID`` line (the input file's base name without extension) inserted
    after the first line.
    """
    args = cli(sys.argv[0], sys.argv[1:])
    sink = args.outfile

    for infile in args.infiles:
        gapped = Gapped(SingleLetterAlphabet(), "-")
        alignment = AlignIO.read(infile, format="fasta", alphabet=gapped)

        path_without_ext = os.path.splitext(infile.name)[0]
        id_ = os.path.split(path_without_ext)[-1]

        parts = alignment.format("stockholm").split("\n", maxsplit=1)
        sink.write(parts[0])
        sink.write(f"\n#=GF ID {id_}\n")
        sink.write(parts[1])
    return
def clusters_alignment(file):
    """Collapse each multi-member cluster to a consensus and align the result.

    Clusters come from ``hawk_wrap(file)`` and the sequences from
    ``starting_pt(file)``.  Single-member clusters contribute their sequence
    unchanged.  For clusters with several members ``alignment_wrap`` and the
    external ``em_cons`` program are run, and the upper-case A/C/T/G/U letters
    of the consensus file become a record named ``CLUSTER<index>``.  The
    collected records are written to a FASTA file, aligned with MAFFT, and
    the MAFFT output overwrites that file.

    NOTE(review): the working directory is hard-coded and the FASTA file is
    written with a relative name but read back through the absolute path.
    """
    full_alignment = starting_pt(file)
    clusters = hawk_wrap(file)
    consensus_list = []
    # positions of the clusters that hold more than one sequence
    cluster_index = [
        pos for pos, members in enumerate(clusters) if len(members) > 1
    ]
    nucleotides = ['A', 'C', 'T', 'G', 'U']

    while clusters:
        current_cluster = clusters.pop(0)
        member_count = len(current_cluster)

        if member_count == 1:
            consensus_list.append(full_alignment[current_cluster[0]])
            continue
        if member_count < 1:
            continue

        multiple_seqs = [full_alignment[x] for x in current_cluster]
        alignment_wrap(multiple_seqs)
        subprocess.run([
            "em_cons",
            "/home/god/Documents/oldHawkEye/very_unlikely_to_be_called_this.fasta",
            "/home/god/Documents/oldHawkEye/very_unlikely_consensus.cons"
        ])
        with open(
                "/home/god/Documents/oldHawkEye/very_unlikely_consensus.cons"
        ) as consensus:
            raw_consensus = consensus.read()

        # keep only upper-case nucleotide letters, dropping everything else
        con_str = ''.join(
            letter for letter in raw_consensus if letter in nucleotides)
        cluster_record = SeqRecord(Seq(con_str, SingleLetterAlphabet()),
                                   id="CLUSTER" + str(cluster_index.pop(0)))
        consensus_list.append(cluster_record)

    SeqIO.write(consensus_list, "very_unlikely_to_be_called_this.fasta",
                "fasta")
    file = "very_unlikely_to_be_called_this.fasta"
    in_file = "/home/god/Documents/oldHawkEye/" + file
    stdout, stderr = MafftCommandline(input=in_file)()
    with open(file, "w") as handle:
        handle.write(stdout)
def mask_seq(seq, start, end, id, length):
    '''Return the records of ``seq`` with one region of record ``id`` masked.

    In every record whose id equals ``id`` the slice ``[start - 1:end]`` is
    replaced by ``length`` N characters and a new SeqRecord (same id and
    description) is produced; all other records are passed through unchanged.
    '''
    filler = Seq("N" * length, SingleLetterAlphabet())

    def _apply_mask(entry):
        if entry.id != id:
            return entry
        spliced = entry.seq[:start - 1] + filler + entry.seq[end:]
        return SeqRecord(id=entry.id,
                         description=entry.description,
                         seq=spliced)

    return [_apply_mask(entry) for entry in seq]
def convert_sequence_file_format(input_filepath, input_format, output_format, output_filename=None):
    """Convert a sequence file from ``input_format`` to ``output_format``.

    The output file is created next to the input file.  Its base name is
    ``output_filename`` (or the input file's base name when not given) and its
    extension is looked up in ``pymod_vars.alignment_extensions_dictionary``.

    :param input_filepath: path of the file to convert.
    :param input_format: ``"pymod"`` (one ``"<id> <sequence>"`` pair per line)
        or any format name understood by ``SeqIO.parse``.
    :param output_format: ``"pymod"`` (id and sequence on alternating lines)
        or any format name understood by ``SeqIO.write``.
    :param output_filename: optional base name (without extension) of the
        output file.
    :raises KeyError: if ``output_format`` has no registered extension.

    NOTE(review): the "pymod" writer emits id and sequence on separate lines
    while the "pymod" reader expects them on one line -- confirm which layout
    is the intended one.
    """
    input_file_basename = os.path.basename(input_filepath)
    input_file_name = os.path.splitext(input_file_basename)[0]
    extension = pymod_vars.alignment_extensions_dictionary[output_format]
    output_file_basename = "%s.%s" % (output_filename or input_file_name,
                                      extension)
    output_filepath = os.path.join(os.path.dirname(input_filepath),
                                   output_file_basename)

    # Read everything first so the output is only created (and truncated)
    # once the input has been parsed successfully; `with` guarantees both
    # handles are closed even when parsing or writing fails.
    with open(input_filepath, "r") as input_file_handler:
        if input_format == "pymod":
            records = []
            for l in input_file_handler:
                fields = l.split(" ")
                records.append(
                    SeqRecord(Seq(fields[1].rstrip("\n\r")), id=fields[0]))
        else:
            records = list(
                SeqIO.parse(input_file_handler,
                            input_format,
                            alphabet=SingleLetterAlphabet()))

    with open(output_filepath, "w") as output_file_handler:
        if output_format == "pymod":
            lines = []
            for rec in records:
                lines.append(str(rec.id) + '\n')
                lines.append(str(rec.seq) + '\n')
            output_file_handler.writelines(lines)
        else:
            SeqIO.write(records, output_file_handler, output_format)
def _get_seqlist(self, slist, gdict): if len(slist)==0: return Seq('',alphabet=SingleLetterAlphabet()) if self.strand() == '+': iseq = slist[0].sequence(gdict).upper() for i,p1 in enumerate(slist[1:]): p0 = slist[i] assert not p1.start < p0.end, "error: overlapping intervals:\n%s\n%s" % (p0,p1) if p1.start > p0.end: iseq += gdict[p1.chrom][p0.end:p1.start].lower() iseq += p1.sequence(gdict).upper() else: iseq = slist[0].sequence(gdict).upper() for i,p1 in enumerate(slist[1:]): p0 = slist[i] assert not p0.start < p1.end, "error: overlapping intervals:\n%s\n%s" % (p0,p1) if p0.start > p1.end: iseq += gdict[p1.chrom][p1.end:p0.start].reverse_complement().lower() iseq += p1.sequence(gdict).upper() return iseq
def filtergen(file):
    """Yield quality-masked copies of the FASTQ reads that pass the filter.

    A base "passes" when its PHRED score is strictly greater than the
    module-level ``Qscore_threshold``.  Reads in which at least half of the
    bases pass are yielded as new records with every non-passing base
    replaced by ``N``; all other reads, and zero-length reads, are skipped.

    :param file: path or handle of a FASTQ file.
    :return: generator of SeqRecord objects (id, name, description and letter
        annotations are taken from the original read).
    """
    for record in SeqIO.parse(file, "fastq"):
        qualities = record.letter_annotations['phred_quality']
        if not qualities:
            # A zero-length read has no pass fraction; skip it instead of
            # dividing by zero.
            continue

        # True where the base quality is above the threshold.
        recordqual = [x > Qscore_threshold for x in qualities]

        # True counts as 1 when summed, so this is the passing fraction.
        if float(sum(recordqual)) / float(len(recordqual)) >= .5:
            seq = "".join(
                [y if x else 'N' for (x, y) in zip(recordqual, record.seq)])
            newrec = SeqRecord(Seq(seq, SingleLetterAlphabet()),
                               id=record.id,
                               name=record.name,
                               description=record.description,
                               letter_annotations=record.letter_annotations)
            yield newrec
def __extractReads(self, indexVal, howLong, iterator):
    """Collect output records for the reads that cover the requested window.

    For every record of ``iterator`` for which ``self.__getSeq`` returns a
    non-empty value, a new SeqRecord is appended to ``self.outputRecords``.
    Its sequence is the upper-cased record sequence with ``-`` removed and its
    id is a tab-separated header built from the record id, the read's
    ``taxa`` and ``readnum`` entries in ``self.readInfo``, ``self.ko``,
    ``indexVal`` and ``howLong``.  The read's ``readnum`` counter is then
    incremented.

    :param indexVal: start value written to the header and passed to
        ``__getSeq``.
    :param howLong: offset value written to the header and passed to
        ``__getSeq``.
    :param iterator: iterable of SeqRecord objects whose ids look like
        ``<digits>-contig<digits>``.
    :raises ValueError: if a record id does not have that form.
    """
    for record in iterator:
        read = self.__getSeq(record, indexVal, howLong)
        if len(read) > 0:
            # Raw string: "\d" in a normal string literal is an invalid escape.
            id_match = re.match(r"^(\d+)-(contig\d+)$", record.id)
            if id_match is None:
                raise ValueError(
                    "unexpected record id %r (expected <digits>-contig<digits>)"
                    % (record.id,))
            readID = id_match.group(1)
            r = self.readInfo[readID]
            #>58526338-contig00001-33057/1; KO:K00927 start: 575 offset: 287
            header = "%s-%s/%s\tKO:%s\tstart:%s\toffset:%s" % (
                record.id, r['taxa'], r['readnum'], self.ko, indexVal, howLong)
            newseq = Seq(str(record.seq).upper().replace('-', ''),
                         SingleLetterAlphabet())
            newrecord = SeqRecord(newseq, id=header, name="", description="")
            self.outputRecords.append(newrecord)
            self.readInfo[readID]['readnum'] += 1
    return
def combine_sequence(in_fasta, threshold):
    """Concatenate all contigs of a FASTA file that reach a minimum length.

    Contigs are visited in sorted key order.  Every contig whose length is at
    least ``threshold`` is appended to the combined sequence and its
    half-open ``[start, end]`` offsets within the combined sequence are
    recorded.

    :param in_fasta: path of the FASTA file.
    :param threshold: minimum contig length to include.
    :return: tuple ``(out_fasta_dic, out_map_dic)``.  ``out_fasta_dic`` holds
        ``"sequence"`` (the combined Seq) plus ``"id"`` and ``"description"``
        of the first contig in sorted order (absent for an empty file);
        ``out_map_dic`` maps contig id to ``[start, end]``.
    """
    record_dict = SeqIO.index(in_fasta, "fasta")  # index the record
    out_fasta_dic = {}
    out_map_dic = {}  # offsets of each included contig
    pieces = []
    start = 0
    try:
        for count, key in enumerate(sorted(record_dict), 1):
            # Fetch once: every lookup on the index re-parses the entry.
            contig_record = record_dict[key]
            new_contig = contig_record.seq
            length = len(new_contig)
            if length >= threshold:
                pieces.append(str(new_contig))
                out_map_dic[contig_record.id] = [start, start + length]
                start += length  # increment the start position
            if count == 1:
                out_fasta_dic["id"] = contig_record.id
                out_fasta_dic["description"] = contig_record.description
    finally:
        # The index keeps the file open until it is closed explicitly.
        record_dict.close()

    # Join once instead of growing a Seq contig by contig.
    out_fasta_dic["sequence"] = Seq("".join(pieces), SingleLetterAlphabet())
    return out_fasta_dic, out_map_dic
def extract_data(rdir, LVEXON):
    """Split a reference FASTA into TRB gene groups and build leader+V records.

    Records whose id contains 'Homo' and 'TRB' are sorted into the TRBV,
    TRBD, TRBJ, TRBC and TRBL (TRBV records whose description contains
    'L-PART1+L-PART2') groups, and each group is written with
    ``make_fpa_fasta``.  The written TRBV files are then read back and every
    V allele is paired with a leader+V sequence from ``LVEXON``:

    * an ``LVEXON`` record is used when its id contains the allele name, its
      sequence ends with the allele sequence and it starts with ``atg``;
    * alleles without such a record reuse the most common leader already
      collected for the same gene name, if any.

    The resulting records are written with ``make_fpa_fasta("TRBLV", ...)``.

    :param rdir: path of the reference FASTA file.
    :param LVEXON: path of the FASTA file with leader+V exon sequences.
    """

    def relabel(description, leader, v_region):
        # Replace the description field between the 6th and 7th "|" with the
        # "<leader length>+<V length> nt" label.
        length = "{0}+{1} nt".format(len(leader), len(v_region))
        return (description[:nth_occur(description, "|", 6) + 1] + length +
                description[nth_occur(description, "|", 7):])

    #make /extdata directory
    final_directory = makerdir('/extdata')
    if not os.path.exists(final_directory):
        os.makedirs(final_directory)

    #read fasta file
    rawdata = list(SeqIO.parse(rdir, "fasta"))
    # The identifier and the 'Homo' literal were garbled to "h**o" / 'H**o',
    # which is not valid Python; restored here.
    homo = [element for element in rawdata if 'Homo' in element.id]
    TRB = [element for element in homo if 'TRB' in element.id]

    TRBV = []
    TRBD = []
    TRBJ = []
    TRBC = []
    TRBL = []
    for element in TRB:
        if 'TRBV' in element.id:
            if 'L-PART1+L-PART2' in element.description:
                TRBL.append(element)
            else:
                TRBV.append(element)
        elif 'TRBD' in element.id:
            TRBD.append(element)
        elif 'TRBJ' in element.id:
            TRBJ.append(element)
        elif 'TRBC' in element.id:
            TRBC.append(element)

    make_fpa_fasta("TRBV", TRBV)
    make_fpa_fasta("TRBD", TRBD)
    make_fpa_fasta("TRBJ", TRBJ)
    make_fpa_fasta("TRBC", TRBC)
    make_fpa_fasta("TRBL", TRBL)

    lead = list(SeqIO.parse(LVEXON, "fasta"))
    TRBV_F = list(SeqIO.parse("extdata/TRBV_F.fasta", "fasta"))
    TRBV_P = list(SeqIO.parse("extdata/TRBV_P.fasta", "fasta"))
    TRBV = TRBV_F + TRBV_P

    matchedV = []
    match = dict()  # gene name -> leader sequences, filled through rdict
    TRBLV = []
    for allele in TRBV:
        name_start = allele.id.find("TRB")
        star = allele.id.find("*")
        allele_name = allele.id[name_start:star + 3]
        gene_name = allele.id[name_start:star]
        for lvsq in lead:
            if allele_name not in lvsq.id:
                continue
            # the leader+V record has to end with the V allele sequence ...
            if allele.seq not in lvsq.seq[-len(allele.seq):]:
                continue
            # ... and has to begin with a start codon
            if str(lvsq.seq)[:3] != "atg":
                continue
            leader = str(lvsq.seq[:-len(allele.seq)])
            v_region = str(allele.seq)
            TRBLV.append(
                SeqRecord(Seq(leader + v_region, SingleLetterAlphabet()),
                          description=relabel(allele.description, leader,
                                              v_region),
                          id=lvsq.id,
                          name=lvsq.name,
                          dbxrefs=lvsq.dbxrefs))
            rdict(gene_name, leader, match)
            matchedV.append(allele)

    mismatch = set(TRBV) - set(matchedV)
    #for V alleles with no matching L, use other allele's L sequence
    for allele in mismatch:
        genename = allele.id[allele.id.find("TRB"):allele.id.find("*")]
        if genename in match:
            leader = str(most_common(match[genename]))
            v_region = str(allele.seq)
            TRBLV.append(
                SeqRecord(Seq(leader + v_region, SingleLetterAlphabet()),
                          description=relabel(allele.description, leader,
                                              v_region),
                          id=allele.id,
                          name=allele.name,
                          dbxrefs=allele.dbxrefs))

    make_fpa_fasta("TRBLV", TRBLV)
def basicVariantCall( pileUp, ref=None ):
    """Summarise per-site base counts of each reference in ``pileUp``.

    ``pileUp`` yields ``(refName, counts, inserts)`` triples.  ``counts`` is
    indexed as a 2-D array whose columns are sites and whose row indices are
    decoded with ``{0: 'A', 1: 'C', 2: 'G', 3: 'T', 4: '-', 5: 'N'}``.

    Returns ``(mapQC, summaryStats, variants, conSeq)``:

    * ``mapQC`` -- per reference: name, non-zero row indices per site
      ('variant'), their counts ('cov') and the converted ``inserts``
      ('indel');
    * ``summaryStats`` -- per reference: median coverage, an IQR-based spread,
      number of uncovered sites, contig length and mean site entropy;
    * ``variants`` -- per reference: sites with more than one observed symbol;
    * ``conSeq`` -- always an empty ``defaultdict(str)``; the code that would
      fill it is commented out.

    ``ref`` is not used by the active code.  The consensus sequences are
    collected locally but are not part of the return value.
    """
    # This function will serve as a basic variant call method to be used in
    # our single ref. pipelines. It will remove all singleton sites (with
    # no mapping ambiguity).
    # NOTE(review): `izip` exists only on Python 2, and `izip` / `SeqIO` are
    # referenced only from the commented-out code below.
    from Bio.Seq import Seq
    from Bio import SeqIO
    from Bio.Alphabet import SingleLetterAlphabet
    from itertools import izip
    from scipy.stats import iqr
    # import numpy as np
    mapQC = []
    summaryStats = []
    consensusContigs = []
    variants = []
    for refName,counts,inserts in pileUp:
        # last "|"-separated field of the reference name, with "/" made safe
        outName = refName.split("|")[-1].replace('/','_')
        tmpDict = defaultdict(list)
        tmpDict2 = defaultdict(int)
        tmpDict['name'] = outName
        # per site: row indices with a non-zero count, and those counts
        tmpDict['variant'] = [np.nonzero(counts[:,n])[0].tolist() for n in xrange(counts.shape[1])]
        tmpDict['cov'] = [counts[np.nonzero(counts[:,n])[0],n].tolist() for n in xrange(counts.shape[1])]
        siteCov = np.sum(counts,axis=0)
        tmpDict2['avg_cov'] = np.median( siteCov )
        # presumably a robust stand-in for the standard deviation (IQR scaled
        # by 20/27) -- TODO confirm
        tmpDict2['std_cov'] = (20.0*iqr( siteCov )) / 27.0
        tmpDict2['num_uncov'] = np.sum(siteCov == 0)
        tmpDict2['contig_len'] = len(siteCov)
        # per-site symbol frequencies over the covered sites only
        # NOTE(review): the in-place division assumes `counts` has a float
        # dtype -- confirm against the caller
        frac = counts[:,siteCov>0]
        frac /= (1.*frac.sum(axis=0,keepdims=True))
        # print np.sum(frac.sum(axis=0,keepdims=True) ==0)
        # the small offset keeps log() finite for zero frequencies
        S = -np.log(frac+.0001) * frac
        tmpDict2['site_entropy'] = np.mean(np.sum(S,axis=0))
        # `serialIndel` is the same object as `inserts`, so the conversion to
        # plain Python scalars below modifies the caller's nested dict
        serialIndel = inserts
        for outKey in serialIndel.keys():
            for inKey in serialIndel[outKey].keys():
                serialIndel[outKey][inKey] = np.asscalar(serialIndel[outKey][inKey])
        tmpDict['indel'] = serialIndel
        mapQC.append(tmpDict)
        summaryStats.append(tmpDict2)
        # Save consensus sequence (where it differs from reference) as well as
        # locations where variation has been detected.
        consensusSeq = []
        variableSites = defaultdict(lambda: defaultdict(list))
        alphabet = {0: 'A', 1: 'C', 2: 'G', 3: 'T', 4: '-', 5: 'N'}
        for n,seq in enumerate(tmpDict['variant']):
            if (len(seq) > 1):
                # More than one symbol observed: record all of them and use
                # the one with the highest count for the consensus.
                variableSites[n]['nuc'] = [alphabet[s] for s in seq]
                variableSites[n]['cov'] = tmpDict['cov'][n]
                consensusSeq.append(alphabet[seq[np.argmax(tmpDict['cov'][n])]])
            elif (len(seq) == 0):
                # no count at this site
                consensusSeq.append('-')
            else:
                consensusSeq.append(alphabet[seq[0]])
        consensusContigs.append(Seq(''.join(consensusSeq),SingleLetterAlphabet()))
        variants.append(variableSites)
    # refSeq = []
    # with open(ref,'r') as refFile:
    #     for contig in SeqIO.parse(refFile,'fasta'):
    #         refSeq.append(contig.seq)
    # Store consensus sequence (as a variant of the reference to save space)
    conSeq = defaultdict(str)
    # for C in xrange(len(refSeq)):
    #     for n,(rN,cN) in enumerate(izip(refSeq[C],consensusContigs[C])):
    #         if (cN != 'N' and rN != cN):
    #             conSeq[n] = cN
    return mapQC,summaryStats,variants,conSeq
def guess_alphabet(sequence:str):
    '''Guess the alphabet of a string representing a biological sequence.

    The decision is made as follows:

    * empty input, non-printable characters, or letters outside every IUPAC
      alphabet (punctuation and whitespace are tolerated) give a generic
      ``SingleLetterAlphabet()``;
    * if at least 90 % of the characters are one of ``GATCUN`` (either case)
      the sequence is treated as nucleic acid and the narrowest DNA/RNA
      alphabet containing all of its letters is returned
      (``NucleotideAlphabet()`` when T and U are mixed or nothing fits);
    * otherwise the string ``"three letter code"`` is returned when the input
      consists solely of upper-case three-letter amino acid codes; failing
      that, the narrowest protein alphabet containing all letters
      (``protein``, then ``extended_protein``, else a generic
      ``ProteinAlphabet()``).

    :param sequence: the raw sequence text.
    :return: a Biopython alphabet object, or the string
        ``"three letter code"``.
    '''
    import string
    from Bio.Alphabet import SingleLetterAlphabet
    from Bio.Alphabet import NucleotideAlphabet
    from Bio.Alphabet import ProteinAlphabet
    from Bio.Alphabet.IUPAC import extended_protein
    from Bio.Alphabet.IUPAC import protein
    from Bio.Alphabet.IUPAC import ambiguous_dna
    from Bio.Alphabet.IUPAC import unambiguous_dna
    from Bio.Alphabet.IUPAC import extended_dna
    from Bio.Alphabet.IUPAC import ambiguous_rna
    from Bio.Alphabet.IUPAC import unambiguous_rna

    if len(sequence)<1:
        return SingleLetterAlphabet()

    if any(c not in string.printable for c in sequence):
        return SingleLetterAlphabet()

    xp = set(extended_protein.letters)
    pr = set(protein.letters)
    ad = set(ambiguous_dna.letters)
    ud = set(unambiguous_dna.letters)
    ed = set(extended_dna.letters)
    ar = set(ambiguous_rna.letters)
    ur = set(unambiguous_rna.letters)
    known_letters = xp|pr|ad|ud|ed|ar|ur

    sequence_chars = set(sequence.upper())

    if sequence_chars - known_letters - set(string.punctuation+string.whitespace):
        return SingleLetterAlphabet()

    nucleic_count = sum(sequence.count(letter) for letter in "GATCUNgatcun")

    if float(nucleic_count) / float(len(sequence)) >= 0.9: # DNA or RNA
        if 'T' in sequence_chars and 'U' in sequence_chars:
            alphabet = NucleotideAlphabet()
        elif not sequence_chars-ud:
            alphabet = unambiguous_dna
        elif not sequence_chars-ad :
            alphabet = ambiguous_dna
        elif not sequence_chars-ed:
            alphabet = extended_dna
        elif not sequence_chars-ur:
            alphabet = unambiguous_rna
        elif not sequence_chars-ar:
            alphabet = ambiguous_rna
        else:
            alphabet = NucleotideAlphabet()
    else:
        threecode = ['ALA', 'ASX', 'CYS', 'ASP','GLU', 'PHE', 'GLY', 'HIS',
                     'ILE', 'LYS', 'LEU', 'MET','ASN', 'PRO', 'GLN', 'ARG',
                     'SER', 'THR', 'VAL', 'TRP','TYR', 'GLX', 'XAA', 'TER',
                     'SEL', 'PYL', 'XLE']
        tc=set(threecode)
        three_letter_alphabet = set( [ sequence[i:i+3] for i in range(0,len(sequence),3)] )
        if not three_letter_alphabet - tc:
            alphabet = "three letter code"
        # These two tests used to lack the `not`, so a sequence was labelled
        # `protein` exactly when it contained letters OUTSIDE that alphabet.
        # They now mirror the subset tests of the nucleotide branch.
        elif not sequence_chars - pr:
            alphabet = protein
        elif not sequence_chars - xp:
            alphabet = extended_protein
        else:
            alphabet = ProteinAlphabet()
    return alphabet
if trna_mod_row['Organellum'] != 'cytosolic': continue key = (trna_mod_row['Amino acid'], trna_mod_row['Anticodon (Canonical)']) if key not in trna_mods: trna_mods[key] = [] trna_mods[key].append({ 'id': trna_mod_row['Id'], 'can': trna_mod_row['Sequence (Canonical)'], 'nc': trna_mod_row['Sequence'], }) trna_seq_rows = pandas.read_excel('examples/homo_sapiens_rna/summary.xlsx', sheet_name='tRNA seqs - Gogakos et al.', header=[0, 1]) trna_seqs = [] alphabet = SingleLetterAlphabet() for _, trna_seq_row in trna_seq_rows.iterrows(): aa = list(trna_seq_row.items())[0][1] anticodon = list(trna_seq_row.items())[1][1] key = (aa, anticodon) if key not in trna_mods: continue # find most similar sequences in MODOMICS best_id = None best_can_seq = None best_seq = None best_alignment = None best_score = -float('inf') for trna_mod in trna_mods[key]: alignment = pairwise2.align.globalxs(
def clusters_alignment(file):
    """Collapse multi-member clusters to consensus records and align them.

    Clusters come from ``hawk_wrap(file)`` and the sequences from
    ``starting_pt(file)``.

    * If a cluster entry is an ``int`` the whole input is aligned with
      ``alignment_wrap`` and that result is returned.
    * A single-member cluster contributes its sequence unchanged.
    * A cluster with several members is written to a temporary FASTA file,
      ``em_cons`` is run on it, and the A/C/T/G/U letters (either case) of the
      output become a record whose id is
      ``"CLUSTER_<index>: <member ids joined by |>"``.

    The collected records are aligned with MAFFT, upper-cased and returned as
    the alignment object read back by ``AlignIO``.

    :param file: input passed to ``starting_pt`` and ``hawk_wrap``.
    :raises IndexError: if ``hawk_wrap`` returns no clusters.
    :raises TypeError: if a cluster entry is neither a list nor an int.
    """
    full_alignment = starting_pt(file)
    clusters = hawk_wrap(file)
    clusters[0]  # an empty cluster list is an error, exactly as before

    consensus_list = []
    nucleotides = set('AaCcTtGgUu')

    for position, current_cluster in enumerate(clusters):
        if isinstance(current_cluster, int):
            return alignment_wrap(full_alignment)
        if not isinstance(current_cluster, list):
            # Such an entry was never consumed before and the loop spun forever.
            raise TypeError(
                "unsupported cluster entry: %r" % (current_cluster,))

        if len(current_cluster) == 1:
            consensus_list.append(full_alignment[current_cluster[0]])
        elif len(current_cluster) > 1:
            multiple_seqs = [full_alignment[x] for x in current_cluster]
            with tempfile.NamedTemporaryFile() as alignment_file:
                SeqIO.write(multiple_seqs, alignment_file.name, "fasta")
                with tempfile.NamedTemporaryFile() as consensus_file:
                    subprocess.call([
                        "em_cons", alignment_file.name, consensus_file.name
                    ])
                    with open(consensus_file.name) as consensus_handle:
                        data = consensus_handle.read()

            # keep only nucleotide letters; everything else is dropped
            con_str = ''.join(ch for ch in data if ch in nucleotides)
            seqid_string = '|'.join(str(rec.id) for rec in multiple_seqs)
            consensus_list.append(
                SeqRecord(Seq(con_str, SingleLetterAlphabet()),
                          id="CLUSTER_" + str(position) + ": " +
                          seqid_string))

    with tempfile.NamedTemporaryFile() as consensus_alignment:
        SeqIO.write(consensus_list, consensus_alignment.name, "fasta")
        mafft_cline = MafftCommandline(input=consensus_alignment.name)
        stdout, stderr = mafft_cline()
        # replace the unaligned records with the MAFFT alignment
        with open(consensus_alignment.name, "w") as handle:
            handle.write(stdout)

        with tempfile.NamedTemporaryFile() as clusters_file:
            records = (
                rec.upper()
                for rec in SeqIO.parse(consensus_alignment.name, "fasta"))
            SeqIO.write(records, clusters_file.name, "fasta")
            with open(clusters_file.name) as aligned_handle:
                aligned_list = AlignIO.read(aligned_handle, 'fasta')
    return aligned_list
def grande_alignment(file):
    """Expand the cluster-level alignment back to one row per input sequence.

    ``clusters_alignment(file)`` provides one aligned row per cluster and
    ``hawk_wrap(file)`` the cluster membership.  For a single-member cluster
    the aligned row is used directly; for a larger cluster a gap is inserted
    into each member at every position where the cluster's aligned row has a
    ``-``.  Rows shorter than the longest one are then padded with ``-``.
    The result is printed in FASTA layout; nothing is returned.

    When the first cluster entry is an ``int`` the output of
    ``clusters_alignment`` is printed as is.
    """
    raw_seqs = starting_pt(file)
    before_segment = clusters_alignment(file)
    list_of_clusters = hawk_wrap(file)
    print("Clusters formed:")
    print(len(list_of_clusters))
    print(list_of_clusters)
    seqs_in_consensus = []
    allEqualSeqs = []
    # index of the current cluster's row in `before_segment`
    position = 0
    if type(list_of_clusters[0]) == list:
        big_final_alignment = []
        while list_of_clusters:
            current_cluster = list_of_clusters.pop(0)
            if len(current_cluster) == 1:
                single_seq = before_segment[position]
                big_final_alignment.append(single_seq)
                position += 1
            elif len(current_cluster) > 1:
                seqs_in_the_cluster = [raw_seqs[x] for x in current_cluster]
                multiple_seq = before_segment[position]
                # NOTE(review): seqs_in_consensus is never cleared, so members
                # of earlier clusters are still in the list when a later
                # cluster is processed -- confirm this is intended.
                while seqs_in_the_cluster:
                    seqs_in_consensus.append(seqs_in_the_cluster.pop(0))
                with tempfile.NamedTemporaryFile() as segment_align:
                    # round trip through a temporary FASTA file
                    SeqIO.write(seqs_in_consensus, segment_align.name, "fasta")
                    original_seqs = list(
                        SeqIO.parse(segment_align.name, "fasta"))
                    # `segmenter` is opened but never written to or read
                    with tempfile.NamedTemporaryFile() as segmenter:
                        dash = '-'
                        dashes = []
                        consensus = multiple_seq.seq
                        seq = original_seqs
                        for n in range(len(seq)):
                            seq_str = seq[n].seq
                            seq_id = seq[n].id
                            able_to_insert_seq = seq_str.tomutable()
                            # (re)build the list of gap positions of the
                            # cluster's aligned row
                            for x in consensus:
                                if x == dash:
                                    dashes = [
                                        y for y, x in enumerate(consensus)
                                        if x == dash
                                    ]
                            # insert a gap at each of those positions, in
                            # ascending order
                            while dashes:
                                dash_position = dashes.pop(0)
                                able_to_insert_seq.insert(dash_position, '-')
                            new_seq_record = SeqRecord(Seq(
                                str(able_to_insert_seq),
                                SingleLetterAlphabet()),
                                                       id=seq_id)
                            big_final_alignment.append(new_seq_record)
                position += 1
        with tempfile.NamedTemporaryFile() as unequalSeqs:
            dash = '-'
            SeqIO.write(big_final_alignment, unequalSeqs.name, "fasta")
            total = list(SeqIO.parse(unequalSeqs.name, "fasta"))
            # length of the longest row
            largestSeq = len(
                max([total[ind].seq for ind in range(len(total))], key=len))
            while total:
                checkSeq = total.pop(0)
                if len(checkSeq.seq) < largestSeq:
                    smallerLength = len(checkSeq.seq)
                    seqStr = checkSeq.seq
                    seqId = checkSeq.id
                    needsEndingFilled = seqStr.tomutable()
                    # one '-' is inserted per missing column, padding the row
                    # up to `largestSeq`
                    endingDashes = list(
                        range(smallerLength + 1, largestSeq + 1))
                    while endingDashes:
                        j = endingDashes.pop(0)
                        needsEndingFilled.insert(j, '-')
                    nowEqual = SeqRecord(Seq(str(needsEndingFilled),
                                             SingleLetterAlphabet()),
                                         id=seqId)
                    allEqualSeqs.append(nowEqual)
                elif len(checkSeq.seq) == largestSeq:
                    allEqualSeqs.append(checkSeq)
    elif type(list_of_clusters[0]) == int:
        allEqualSeqs = before_segment
    # print the final rows in FASTA layout
    for seqs in range(len(allEqualSeqs)):
        print(">" + allEqualSeqs[seqs].id)
        print(allEqualSeqs[seqs].seq)
def mask_fna_with_spacers(INTERMEDIATES: str, FNA_FILE: str, SELECT_SPACER_FASTA: str, MASKED_FNA: str = 'masked.fna'):
    """Write a copy of an FNA file with the listed array regions masked by N.

    Every ``(start, end, name)`` entry returned by
    ``get_start_end_list(SELECT_SPACER_FASTA)`` is widened by 500 positions on
    each side (clamped to the sequence bounds) and that stretch of the
    matching sequence is replaced with ``N``.  The masked sequences are
    written to ``INTERMEDIATES + 'masked_db/' + MASKED_FNA``.

    :param INTERMEDIATES: directory prefix for intermediate files; it is
        concatenated as-is, so it should end with a path separator.
    :param FNA_FILE: path of the FNA file, loaded with ``fna_to_dict``.
    :param SELECT_SPACER_FASTA: path of the spacer FASTA describing the arrays.
    :param MASKED_FNA: file name of the masked output.
    :return: tuple ``(path of the masked file, dict mapping the array's
        sequence name to the key used in the sequence dict)``.
    :raises KeyError: if an array names a sequence that is not in the FNA file.

    Side effect: appends 'dependencies/PyGornism/' to ``sys.path``.
    """
    import os
    from sys import path as sys_path

    MASKED_ORGANISM_DB = INTERMEDIATES + 'masked_db/'  # directory for the masked FNA file
    # Portable replacement for spawning `mkdir`; also fine if it exists.
    os.makedirs(MASKED_ORGANISM_DB, exist_ok=True)
    MASKED_FNA = MASKED_ORGANISM_DB + MASKED_FNA

    NEIGHBORING_NUCLEOTIDES = 500  # number of BP (+/-) to also mask around the array(s)
    arrayStartEndList = get_start_end_list(SELECT_SPACER_FASTA)
    sequence_dict = fna_to_dict(FNA_FILE)

    # Map each sequence key without its last "."-separated part (the parts
    # are joined without a separator) back to the full key.
    crisprName_maps_seqName = dict()
    for key in sequence_dict.keys():
        nc_name = key.split('.')
        short_name = ''.join(nc_name[0:-1]) if len(nc_name) > 1 else nc_name[0]
        crisprName_maps_seqName[short_name] = key

    # Mask the CRISPR arrays (plus neighbourhood) in the organism's sequences.
    for arrayStartEnd in arrayStartEndList:
        crispr_name = arrayStartEnd[2]
        nc_id = crisprName_maps_seqName[crispr_name]
        sequence = str(sequence_dict[nc_id].seq)

        # start is shifted by one before widening; both ends are then clamped
        # to the sequence bounds
        start = max(int(arrayStartEnd[0]) - 1 - NEIGHBORING_NUCLEOTIDES, 0)
        end = min(int(arrayStartEnd[1]) + NEIGHBORING_NUCLEOTIDES,
                  len(sequence))

        blank = 'N' * (end - start)
        sequence = sequence[0:start] + blank + sequence[end:len(sequence)]
        sequence_dict[nc_id].seq = Seq(sequence, SingleLetterAlphabet())

    sys_path.append('dependencies/PyGornism/')
    # from regex import string_with_limited_width

    # Write the masked sequences to disk.
    with open(MASKED_FNA, 'w') as handle:
        for seq_record in sequence_dict:
            SeqIO.write(sequence_dict[seq_record], handle, 'fasta')
    return MASKED_FNA, crisprName_maps_seqName