def _get_dna_records(self, records, name):
    """Collect the cDNA sequence for each protein record.

    :param records: iterable of SeqRecord-like objects whose ``.id`` is an
        OMA protein identifier.
    :param name: suffix appended to each output record id as ``"<id>_<name>"``.
    :return: list of ``SeqRecord`` cDNA records, same order as *records*.
    """
    # Pre-size so each result keeps the position of its input record.
    og_cdna = [None] * len(records)
    for i, record in enumerate(records):
        if 'h5' in self._db_source:
            # Local OMA hdf5 database: map OMA id -> entry number, pull cDNA.
            oma_db_nr = self._db_id_map.omaid_to_entry_nr(record.id)
            og_cdna[i] = SeqRecord.SeqRecord(Seq.Seq(self._db.get_cdna(oma_db_nr).decode("utf-8")), id=record.id + "_" + name, description="")
        elif 'fa' in self._db_source:
            # Fasta-backed database: indexed directly by record id.
            og_cdna[i] = self._db[record.id]
        elif 'REST_api' in self._db_source:
            # Fall back to the OMA REST API (network call per record).
            protein = requests.get(API_URL + "/protein/" + record.id + "/")
            protein = protein.json()
            og_cdna[i] = SeqRecord.SeqRecord(Seq.Seq(protein['cdna']), id=record.id + "_" + name, description="")
        # Replace ambiguous 'X' bases before downstream use.
        if 'X' in str(og_cdna[i].seq):
            cleaned_seq = self._clean_DNA_seq(og_cdna[i])
            og_cdna[i].seq = cleaned_seq
    return og_cdna
def extract_seq(fasta_file, fasta_header_name, start, stop, id="16S", header="", rev=False):
    """Extract a sub-sequence from one entry of a fasta file.

    :param fasta_file: path or handle of the fasta file to scan
    :param fasta_header_name: ``.name`` of the entry to extract from
    :param start: 0-based start of the slice (converted with int())
    :param stop: end of the slice (converted with int())
    :param id: id given to the returned record
    :param header: description given to the returned record
    :param rev: if True, return the reverse complement of the slice
    :return: a ``SeqRecord`` for the first matching entry, or ``None`` if
        no entry matches *fasta_header_name*.
    """
    # Only SeqRecord is needed here; the previous Bio.Seq / Bio.Alphabet
    # imports were unused (and Bio.Alphabet is gone from modern Biopython).
    from Bio import SeqRecord
    for entry in SeqIO.parse(fasta_file, "fasta"):
        if fasta_header_name == entry.name:
            seq = entry[int(start):int(stop)].seq
            if rev:
                return SeqRecord.SeqRecord(seq.reverse_complement(), id=id, name="extract", description=header)
            # Forward strand: the original built a record then overwrote
            # id/name, which nets out to id=id, name="baba".
            return SeqRecord.SeqRecord(seq, id=id, name="baba", description=header)
    return None
def aln_parse(aln_filename, fastq_filename):
    """Convert an ART alignment output to a BioPython one.

    :param aln_filename: ART ``.aln`` alignment file
    :param fastq_filename: the matching fastq file produced by ART
    :return: tuple ``(reads, aligns, start_positions)`` where ``aligns[i]``
        is a ``[clean, dirty]`` pair of SeqRecords for ``reads[i]``.
    """
    reads = []
    aligns = []
    start_positions = []
    fastqs = SeqIO.parse(open(fastq_filename), "fastq")
    with open(aln_filename, 'r') as fp:
        # Skip the first two lines - no info we want there.
        fp.readline()
        fp.readline()
        line = fp.readline()
        # Deal with headers and get sequence length.
        header_array = line.split('\t')
        if header_array[0].rstrip() != '@SQ':
            # py3 print function (the original used the py2 print statement).
            print("ART file header corrupted: ", header_array[0])
            sys.exit(2)
        sequence_length = int(header_array[2])
        while line:
            # Find the next record header.
            while not line.startswith(">"):
                line = fp.readline()
                if not line:
                    # EOF while scanning for a header - nothing more to read.
                    return reads, aligns, start_positions
                line = line.rstrip()
            name_array = line.split('\t')
            strand = name_array[3].rstrip()
            if strand == '-':
                # Reverse-strand records are ignored, but we must consume
                # their two sequence lines and advance; a bare `continue`
                # would loop forever on the same header line.
                fp.readline()
                fp.readline()
                line = fp.readline()
                continue
            name = name_array[1]
            start_pos = int(name_array[2])
            # NOTE(review): assumes the fastq only contains the reads kept
            # here, so the iterators stay in lock-step - confirm with ART.
            read = next(fastqs)
            clean_seq = Seq.Seq(fp.readline().rstrip(), generic_dna)
            dirty_seq = Seq.Seq(fp.readline().rstrip(), generic_dna)
            align = [
                SeqRecord.SeqRecord(clean_seq, id=read.name, name=read.name, description=""),
                SeqRecord.SeqRecord(dirty_seq, id=read.name, name=read.name, description="")]
            reads.append(read)
            aligns.append(align)
            start_positions.append(start_pos)
            line = fp.readline()
    return reads, aligns, start_positions
def align(sequences): """Translate, then align, then back-translate the sequences.""" # First, start a new tempfile translated = tempfile.NamedTemporaryFile(prefix='Translated_', suffix='.fasta', mode='w+t') # And write the translated sequences into it nuc_seqs = {} for seq in SeqIO.parse(sequences, 'fasta'): s = SeqRecord.SeqRecord(seq.seq.translate(), id=seq.id, description='') SeqIO.write(s, translated, 'fasta') nuc_seqs[s.id] = str(seq.seq) # Seek to the beginning to read them with Clustal Omega translated.seek(0) # Open a temp file for clustal output aligned = tempfile.NamedTemporaryFile(prefix='Aligned_', suffix='.fasta', mode='w+t') # And align them co_cmd = ClustalOmegaCommandline(infile=translated.name, outfile=aligned.name, seqtype='protein', force=True, iterations=10, distmat_full=True, distmat_full_iter=True) co_cmd() # Close the translated unaligned handle. We are done with it translated.close() # Then, we want to back-translate the sequences backtrans = tempfile.NamedTemporaryFile(prefix='Backtranslated_', suffix='.fasta', mode='w+t') aligned.seek(0) aln = SeqIO.parse(aligned.name, 'fasta') for prot_seq in aln: bt = '' codon = 0 nuc = nuc_seqs[prot_seq.id] for aa in prot_seq: if aa == '-': bt += '---' else: bt += nuc[codon * 3:(codon * 3) + 3] codon += 1 # Make it a SeqRecord to write into the output file b = SeqRecord.SeqRecord(Seq.Seq(bt), id=prot_seq.id, description='') SeqIO.write(b, backtrans, 'fasta') # Again, seek to the beginning so we can read it later backtrans.seek(0) # Close the aligned and unaligned handles; we are done with them aligned.close() sequences.close() return backtrans
def cut_sequence(self, ref_seq):
    """Split this sequence in two at the widest gap between consecutive
    BLAST hits on *ref_seq*.

    :param ref_seq: reference Sequence the stored blast hits refer to.
    :return: ``None`` if no sufficiently wide gap exists; ``[self, None]``
        if a gap exists but no usable cut position was found; otherwise a
        list with the two new Sequence halves.
    """
    logger.info('cutting sequence {}'.format(self.id))
    raw_blast_list = self.blasts_dict[ref_seq.id]
    blast_list = []
    # Keep only confident hits.
    for blast in raw_blast_list:
        if blast.similarity > similarity_threshold:
            blast_list.append(blast)
    blast_list.sort(key=lambda match: match.ref_start)
    # A candidate gap must exceed the running maximum by 1% of the
    # reference length to be accepted.
    max_diff = ref_seq.length * 0.01
    max_diff_cand = None
    for i in range(0, len(blast_list) - 1):
        actual_blast = blast_list[i]
        next_blast = blast_list[i + 1]
        start_next_blast = next_blast.ref_start
        end_actual_blast = actual_blast.ref_end
        diff = abs(start_next_blast - end_actual_blast)
        if diff > max_diff + ref_seq.length * 0.01:
            max_diff = diff
            max_diff_cand = i
    if max_diff_cand is None:
        return None
    # A gap has been found
    new_dict = {}
    new_dict[ref_seq.id] = []
    cut_dtce = 0
    # Cut position: right-most query end among hits left of the gap,
    # as long as it falls inside this sequence.
    for j in range(0, max_diff_cand + 1):
        if cut_dtce < blast_list[j].query_end < self.length:
            cut_dtce = blast_list[j].query_end
        new_dict[ref_seq.id].append(blast_list[j])
    if cut_dtce == 0:
        return [self, None]
    new_seq_1 = Sequence(SeqRecord.SeqRecord(self.seq[:cut_dtce], id=self.id + '_1'), ref_seq=self.ref_seq)
    new_seq_2 = Sequence(SeqRecord.SeqRecord(self.seq[cut_dtce:], id=self.id + '_2'), ref_seq=self.ref_seq)
    # First half inherits the hits left of the gap.
    new_seq_1.ref_start = blast_list[0].ref_start
    new_seq_1.ref_end = blast_list[max_diff_cand].ref_end
    new_seq_1.blasts_dict = new_dict
    # Second half inherits the hits right of the gap.
    other_dict = {}
    other_dict[ref_seq.id] = []
    new_seq_2.ref_start = blast_list[max_diff_cand + 1].ref_start
    for j in range(max_diff_cand + 1, len(blast_list)):
        other_dict[ref_seq.id].append(blast_list[j])
    new_seq_2.ref_end = blast_list[-1].ref_end
    new_seq_2.blasts_dict = other_dict
    logger.info(
        '2 new sequences: {} ({} - {}) w/ length {} and {} ({} - {}) w/ length {}'
        .format(new_seq_1.id, new_seq_1.ref_start, new_seq_1.ref_end, new_seq_1.length,
                new_seq_2.id, new_seq_2.ref_start, new_seq_2.ref_end, new_seq_2.length))
    return [new_seq_1, new_seq_2]
def get_blast_matched_ids(query, blast_db_name, word_size='5', max_seq='6000', evalue=10.0, search_id='', threads=None, identity_cutoff='0'):
    """Run a blastn search for *query* and return the matched ids.

    :param query: query nucleotide sequence (plain string)
    :param blast_db_name: name of the blast database to search
    :param word_size, max_seq, evalue, identity_cutoff: blastn parameters,
        passed through to ``run_blast_search``
    :param search_id: prefix used for the temporary query/result files
    :param threads: worker count; defaults to ``settings.CORES``
    :return: whatever ``run_blast_search`` returns (the matched ids)
    """
    query_file = settings.BLAST_TMP_DIR + search_id + '_query.fasta'
    result_file = settings.BLAST_TMP_DIR + search_id + '_blast_result.txt'
    with open(query_file, "w") as output_handle:
        my_rec = SeqRecord.SeqRecord(seq=Seq.Seq(query), id='query', description='')
        SeqIO.write([my_rec], output_handle, 'fasta')
    # blastn-short is tuned for very short queries.
    task = 'blastn-short' if len(query) <= 15 else 'blastn'
    if not threads:
        threads = settings.CORES
    try:
        matched_ids = run_blast_search(query_file, blast_db_name, result_file, threads, word_size, max_seq, evalue, task, identity_cutoff)
    finally:
        # Remove the temporary files even if the search raises; the
        # previous `os.remove(...) if ... else None` one-liners only ran
        # on the success path.
        for tmp_file in (result_file, query_file):
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
    return matched_ids
def ammend_fasta():
    """Copy all sequences from *fname* to *fname_unspliced* and append
    joined (spliced) protein sequences built from splice-pair partners.

    Reads the module-level ``fname`` / ``fname_unspliced`` paths; writes
    fasta output as a side effect. Returns ``None``.
    """
    def _strip_prefix(text, prefix):
        # str.lstrip(prefix) strips a character *set*, not the literal
        # prefix (e.g. 'NS1x'.lstrip('NS') also eats the leading 'S'),
        # so remove the exact prefix instead.
        return text[len(prefix):] if text.startswith(prefix) else text

    with open(fname_unspliced, 'w') as ofile:
        strain_by_protein = defaultdict(dict)
        for seq in SeqIO.parse(fname, 'fasta'):
            SeqIO.write(seq, ofile, 'fasta')
            prot = seq.name.split('_')[0]
            seq_name = seq.description.split('|')[1]
            strain_by_protein[prot][seq_name] = seq
        joined_seqs = defaultdict(dict)
        # (joined name, 5' partner, 3' partner)
        splice_pairs = [('M', 'M1', 'M2'), ('NS', 'NS1', 'NS2')]
        from seqanpy import align_overlap
        for c, a, b in splice_pairs:
            for strain in strain_by_protein[a]:
                seq1 = strain_by_protein[a][strain]
                new_id = c + _strip_prefix(seq1.id, a)
                new_description = c + _strip_prefix(seq1.description, a)
                new_description = new_description.replace(a, c)
                new_name = c + _strip_prefix(seq1.name, a)
                new_name = new_name.replace(a, c)
                try:
                    score, ali1, ali2 = align_overlap(seq1.seq, strain_by_protein[b][strain].seq,
                                                      score_gapopen=-20, score_gapext=0)
                    # NOTE(review): np.fromstring is deprecated; kept for
                    # behavioural parity - consider np.frombuffer.
                    ali_array = np.array([np.fromstring(x, 'S1') for x in [ali1, ali2]])
                    # Fill gaps in the first partner with the second's bases.
                    tmpseq = np.copy(ali_array[0])
                    tmpseq[ali_array[0] == '-'] = ali_array[1][ali_array[0] == '-']
                    joined_seqs[c][strain] = SeqRecord.SeqRecord(seq=Seq.Seq("".join(tmpseq)), id=new_id,
                                                                 description=new_description, name=new_name)
                    SeqIO.write(joined_seqs[c][strain], ofile, 'fasta')
                except KeyError:
                    # The strain has no partner protein; the previous bare
                    # `except:` also hid alignment errors.
                    print(seq1.name, "doesn't have a partner")
def cctmr_fasta2ref_fasta(fsta_fh, cctmr):
    """Converts concatamer sequence to monomer fasta.

    :param fsta_fh: path to fasta file (only the first record is used)
    :param cctmr: (start, end) 1-based codon coordinates of the monomer
        within the concatamer
    :returns: path to the monomer fasta file
    :raises ValueError: if the input fasta contains no records
    """
    from Bio import SeqIO, Seq, SeqRecord
    from Bio.Alphabet import IUPAC
    fsta_cctmr1_fh = "%s_cctmr1.fasta" % (splitext(fsta_fh)[0])
    fsta_id = None
    with open(fsta_fh, 'r') as fsta_data:
        for fsta_record in SeqIO.parse(fsta_data, "fasta"):
            fsta_id = fsta_record.id
            fsta_seq = str(fsta_record.seq)
            # cctmr holds 1-based codon coordinates; *3 converts to nt.
            fsta_cctmr1_seq = fsta_seq[(cctmr[0] - 1) * 3:(cctmr[1] - 1) * 3]
            break
    if fsta_id is None:
        # Previously this fell through to a NameError on fsta_cctmr1_seq.
        raise ValueError("no fasta records found in %s" % fsta_fh)
    # Write via a context manager instead of manual open/close.
    with open(fsta_cctmr1_fh, "w") as fsta_cctmr1_f:
        fsta_data = SeqRecord.SeqRecord(Seq.Seq(fsta_cctmr1_seq, IUPAC.ExtendedIUPACDNA), id=fsta_id, description='')
        SeqIO.write(fsta_data, fsta_cctmr1_f, "fasta")
    return fsta_cctmr1_fh
def setUp(self):
    """Build the shared fixtures: a BED row, the transcript's cDNA
    sequence, and a fasta index mapping the transcript id to its record."""
    # Tab-separated BED12 row for the transcript under test.
    self.bed_row = "\t".join("TRIAE_CS42_1AL_TGACv1_000002_AA0000030.1 0 3539 TRIAE_CS42_1AL_TGACv1_000002_AA0000030.1|m.13 0 + 2 2969 0 1 3539 0".split())
    # Raw cDNA; embedded newlines are stripped below.
    self.sequence = """ATCGAGCAGATTGGCCGCAACCTACAACTCCCACGGCCCAAGCACTCTCTCTCTCTCTTTCCCTCTCACC
CTCGCCTCCGCTCCCCCATTTCCGAAGTACTCGCGAGCCAGCGGCCTCCAGCTCACCACCGTTTCCGCCG
CGCGCAGATCCGCCCAATCCGTGCAGCCTCAGGCCACCGCTCTGGTTCCGTGACATGTGGCGAGGTGGTG
GCGCAGACGCTGATGCAGGAGGCGCTCGCGAGGCTGAGGAGCACAACAATGTCGAGGAAGAGGAAGGGAG
TGAGGATGGAGATCGGGACCTGCAGAATAAACGTCCTAAAGTGGGTGCTTTTGGCGAAGAAAGCTCTGGT
GTTAATGCATCCTTCTTTGGATATGAAGCACCACATTTGCATGCTTTTGCTGAACATGACCATTTGAAGC
TGTCACATGGTCCAGAAAATGAATTGGATTTTGGTTTGTCGCTTATCTCAAATGATGGTGGGAATGATAT
TCCAAGGGAGACCAACAGTCATGGTGTCTGTGATGTAGAAAGATCAGGTGGAACAAATGCAGAAGATCTT
GAAATAAGAATGGACCTATCTGATGATCTCTTGCACCTGATATTCTCCTTCTTATGCCAGAAGGATTTAT
GTAGAGCAGGGGCTGCCTGCAAACAGTGGCAGTCTGCTAGTATGCATGAGGATTTCTGGAAATATTTGAA
GTTTGAGAACACCAGAATATCTCTGCAGAACTTTGTTAATATTTGCCACCGTTATCAGAATGTGACAAAT
CTCAATTTGTCTGGTGTCTTAAGTGCAGAAAGCCTAGTGATTGAAGCAATAACATTCTTAAGGCATCTTA
AGACCTTGATAATGGGCAAGGGACAACTGGGAGAAACATTTTTTCAGGCTTTGGCTGAATGCCCATTGTT
AAATACTTTAACAGTCAGTGATGCATCCCTTGGTAGTGGCATTCAAGAGGTAACTGTTAATCATGATGGA
TTGCATGAACTTCAAATTGTGAAGTGTCGTGCACTCAGAGTATCTATCAGATGCCACCAACTTCGAATAC
TGTCTCTGAGGAGAACTGGCATGGCTCATGTATCACTCAATTGTCCTCAGTTGCTTGAATTGGATTTTCA
GTCCTGCCATAAGCTTTCTGACACTGCAATTCGTCAAGCAGCGACAGCCTGTCCACTGTTAGCGTCACTA
GATATGTCATCCTGCTCGTGTGTTACTGATGAGACATTGCGTGAGATAGCTAATGCATGTCAAAATCTTT
CTGTTCTTGATGCATCTAACTGCCCCAACATTTCTTTCGAGTCGGTAAAGCTTCCAATGTTGGTAGACTT
GAGACTATCAAGTTGTGAGGGAATCACATCTGCTTCAATGGGTGCAGTATGTTTTAGTCGTATACTTGAG
GCGTTGCAACTTGATAATTGTAGCCTGTTGACATCTGTGTCTTTGGATCTGCCACATCTCAAGAATATTA
GTCTTGTACACCTCCGCAAGTTTGCTGATTTAAATCTGCGAAGCCCTGTGCTTTCTTACATAAAAGTTTC
CAGATGCTCAGCACTTCGTTGTGTTACCATAACATCAAATGCTCTTAAGAAACTGGTGCTTCAAAAACAA
GAGAGCCTATGTAATTTATCATTGCAATGCCACAATTTAATTGATGTTGATCTTAGTGATTGCGAGTCAT
TGACAAATGAGATCTGCAAAGTTCTCAGTGACGGAGGGGGTTGCCCCATGCTCAGGTCATTAATTCTTGA
TAATTGTGAGAGTTTGAGTGTCGTGGAACTGAATAATAGTTCTTTGGTTAATCTCTCACTTGCTGGTTGC
CGTTCCATGACATTCCTGAAACTTGCATGCCCAAAGCTTCAAGTGGTGATTCTTGATGGTTGTGATCATC
TTGAAAGAGCATCATTTTGCCCGGTTGGTCTTGAATCCCTAAACCTTGGAATTTGTCCAAAGTTGAGTGT
TCTACGCATAGAGGCCCCAAATATGTCTATATTGGAGCTGAAGGGCTGTGGTGTCCTTTCTGAGGCTTCA
ATTAATTGTCCTTGCTTGATATCTTTAGATGCCTCTTTCTGCAGACAGTTTATGGATGATTCGCTGTCCC
AAACAGCAGAAGCATGCCCTCTTATTGAACATCTTATATTGTCTTCATGTTTATCCATTGACGTCCGTGG
ATTGTCTTCTCTGCATTGCCTTCAGAAGCTGGCCTTGCTTGACCTATCATATACATTTTTGATGAACTTG
AAGCCGGTTTTTGACAGTTGTCTGCAGTTGAAGGTCTTGAAACTTTCAGCTTGCAAGTATCTCAGTGATT
CATCTTTGGAACCACTCTACAGAGAGGGTGCTCTACCGATGCTCGTTGAGCTAGATCTGTCCTACTCGTC
CATTGGGCAGACTGCAATAGAAGAGCTTCTCGCGTGCTGTACAAATTTGGTTAATGTGAACCTAAACGGA
TGTACGAACTTGCATGAATTGGTATGTGGATCAGACTATTGCCGGTCCGGTGACATGCCAATTGATGCTT
TCCCCCCTGATTCTGCACCAGACAAGACCAAAGAGATCAGGGAGAGTTCGGATTGTCAGCTTGAAGTTCT
CAGTTGTACTGGCTGTCCAAATATTAAGAAAGTTGTTATTCCTTCAACGGCCAACTATCTGAATTTGTCT
AAGATCAACCTTAATTTGTCTGCAAACTTGAAGGAAGTAGATTTGAAGTGCTCCAATCTTTACAATTTAA
ATTTGAGCAATTGTAACTCACTGGAGATTCTGAAGCTTGATTGCCCAAGATTGGCTAACCTCCAACTTTT
GGCATGCACAATGTTGCAAGAGGATGAACTGAAATCTGCACTATCCTTTTGCGGTGCATTGGAGATCCTC
AATGTGCACTCTTGTCCACAAATAAACACGCTGGATTTTGGCAGGCTACAGGCTGTTTGCCCAACTCTTA
AGCGCATCCAGAGCAGCCCCATCGCATAGTATGAAGGATTCTGGTCTTCTTAATGGACTCGAGTAAATAG
TCCAGATTTGAAACAGAAAAGGCCATGTCGTACTCTTGTACATATGCAGCACCGCCAATATATTGTATGG
CTGCATGTATTAGGGAGCCAGGGCTGACATGAAACCTGTTCTTCCAATCGATTTCTTGTGTTGAATCTAG
TTGAAACATGGAAACCGCACTTCCTAGTTTGTATTTGCTTTTGAGGTGCAGTGATGGAGTAAGCAGATCT
GTATTTATATGAATGAATAACCATCTTGTTTGGATCGTCGATGTTGTATGCTTCATTGATGACATGGGGT
GCTAAGTTTGACTGAAATTACACCAGGTTCTATGGTTCTCTCATAAGGTGCAGTGATTCTGCGGTCTTTA
TTAATCTGTCTCAACTGTGACGATGCAACTGAGACGTTTCCATCTGCCGGCTGCTGATGCTGTGAACTCT
TGGTAAAAAACCTGGTGTACTTGATCCAAGAGCATTCGTTGGGTCACTTGTATCCTTGAAAATTGAGTAA
CTAATAAATGCTGTTGTGTAAAAAAAAGGGGCTTTCTTT"""
    # SeqRecord over the newline-free sequence, keyed by transcript id.
    self.seq = SeqRecord.SeqRecord(Seq.Seq(self.sequence.replace("\n", "")), id="TRIAE_CS42_1AL_TGACv1_000002_AA0000030.1")
    self.index = dict()
    self.index["TRIAE_CS42_1AL_TGACv1_000002_AA0000030.1"] = self.seq
def convert_from_subtype_to_hxb2(working_dir, position, orientation, subtype):
    """
    Convert a position number in HXB2 to the equivalent in another subtype.

    Args:
        working_dir: working folder in which to place temporary files
        position: hxb2 coordinate position to convert
        orientation: "reverse" to convert coordinates on the
            reverse-complement strand; anything else means forward
        subtype: subtype position to convert to

    Returns the converted coordinate, or None (implicitly) when *position*
    lies beyond the ungapped length of the subtype sequence.
    """
    sequences = [subtype_sequence(subtype), HXB2()]
    if orientation == "reverse":
        # Reverse-complement both sequences so reverse-strand coordinates
        # line up in the alignment.
        sequences = [SeqRecord.SeqRecord(Seq.reverse_complement(s.seq), id = s.id, name = s.name) for s in sequences]
    alignment = wrappers.mafft(working_dir, sequences)
    hxb2_pos = 0
    subtype_pos = 0
    # Walk alignment columns, counting ungapped positions in each row until
    # the requested coordinate is reached in the subtype row (row 0).
    for i in range(len(alignment[0])):
        if subtype_pos == position:
            return hxb2_pos
        if alignment[0][i] != "-":
            subtype_pos += 1
        if alignment[1][i] != "-":
            hxb2_pos += 1
def test_file_to_seqbias(self):
    """Feed three protein records through the bias analysis and verify the
    sequences and ids come back unchanged."""
    # (id, protein sequence) fixtures.
    seq_list = [
        [
            "test1",
            "IKAAESFLPSPVLRTDVMFLVPALKYNPLHRLLIQILGGHETMIQIGHAETATVKFEERLVERIFDKRAGTSSLILIQIDYDEIQIWPGYSILRLGMPEKDEIQIAIITEMKRGAPHIQIQILDFGPAISFKESWLDCVMGNCYNDIASEIKVRGSDLNKVGVRARKECGVATSPINAFINRLLSATYSVGVNFLAVIQISTGIDKVHTNYDKA"
        ],
        [
            "test2",
            "TTNIISELRCTQTCGNAMDNWMGEVLDGTPAFHFGVHCGDTAGPASKRFLLVCLEFSLRGYDLLVRLLLIKDEDANDVHCNQKCSQCCQKCMAHLALGPVTCSSSFNVHYSPGIGALWACIQTCEIDYCIQPCKACVQSCEERSLKVIKADGITAKSFAPMPNGAVDPSTVEYMVKTLIVCLQTCYDENRTVRRFPEKAL"
        ],
        [
            "test3",
            "YPSSALQGGSMSRFLSPTMLRVRASLGFLGINLLPWTLFVIAALPSKSDAQLSSTQPLSAMGMEFIRANTESEINFVDKIHYAYHNLVVDPRKVDSEIAKERCKLLKSIVQVGSVTFATVPGDSYIGISSRSLMFVSEKNTGRELGNKCSAEQDDSSDQKNSGTAECGKLYSYEQWESTREGVDIIRKKTAVTHSNRQIPSVADHPLFLADAHEG"
        ]
    ]
    # NOTE(review): machine-specific absolute path - this test can only run
    # on the original author's machine; consider a repo-relative path.
    path_in = "/Users/coltongarelli/SequenceAnalyzer/PAM/Tests/Resources/SequenceBiasIOTestFile"
    director = Director()
    director.file_in_path = path_in
    director.start_up()
    seq_record_list = []
    for i in seq_list:
        # NOTE(review): seq is a plain str here, not a Bio.Seq object.
        seq_record_list.append(SeqRecord.SeqRecord(id=i[0], seq=i[1]))
    director.master_list = seq_record_list
    director.run_bias_analysis()
    # The analysis must not mutate the records it was given.
    for i in range(len(seq_list)):
        self.assertEqual(director.master_list[i].seq, seq_list[i][1])
        self.assertEqual(director.master_list[i].id, seq_list[i][0])
def fetch(self, accessions: Iterable[str]) -> Iterator[SeqRecord.SeqRecord]:
    """Fetch genbank records.

    Args:
        accessions (Iterable[str]): An iterable object that have accession numbers.

    Yields:
        Iterator[SeqRecord.SeqRecord]: An SeqRecord iterator. Accessions
        that repeatedly fail are yielded as empty placeholder records.
    """
    for use_accs in utils.split_per_n(accessions, self.n_once):
        use_accs = tuple(use_accs)
        queue = deque()
        queue.append(use_accs)
        # If a download fails, halve the query and retry (binary back-off
        # over the accession tuple).
        while queue:
            query = queue.popleft()
            try:
                for r in self._efetch(query):
                    yield r
            except Exception:
                # BUG FIX: the original `if n := len(query) > 1` bound the
                # *boolean* to n (PEP 572 precedence), so the halves were
                # query[:0] and query[0:] and the loop never terminated.
                if (n := len(query)) > 1:
                    queue.append(query[:n // 2])
                    queue.append(query[n // 2:])
                else:
                    for acc in query:
                        print(f"Cannot fetch {acc}.")
                        yield SeqRecord.SeqRecord(seq="", name=acc)
def get_aln(self, internal=False):
    """assemble a multiple sequence alignment from the evolved sequences.
    Optionally include internal sequences

    Parameters
    ----------
    internal : bool, optional
        include sequences of internal nodes in the alignment

    Returns
    -------
    Bio.Align.MultipleSeqAlignment
        multiple sequence alignment
    """
    from Bio import SeqRecord, Seq
    from Bio.Align import MultipleSeqAlignment
    tmp = []
    # Walk every clade so internal nodes can actually be included.
    # BUG FIX: the original iterated get_terminals(), where is_terminal()
    # is always True - the `internal` flag was a no-op.
    for n in self.tree.find_clades():
        if n.is_terminal() or internal:
            tmp.append(
                SeqRecord.SeqRecord(
                    id=n.name,
                    name=n.name,
                    description='',
                    seq=Seq.Seq(''.join(
                        n.ancestral_sequence.astype('U')))))
    return MultipleSeqAlignment(tmp)
def main(input, output, target):
    """Rank sequences by mean pairwise distance ("transmissibility") and
    write them, most central first, to *output* if more than *target* exist.

    :param input: path to the input fasta
    :param output: path to the output fasta (only written when the number
        of sequences exceeds *target*)
    :param target: minimum sequence count required to write output
    :return: tuple (input path, number of input seqs, number of output seqs)
    """
    # Get sequences from a file; the dict keyed by sequence detects
    # duplicate sequences (it would shrink relative to the list).
    records = {}
    seqs = []
    with open(input, 'r') as f:
        for record in SeqIO.parse(f, 'fasta'):
            records[record.seq] = record
            seqs.append(record.seq)
    if len(records) != len(seqs):
        sys.exit('records different length from seqs! report')
    nseq = len(seqs)
    from collections import defaultdict
    transmissibility = defaultdict(list)
    DM = calc_distance_matrix(seqs)
    for idx in range(len(DM)):
        seq = seqs[idx]
        row = DM[idx]
        # Mean distance to all other sequences.
        centralTemp = np.divide(sum(row), nseq - 1, dtype=float)
        transmissibility[centralTemp].append(seq)
    outD = []
    for transVal, subseqs in sorted(transmissibility.items(), reverse=True):
        for seq in subseqs:
            # BUG FIX: every record previously got the id '>seq_1' - the
            # '>' is added by the fasta writer (giving '>>seq_1') and all
            # ids collided. Number the records uniquely instead.
            outD.append(SeqRecord.SeqRecord(seq, id='seq_%d' % (len(outD) + 1)))
    print(len(outD), target)
    if len(outD) > target:
        with open(output, 'w') as f:
            SeqIO.write(outD, f, 'fasta')
    return (input, len(seqs), len(outD))
def bam_to_rec(in_file, make_unique_recs=False):
    """
    Generator to convert BAM files into Biopython SeqRecords.

    :param in_file: path to a BAM file
    :param make_unique_recs: if True, suffix every record name with a
        running number so duplicate read IDs stay distinct; if False,
        yield each read ID at most once
    :yields: Bio.SeqRecord.SeqRecord per (kept) read, reverse-complemented
        for reverse-strand reads
    """
    from Bio import SeqIO, Seq, SeqRecord
    bam_file = pysam.Samfile(in_file, "rb")
    rec_num = 1
    # Track read IDs already yielded (a set, not a dict-as-set).
    read_ids_outputted = set()
    for read in bam_file:
        seq = Seq.Seq(read.seq)
        if read.is_reverse:
            seq = seq.reverse_complement()
        read_name = read.qname
        if make_unique_recs:
            read_name = "%s_%d" % (read_name, rec_num)
        else:
            # Don't output the same read ID twice.
            if read_name in read_ids_outputted:
                continue
            read_ids_outputted.add(read_name)
        rec = SeqRecord.SeqRecord(seq, read_name, "", "")
        rec_num += 1
        yield rec
def construct_supermatrix(coregenome, alifins, supermatrixfout):
    """Concatenate per-orthogroup alignments into one supermatrix fasta.

    :param coregenome: DataFrame with (at least) the columns ``genome``,
        ``orthogroup`` and ``gene``.
    :param alifins: iterable of paths to per-orthogroup alignment fastas;
        file basenames must match the orthogroup names.
    :param supermatrixfout: output fasta path (opened in append mode).
    """
    supermatrix = {}
    genomes = list(set(coregenome.genome))
    n_genomes = len(genomes)
    # One growing record per genome.
    for genome in genomes:
        supermatrix[genome] = SeqRecord.SeqRecord(id = genome, seq = "", description = "")
    alifindict = {filename_from_path(alifin): alifin for alifin in alifins}
    n_fams_sc = 0
    for orthogroup, rows in coregenome.groupby("orthogroup"):
        alifin = alifindict[orthogroup]
        sequencedict = {}
        for record in SeqIO.parse(alifin, "fasta"):
            # alilen: alignment length of this family, used to pad genomes
            # missing the family. NOTE(review): stays undefined (NameError
            # below) if the alignment file has no records - confirm inputs.
            alilen = len(record.seq)
            sequencedict[record.id] = record.seq
        # Drop genomes represented more than once in this orthogroup.
        rows = rows.drop_duplicates("genome", keep = False)
        # Left-merge so every genome gets a row (gene is NaN when absent).
        rows = pd.merge(pd.DataFrame({"genome": genomes}), rows, how = "left")
        for ix, row in rows.iterrows():
            # Genomes missing the gene receive an all-gap column block.
            sequence_to_add = sequencedict.get(row.gene, "-" * alilen)
            supermatrix[row.genome] = supermatrix[row.genome] + sequence_to_add
    with open(supermatrixfout, "a") as supermatrixhout:
        for genome in supermatrix:
            SeqIO.write(supermatrix[genome], supermatrixhout, "fasta")
def assemble_contigs(contig_list, reference, join_threshold):
    """Merge contigs that map close together on *reference*.

    :param contig_list: contigs (Sequence objects) to assemble.
    :param reference: id of the reference used to look up blast hits.
    :param join_threshold: maximum reference-coordinate gap (bp) allowed
        between two contigs for them to be joined; a negative value joins
        all contigs unconditionally.
    :return: list of assembled Sequence objects (unmapped contigs are
        passed through unchanged).
    """
    sol = []
    raw_list = []
    for contig in contig_list:
        contig.set_borders(reference)
        if contig.ref_start is None:
            # Contig does not map to the reference: keep it as-is.
            sol.append(contig)
        else:
            raw_list.append(contig)
    contigs_by_start = sorted(raw_list, key=lambda contig: contig.ref_start)
    contigs_by_end = sorted(raw_list, key=lambda contig: contig.ref_end)
    if join_threshold < 0:
        logger.info('\tnegative join_threshold. Must join all the contigs')
    while len(contigs_by_end) > 0:
        # Seed a new merged contig with the left-most remaining one.
        actual_contig = contigs_by_end[0]
        name = actual_contig.id
        logger.info('\tLooking for contigs near {}'.format(name))
        sequence = actual_contig.seq
        start = actual_contig.ref_start
        end = actual_contig.ref_end
        # NOTE(review): shadows the builtin dict() for the rest of the loop.
        dict = actual_contig.blasts_dict
        contigs_by_start.remove(actual_contig)
        used_contigs = []
        j = 1
        for contig in contigs_by_start:
            # Contigs are start-sorted, so the first one too far away ends
            # the scan (break below).
            if actual_contig.ref_seq == contig.ref_seq and ( join_threshold < 0 or end + join_threshold > contig.ref_start):
                name += contig.id
                sequence += contig.seq
                start = min(start, contig.ref_start)
                end = max(end, contig.ref_end)
                dict[reference] += contig.blasts_dict[reference]
                used_contigs.append(contig)
                if join_threshold >= 0:
                    logger.info(
                        'contig {} is near enough: end + join_threshold > contig.ref_start --> {} + {} = {} > '
                        '{}'.format(contig.id, end, join_threshold, end + join_threshold, contig.ref_start))
                j += 1
            else:
                break
        for elem in used_contigs:
            contigs_by_start.remove(elem)
            contigs_by_end.remove(elem)
        # Build the merged Sequence and carry over the combined metadata.
        new_contig = Sequence(SeqRecord.SeqRecord(sequence, id=name), ref_seq=actual_contig.ref_seq)
        new_contig.ref_start = start
        new_contig.ref_end = end
        new_contig.blasts_dict = dict
        sol.append(new_contig)
        contigs_by_end.remove(actual_contig)
        if j == 1:
            logger.info('No contigs near enough to join')
        else:
            logger.info(
                'Search finished. {} contigs have been joined\n'.format(j))
    logger.info(
        'Finish assembling contigs with reference {}\n'.format(reference))
    return sol
def swissrec2seqrec(record):
    """Convert a SwissProt record object into a Bio.SeqRecord.

    :param record: SwissProt record with ``sequence``, ``description``,
        ``accessions`` and ``entry_name`` attributes
    :return: equivalent ``SeqRecord`` (id = primary accession)
    """
    # BUG FIX: Bio.Seq has no IUPAC attribute, so Seq.IUPAC.protein raised
    # AttributeError; alphabets were removed from Biopython anyway, so a
    # plain Seq is correct.
    seq = Seq.Seq(record.sequence)
    s = SeqRecord.SeqRecord(seq,
                            description=record.description,
                            id=record.accessions[0],
                            name=record.entry_name)
    return s
def make_alignment_upper_case(orig_aln):
    '''
    Transform all characters to uppercase.

    :param orig_aln: Biopython alignment
    :type orig_aln: :class:`Bio.Align.MultipleSeqAlignment`

    :returns: Uppercase Biopython alignment, for
        :class:`Bio.Align.MultipleSeqAlignment`.
    :rtype: list(:class:`Bio.SeqRecord.SeqRecord`)

    Example::

        >>> alignment = AlignIO.read(open('seq.txt', 'rU'), 'fasta')
        >>> new_alignment = Sequence.make_alignment_upper_case(alignment)
        >>> print Align.MultipleSeqAlignment(new_alignment)
        Alphabet() alignment with 3 rows and 178 columns
        IIGP--GR-GFGKRRHPKKLTPLAYKQFIPNVAEKT...SGG 3M1N:A|PDBID|CHAIN|SEQUENCE
        IIGP--GRPGFGKRRHPKKLTPLAYKQFIPNVAEKT...SGG PRO1:A|NAME1|CHAIN|SEQUENCE
        IIGPXXGRCGFGKRRHPKKLTPLAYKQFIPNVAEKT...SGG PRO2:A|NAME2|CHAIN|SEQUENCE
    '''
    aln = []
    for record in orig_aln:
        # BUG FIX: Seq.tostring() was removed from Biopython long ago;
        # str(record.seq) is the supported spelling.
        aln.append(SeqRecord.SeqRecord(Seq.Seq(str(record.seq).upper()),
                                       record.id,
                                       description=record.description))
    return aln
def translate_dealign(aln):
    """De-align the aligned sequences (remove gaps), and translate them to
    amino acid. Write the de-aligned amino acid sequences to a temporary
    file, to align later."""
    to_realign = []
    for rec in SeqIO.parse(open(aln), 'fasta'):
        # Strip alignment gaps, translate, and keep everything before the
        # first stop codon.
        ungapped = ''.join(base for base in rec.seq if base != '-')
        protein = str(Seq.Seq(ungapped).translate()).split('*')[0]
        to_realign.append(
            SeqRecord.SeqRecord(
                Seq.Seq(protein),
                id=rec.id,
                name='',
                description=''
            )
        )
    # Stage the de-aligned proteins in a named temp file for the aligner.
    staging = tempfile.NamedTemporaryFile(
        mode='w+t',
        prefix='msaprobs_realign',
        suffix='.fasta'
    )
    SeqIO.write(to_realign, staging.name, 'fasta')
    return staging
def test_internal(self):
    """An internal ORF (no start, no stop) on a minus-strand transcript:
    phase, start/stop flags and CDS boundaries must all be inferred
    correctly."""
    # Transcript cDNA; the newlines are collapsed just below.
    sequence = """TCCTCACAGTTACTATAAGCTCGTCTATGGCCAGAGACGGTGGTGTTTCTTGTTTACGAA
GGTCGGAGATGATGAGCGTCGGTGGTATCGGAGGAATTGAATCTGCGCCGTTGGATTTAG
ATGAAGTTCATGTCTTAGCCGTTGATGACAGTCTCGTTGATCGTATTGTCATCGAGAGAT
TGCTTCGTATTACTTCCTGCAAAGTTACGGCGGTAGATAGTGGATGGCGTGCTCTGGAAT
TTCTAGGGTTAGATAATGAGAAAGCTTCTGCTGAATTCGATAGATTGAAAGTTGATTTGA
TCATCACTGATTACTGTATGCCTGGAATGACTGGTTATGAGCTTCTCAAGAAGATTAAGG
AATCGTCCAATTTCAGAGAAGTTCCGGTTGTAATCATGTCGTCGGAGAATGTATTGACCA
GAATCGACAGATGCCTTGAGGAAGGTGCTCAAGATTTCTTATTGAAACCGGTGAAACTCG
CCGACGTGAAACGTCTGAGAAGTCATTTAACTAAAGACGTTAAACTTTCCAACGGAAACA
AACGGAAGCTTCCGGAAGATTCTAGTTCCGTTAACTCTTCGCTTCCTCCACCGTCACCTC
CGTTGACTATCTCGCCTGA"""
    sequence = sub("\n", "", sequence)
    record = SeqRecord.SeqRecord(Seq.Seq(sequence), id="class_Chr1.1006.0")
    index = {record.id: record}
    # Transcriptomic BED12 line describing an internal ORF in phase 2.
    line = "\t".join(
        ['class_Chr1.1006.0', '0', '619',
         'ID=class_Chr1.1006.0|m.22308;class_Chr1.1006.0|g.22308;ORF_class_Chr1.1006.0|g.22308_class_Chr1.1006.0|m.22308_type:internal_len:206_(+)',
         '0', '+', '2', '617', '0', '1', '619', '0'])
    bed_line = bed12.BED12(line, transcriptomic=True, fasta_index=index)
    self.assertFalse(bed_line.invalid, bed_line.invalid_reason)
    # Recompute the peptide honouring the phase offset; trim to a codon
    # multiple before translating.
    pep = sequence[bed_line.thick_start - 1 + 2:bed_line.thick_end]
    if len(pep) % 3 != 0:
        pep = pep[:-(len(pep) % 3)]
    pep = str(Seq.Seq(pep).translate())
    self.assertEqual(bed_line.phase, 2, (bed_line.thick_start, bed_line.thick_end, pep))
    # Internal ORF: neither codon present.
    self.assertFalse(bed_line.has_start_codon)
    self.assertFalse(bed_line.has_stop_codon)
    # Minus-strand transcript model for the same ORF.
    lines = """Chr1	CLASS	transcript	3442811	3443785	1000	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0"; exon_number "1"; Abundance "22.601495"; canonical_proportion "1.0";
Chr1	CLASS	exon	3442811	3442999	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
Chr1	CLASS	exon	3443099	3443169	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
Chr1	CLASS	exon	3443252	3443329	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
Chr1	CLASS	exon	3443417	3443493	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
Chr1	CLASS	exon	3443582	3443785	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";"""
    lines = [GTF.GtfLine(_) for _ in lines.split("\n") if _]
    transcript = Transcript(lines[0])
    transcript.add_exons(lines[1:])
    transcript.finalize()
    transcript.load_orfs([bed_line])
    self.assertTrue(transcript.is_coding)
    self.assertFalse(transcript.has_start_codon)
    self.assertFalse(transcript.has_stop_codon)
    # On the minus strand the CDS end maps to the genomic start and
    # vice versa.
    self.assertEqual(transcript.selected_cds_end, transcript.start)
    self.assertEqual(transcript.selected_cds_start, transcript.end)
def replace_indels(sam_filename_in, templ_filename, sam_filename_out):
    '''Replace indels, replacing them with wildtype.

    :param sam_filename_in: input SAM file of reads aligned to the template
    :param templ_filename: fasta file holding the template sequence
    :param sam_filename_out: output SAM filename (normalised via io_utils)
    :return: the resolved output SAM filename
    '''
    sam_filename_out = io_utils.get_filename(sam_filename_out)
    templ_seq = get_seq(templ_filename)
    records = []
    all_reads = 0
    for read in Samfile(sam_filename_in, 'r'):
        # Perform mapping of nucl indices to remove spurious indels:
        all_reads += 1
        # aligned_pairs yields (query_pos, ref_pos); ref-only positions
        # (deletions in the read) are filled from the template.
        # BUG FIX: the original tested `if pair[0]`, which is falsy for
        # query position 0, so the first base was wrongly taken from the
        # template - test against None explicitly.
        seq = ''.join([
            read.seq[pair[0]] if pair[0] is not None else templ_seq[pair[1]]
            for pair in read.aligned_pairs
            if pair[1] is not None
        ])
        if seq:
            records.append(
                SeqRecord.SeqRecord(Seq.Seq(seq), read.qname, '', ''))
    reads_filename = io_utils.get_filename(None)
    with open(reads_filename, 'w') as fle:
        SeqIO.write(records, fle, 'fasta')
    # Re-align the cleaned reads against the template.
    utils.mem(templ_filename, reads_filename,
              out_filename=sam_filename_out,
              gap_open=12)
    print('%s: %i/%i passed replace_indels filter' %
          (sam_filename_in, len(records), all_reads))
    return sam_filename_out
def backtranslate(amino_acid, nucleotide):
    """Use the nucleotide sequences to convert the aligned amino acids back
    to nucleotides.

    :param amino_acid: path to the aligned protein fasta (name must contain
        'msaprobs.fasta'; the output name is derived from it)
    :param nucleotide: path to the unaligned nucleotide fasta with matching ids
    :return: None (writes the back-translated fasta to disk)
    """
    # Store the non-gapped nucleotide sequences as a dictionary
    nuc_seqs = []
    for s in SeqIO.parse(nucleotide, 'fasta'):
        snogap = ''.join([b for b in s.seq if b != '-'])
        nuc_seqs.append(
            SeqRecord.SeqRecord(
                seq=Seq.Seq(snogap),
                id=s.id,
                name='',
                description=''
            )
        )
    nuc_seqs = SeqIO.to_dict(nuc_seqs)
    # Then, back-translate the aligned sequences
    bt_seq = []
    for prot in SeqIO.parse(amino_acid, 'fasta'):
        codon = 0
        bt = ''
        # BUG FIX: work on the raw nucleotide string - slicing the
        # SeqRecord itself yields SeqRecord objects that cannot be
        # concatenated onto a str.
        nuc = str(nuc_seqs[prot.id].seq)
        for aa in prot:
            if aa == '-':
                # Protein gap -> codon-sized nucleotide gap.
                bt += '---'
            else:
                bt += nuc[codon*3:(codon*3)+3]
                codon += 1
        # BUG FIX: SeqIO.write needs SeqRecord objects, not bare strings.
        bt_seq.append(
            SeqRecord.SeqRecord(
                seq=Seq.Seq(bt),
                id=prot.id,
                name='',
                description=''
            )
        )
    # Write the backtranslated sequences to disk
    bt_name = amino_acid.replace('msaprobs.fasta', 'backtranslated.fasta')
    SeqIO.write(bt_seq, bt_name, 'fasta')
    return
def _predict_best_protein_pyopa(self, record, og):
    """
    Given a list of sequences that are derived from mapped reads to
    multiple seq of a OG we find the best corresponding mapped seq by
    comparing it with a representative sequence of the original OG using
    pyopa local alignment and return the sequence with its highest score!

    :param record: nucleotide SeqRecord to translate and score
    :param og: orthologous group; ``og.aa[0]`` is used as the reference
    :return: SeqRecord holding the best-scoring translated frame
    :raises ValueError: if translation/alignment of the record fails
    """
    ref_og_seq = og.aa[0]
    s1 = pyopa.Sequence(str(ref_og_seq.seq))
    best_score = 0
    try:
        # Translate the read in all three forward frames.
        frames = [
            record.seq[i:].translate(table='Standard', stop_symbol='X', to_stop=False, cds=False)
            for i in range(3)
        ]
        best_seq_idx = 0
        for i, seq in enumerate(frames):
            s2 = pyopa.Sequence(str(seq))
            # calculating local and global scores for the given sequences
            local_double = pyopa.align_double(s1, s2, self.env)
            # print('Local score: %f' % local_double[0])
            # Keep the frame with the highest local alignment score.
            if local_double[0] > best_score:
                best_score = local_double[0]
                best_seq_idx = i
        best_translation = SeqRecord.SeqRecord(
            frames[best_seq_idx],
            id=self._species_name,
            description=record.description,
            name=record.name)
    except:
        # NOTE(review): bare except re-raised as ValueError - any failure
        # in translation or alignment lands here.
        raise ValueError("Problem with sequence format!", ref_og_seq.seq)
    return best_translation
def start_record(self):
    """Reset the parser state so a fresh record can be accumulated."""
    blank = SeqRecord.SeqRecord(Seq.Seq("", self.alphabet))
    blank.description = ""
    blank.name = ""
    self.data = blank
    self._current_ref = None
    self._sequence_lines = []
def create_reference_region_with_specific_repeats(reference_vntr, desired_repeats_count, output_name, flanks=30000, repeat_patterns=None):
    """Write a fasta file containing the VNTR region rebuilt with a chosen
    number of repeat units.

    :param reference_vntr: VNTR object (chromosome, start_point,
        get_length(), get_repeat_segments())
    :param desired_repeats_count: number of repeat units to place
    :param output_name: path of the fasta file to write
    :param flanks: flanking bases kept on each side; ``None`` keeps the
        whole chromosome
    :param repeat_patterns: optional repeat units to cycle through instead
        of the reference's own segments
    """
    record = SeqRecord.SeqRecord('')
    sequence = get_chromosome_reference_sequence(reference_vntr.chromosome)
    vntr_end = reference_vntr.start_point + reference_vntr.get_length()
    if flanks is None:
        region_start = 0
        region_end = len(sequence)
    else:
        # Clamp to the chromosome: a negative slice start would silently
        # wrap around to the chromosome end and yield a bogus region.
        region_start = max(0, reference_vntr.start_point - flanks)
        region_end = min(len(sequence), vntr_end + flanks)
    new_sequence = sequence[region_start:reference_vntr.start_point]
    if repeat_patterns is None:
        repeats = reference_vntr.get_repeat_segments()
    else:
        repeats = repeat_patterns
    # Cycle through the repeat units until the requested count is reached.
    for i in range(desired_repeats_count):
        new_sequence += repeats[i % len(repeats)]
    new_sequence += sequence[vntr_end:region_end]
    record.seq = Seq.Seq(new_sequence)
    with open(output_name, 'w') as output_handle:
        SeqIO.write([record], output_handle, 'fasta')
def fasta_nts2prt(fsta_fh, host='coli', fsta_prt_fh=None):
    """
    Translates nucleotide fasta to amino acid fasta

    :param fsta_fh: path to fasta file (only the first record is used)
    :param host: host organism e.g. E. coli
    :param fsta_prt_fh: path to fasta protein sequence; defaults to
        "<input>_prt<ext>"
    :returns fsta_seq_prt: fasta protein sequence
    :raises ValueError: if the input fasta contains no records
    """
    from dms2dfe.lib.convert_seq import cds2aas
    from Bio import SeqIO, Seq, SeqRecord
    from Bio.Alphabet import IUPAC
    fsta_seq = None
    with open(fsta_fh, 'r') as fsta_data:
        for fsta_record in SeqIO.parse(fsta_data, "fasta"):
            fsta_seq = str(fsta_record.seq)
            break
    if fsta_seq is None:
        # Previously this fell through to a NameError on fsta_seq.
        raise ValueError("no fasta records found in %s" % fsta_fh)
    if fsta_prt_fh is None:
        fsta_prt_fh = "%s_prt%s" % (splitext(fsta_fh)[0], splitext(fsta_fh)[1])
    fsta_seq_prt = cds2aas(fsta_seq, host, stop_codon='*')
    fsta_seq_prt_id = splitext(basename(fsta_fh))[0] + '_prt'
    fsta_data_prt = SeqRecord.SeqRecord(Seq.Seq(fsta_seq_prt, IUPAC.protein), id=fsta_seq_prt_id, description='')
    # Write via a context manager instead of manual open/close.
    with open(fsta_prt_fh, "w") as fsta_prt_f:
        SeqIO.write(fsta_data_prt, fsta_prt_f, "fasta")
    return fsta_seq_prt
def identify_similar_regions_for_vntrs_using_blat():
    """For every reference VNTR, BLAT its pattern plus 30 bp of flank
    against all VNTR structures and append the ids with similar regions to
    ``similar_vntrs.txt``. Runs up to 7 workers in parallel."""
    from multiprocessing import Process, Semaphore, Manager
    reference_vntrs = load_unique_vntrs_data()
    # Build the BLAT target database: pattern + 30 bp flanks per VNTR.
    records = []
    for ref_vntr in reference_vntrs:
        record = SeqRecord.SeqRecord('')
        sequence = ref_vntr.left_flanking_region[
            -30:] + ref_vntr.pattern + ref_vntr.right_flanking_region[:30]
        record.seq = Seq.Seq(sequence)
        record.id = str(ref_vntr.id)
        records.append(record)
    vntr_structures_file = 'reference_vntr_structures.fa'
    with open(vntr_structures_file, 'w') as output_handle:
        SeqIO.write(records, output_handle, 'fasta')
    # At most 7 concurrent BLAT searches; results collected via a
    # manager-backed shared list.
    sema = Semaphore(7)
    manager = Manager()
    result_list = manager.list()
    process_list = []
    for ref_vntr in reference_vntrs:
        sema.acquire()
        p = Process(target=find_similar_region_for_vntr,
                    args=(sema, ref_vntr, vntr_structures_file, result_list))
        process_list.append(p)
        p.start()
    for p in process_list:
        p.join()
    result_list = list(result_list)
    with open('similar_vntrs.txt', 'a') as out:
        for vntr_id in result_list:
            out.write('%s\n' % vntr_id)
def split_hypermuts(self, hm_columns):
    """Produce the hypermut positive and hypermut negative alignments

    :param hm_columns: 1-based alignment column numbers flagged as
        hypermutated.
    :return: self, with ``hm_pos_aln`` and ``hm_neg_aln`` attributes set.
    """
    # Unique, sorted 0-based column indices.
    hm_indices = list(set(map(lambda n: n - 1, hm_columns)))
    hm_indices.sort()
    # soi is either a seq or index - handle appropriately
    def hyp_reducer(soi, i):
        seq1 = self[:, soi:soi + 1] if type(soi) == int else soi
        seq2 = self[:, i:i + 1]
        return seq1 + seq2
    # Empty alignment with matching record ids to seed the reduce.
    # NOTE(review): xrange (and bare reduce) are Python-2 built-ins.
    init = type(self)([
        SeqRecord.SeqRecord(Seq.Seq(''), id=self[i].id)
        for i in xrange(len(self))
    ])
    # Positive alignment: the flagged columns, concatenated in order.
    self.hm_pos_aln = reduce(hyp_reducer, hm_indices, init)
    if hm_indices:
        # Negative alignment: every column segment between (and around)
        # the flagged columns.
        self.hm_neg_aln = self[:, :hm_indices[0]]
        n_hypermut = len(hm_indices)
        for i in range(0, n_hypermut - 1):
            start_i = hm_indices[i] + 1
            stop_i = hm_indices[i + 1]
            self.hm_neg_aln += self[:, start_i:stop_i]
        self.hm_neg_aln += self[:, hm_indices[-1] + 1:]
    else:
        # No hypermutated columns: the negative alignment is everything.
        self.hm_neg_aln = self
    return self
def find_similar_region_for_vntr(sema, reference_vntr, ref_file, result_list):
    """BLAT one VNTR's pattern (plus 30 bp flanks) against *ref_file* and
    append its id to *result_list* if a similar region exists.

    :param sema: multiprocessing.Semaphore limiting concurrent workers;
        always released on exit
    :param reference_vntr: VNTR object providing id, pattern and flanks
    :param ref_file: BLAT database fasta
    :param result_list: shared (manager) list collecting matching VNTR ids
    """
    from Bio import SearchIO
    try:
        vntr_id = reference_vntr.id
        q = reference_vntr.left_flanking_region[-30:] + reference_vntr.pattern + reference_vntr.right_flanking_region[:30]
        search_index = vntr_id
        qfile = settings.BLAST_TMP_DIR + str(vntr_id) + '_' + str(search_index) + '_query.fasta'
        with open(qfile, "w") as output_handle:
            my_rec = SeqRecord.SeqRecord(seq=Seq.Seq(q), id='query', description='')
            SeqIO.write([my_rec], output_handle, 'fasta')
        output = 'blat_out/output_%s_%s.psl' % (vntr_id, search_index)
        # NOTE(review): shell command built by string interpolation - fine
        # for internally generated ids, but subprocess.run with a list
        # would be safer.
        command = 'blat -q=dna -oneOff=1 -tileSize=8 -stepSize=3 -minIdentity=75 %s %s %s' % (
            ref_file, qfile, output)
        os.system(command)
        os.system('rm %s' % qfile)
        try:
            qresult = SearchIO.read(output, 'blat-psl')
            if is_false_vntr_hit(qresult, reference_vntr):
                print('there is similar sequence for %s' % vntr_id)
                result_list.append(vntr_id)
        except ValueError:
            # SearchIO.read raises ValueError for empty/missing BLAT
            # output: no hit for this VNTR.
            pass
    finally:
        # BUG FIX: release unconditionally - an unexpected exception used
        # to skip release() and starve the worker pool.
        sema.release()
def translate_cds(record, table):
    """Translate every CDS feature of *record* and return one SeqRecord
    holding the concatenated protein sequence.

    :param record: annotated SeqRecord (e.g. parsed from GenBank) whose CDS
        features carry a ``codon_start`` qualifier
    :param table: NCBI translation table number/name passed to translate()
    :return: SeqRecord with the joined translations; id/description and the
        gi/organism/taxonomy annotations are copied from *record*
    """
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet import generic_protein
    # Translation tables
    # http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
    # Extract CDS features.
    feature_list = [f for f in record.features if f.type.lower() == 'cds']
    extraction_list = []
    for f in feature_list:
        e = f.extract(record)
        # codon_start is 1-based; trim any partial leading codon.
        start = int(f.qualifiers['codon_start'][0])
        if start > 1:
            e = e[start - 1:len(e)]
        extraction_list.append(e)
    translation_list = [e.seq.translate(table=table) for e in extraction_list]
    # ''.join avoids the quadratic `seq = seq + str(t)` concatenation loop.
    seq = Seq(''.join(str(t) for t in translation_list), generic_protein)
    rec = SeqRecord(seq)
    rec.name = record.id.split('.')[0]
    rec.description = record.description
    rec.annotations['gi'] = record.annotations['gi']
    rec.annotations['organism'] = record.annotations['organism']
    rec.annotations['taxonomy'] = record.annotations['taxonomy']
    rec.id = record.id
    return rec