def blastp(self, seq1, seq2, evalue=10e6):
    """Run a pairwise ``blastp`` of ``seq1`` against ``seq2`` and return the best hit.

    Both sequences are written as FASTA files under ``tmp_dir``, ``blastp`` is
    invoked through the shell with tab-separated output, and the hit with the
    smallest e-value is upserted into ``self.collection`` and returned.

    Args:
        seq1: Query; must expose ``uid`` and ``seq`` attributes.
        seq2: Subject; must expose ``uid`` and ``seq`` attributes.
        evalue: Threshold passed to ``blastp -evalue``.

    Returns:
        The ``HSP`` with the lowest e-value. When BLAST exits with a non-zero
        status or reports no hits, a placeholder ``HSP`` with zeroed fields
        and an e-value of ``evalue * 10`` is returned instead (and nothing is
        written to the collection).

    Raises:
        AssertionError: if BLAST reported success but produced no output file.
    """
    from shlex import quote

    def no_hit():
        # Placeholder returned when there is nothing to report; built lazily
        # so HSP is only constructed on the paths that need it.
        return HSP([
            seq1.uid, seq2.uid, 0., 0., 0., 0., 0., 0., 0., 0., evalue * 10, 10e-6
        ])

    query_pth = os.path.join(tmp_dir, "%s.seq" % seq1.uid)
    subject_pth = os.path.join(tmp_dir, "%s.seq" % seq2.uid)
    output_pth = os.path.join(tmp_dir, "%s_%s.out" % (seq1.uid, seq2.uid))

    # Close (and therefore flush) the FASTA files before the external process
    # reads them; relying on garbage collection to do so is not portable.
    with open(query_pth, 'w+') as handle:
        SeqIO.write(SeqRecord(BioSeq(seq1.seq), seq1.uid), handle, "fasta")
    with open(subject_pth, 'w+') as handle:
        SeqIO.write(SeqRecord(BioSeq(seq2.seq), seq2.uid), handle, "fasta")

    # Arguments are shell-quoted so paths containing spaces or shell
    # metacharacters (uids end up in the file names) cannot break the command.
    cline = "blastp -query %s -subject %s -outfmt 6 -out %s -evalue %s 1>/dev/null 2>&1" \
            % (quote(query_pth), quote(subject_pth), quote(output_pth), quote(str(evalue)))

    if os.WEXITSTATUS(os.system(cline)) != 0:
        print("BLAST failed unexpectedly (%s, %s)" % (seq1.uid, seq2.uid))
        return no_hit()

    assert os.path.exists(output_pth), "BLAST produced no output file: %s" % output_pth
    with open(output_pth, 'r') as f:
        hits = [HSP(line.split('\t')) for line in f]

    if not hits:
        return no_hit()

    hsp = hits[np.argmin([h.evalue for h in hits])]
    try:
        self.collection.update_one({"_id": hsp.uid}, {"$set": vars(hsp)}, upsert=True)
    except DuplicateKeyError:
        # Deliberate best-effort: a duplicate key is ignored and the hit is
        # still returned to the caller.
        pass
    return hsp
def prepare_blast(sequences):
    """Write ``sequences`` to a FASTA file and build a protein BLAST database from it.

    Args:
        sequences: Mapping of sequence uid -> sequence string.

    Returns:
        Path of the FASTA file under ``tmp_dir`` (named ``blast-<MM-DD-YYYY>``
        after today's date) that ``makeblastdb`` was run on.
    """
    from shlex import quote

    timestamp = datetime.date.today().strftime("%m-%d-%Y")
    blastdb_pth = os.path.join(tmp_dir, 'blast-%s' % timestamp)
    records = [SeqRecord(BioSeq(seq), uid) for uid, seq in sequences.items()]

    # The file must be closed (flushed) before makeblastdb reads it.
    with open(blastdb_pth, 'w+') as handle:
        SeqIO.write(records, handle, "fasta")

    # Quote the path so a tmp_dir containing spaces does not split the command.
    # The exit status is ignored, as before.
    os.system("makeblastdb -in %s -dbtype prot" % quote(blastdb_pth))
    return blastdb_pth
def compute_blast_parallel(uid2seq, db_pth, collection, evalue):
    """BLAST every sequence in ``uid2seq`` against ``db_pth`` and store the hits.

    Each sequence is wrapped in a ``SeqRecord`` and dispatched through
    ``E.map``; every HSP that comes back is upserted into ``collection``
    keyed by ``hsp.uid``. A progress bar advances once per processed sequence.

    Args:
        uid2seq: Mapping of sequence uid -> sequence string.
        db_pth: BLAST database path forwarded to ``parallel_blast``.
        collection: Collection object exposing ``update_one``.
        evalue: E-value threshold forwarded to ``parallel_blast``.
    """
    progress = tqdm(range(len(uid2seq)), desc="sequences processed")
    records = [SeqRecord(BioSeq(sequence), uid) for uid, sequence in uid2seq.items()]

    for _record, hits in E.map(parallel_blast(db_pth, evalue), records):
        for hit in hits:
            collection.update_one(
                {"_id": hit.uid},
                {"$set": vars(hit)},
                upsert=True,
            )
        progress.update(1)

    progress.close()
def Sequences(self):
    '''
    Split the record's sequence into its component sequences.

    Two ways of storing multiple sequences inside one seqrecord are handled:

      - annotation key: if ``self.annotations`` has a 'multiple-sequences'
        key, its value is a list of strings of the form "label:start:end":
          - "label" is the label of the sequence (also used as the chain
            label in the structure section);
          - "start" and "end" are the slice bounds of that sequence inside
            the global sequence.
      - separator: otherwise the sequence data is split on one of the
        separators "#", "," or ";". If more than one of them occurs in the
        data, the last one in that order is used. Empty fragments (from
        repeated separators) are skipped.

    Returns a list of tuples, each a 'label' and a seq object (BioSeq
    instance). If no split took place, the list holds a single tuple with an
    empty label and the original, unsplit seq object.
    '''
    seq = self.seqrecord.seq
    seq_list = []

    # ``in`` replaces the Python-2-only ``dict.has_key``.
    if 'multiple-sequences' in self.annotations:
        for label_start_end in self.annotations['multiple-sequences']:
            label, start, end = label_start_end.split(':')
            subseq = BioSeq(seq.data[int(start):int(end)], alphabet=seq.alphabet)
            seq_list.append((label, subseq))
    else:
        separator = ''
        for sep in ('#', ',', ';'):
            # Membership test, not ``rfind``: rfind() returns -1 (truthy)
            # when the separator is absent and 0 (falsy) when it is found at
            # index 0, which inverted the original check.
            if sep in seq.data:
                separator = sep
        if separator:
            for seqdata in seq.data.split(separator):
                # A repeated separator yields '' fragments; skip them.
                if seqdata:
                    subseq = BioSeq(seqdata, alphabet=seq.alphabet)
                    seq_list.append(('', subseq))

    # If no split happened, return the whole sequence.
    if not seq_list:
        seq_list.append(('', seq))
    return seq_list
def get_seqs_cdr_labeller(cdr_labeller_dir):
    """Load CDR-labeller output from ``cdr_labeller_dir`` as a list of ``Seq`` objects.

    Reads ``cdr_details.txt`` (tab-separated table) and ``shm_details.txt``
    from the directory. In the SHM file, a line starting with "Read_name"
    sets the current read id and gene; every other line is recorded as a
    mutation ({"loc": gene + position, "mut": target}) of the current read.

    One ``Seq`` is built per row of the CDR table, carrying the read name,
    V/J hits, the CDR3 nucleotides, their translation, and the mutations
    collected for that read. The junction field used by ``Seq`` is
    'junc_aa' when the module-level ``args.is_aa`` is truthy, else 'junc_nt'.
    """
    from Bio.Seq import Seq as BioSeq
    from Bio.Alphabet import generic_dna
    import pandas as pd
    from collections import defaultdict

    cdr_table = pd.read_csv(cdr_labeller_dir + "/cdr_details.txt", sep="\t")

    mutations_by_read = defaultdict(list)
    with open(cdr_labeller_dir + "/shm_details.txt") as shm_file:
        next(shm_file)  # Skip header
        for raw_line in shm_file:
            fields = raw_line.split("\t")
            if raw_line.startswith("Read_name"):
                # Start of a new read: remember its gene and id for the
                # mutation lines that follow.
                gene = fields[4].split(":")[1]
                read_id = ":".join(fields[0].split(":")[1:])
                continue
            position = fields[1]
            target = fields[3]
            mutations_by_read[read_id].append({"loc": gene + position, "mut": target})

    junc_query = 'junc_aa' if args.is_aa else 'junc_nt'

    output = []
    for _, row in cdr_table.iterrows():
        translated = str(BioSeq(row["CDR3_nucls"], generic_dna).translate())
        read_id = row["Read_name"]
        record = {
            'seq_id': read_id,
            'v_gene': {'full': row["V_hit"]},
            'j_gene': {'full': row["J_hit"]},
            'junc_aa': translated,
            'junc_nt': row['CDR3_nucls'],
            'var_muts_nt': {'muts': mutations_by_read[read_id]},
        }
        output.append(Seq(record, junc_query))
    return output
def getSeqRecord(self):
    """
    Build and return a BioSeqRecord describing this object.

    The record is populated as follows:
        id          - self.Accession()
        seq         - BioSeq built from self.Sequence() and an instance of
                      the class returned by self.alphabetClass()
        name        - self.Name()
        description - self.Description()
        features    - self.features (same object, not a copy)
        annotations - self.annotations (same object, not a copy)
    """
    accession = self.Accession()
    sequence = BioSeq(self.Sequence(), self.alphabetClass()())
    record = BioSeqRecord(
        id=accession,
        seq=sequence,
        name=self.Name(),
        description=self.Description(),
    )
    record.features = self.features
    record.annotations = self.annotations
    return record
def predict_blast_parallel(queries, seqid2go, db_pth, evalue):
    """BLAST ``queries`` against ``db_pth`` and group the hits by GO term.

    Args:
        queries: Iterable of objects exposing ``seq`` and ``uid``.
        seqid2go: Mapping of subject sequence id -> iterable of GO terms.
        db_pth: BLAST database path forwarded to ``parallel_blast``.
        evalue: E-value threshold forwarded to ``parallel_blast``.

    Returns:
        ``{query_id: {go_term: [hsp, ...]}}``. Every query that was processed
        has an entry; it is an empty dict when the query produced no hits.
    """
    # Stage 1: run BLAST for every query and collect the raw hits.
    progress = tqdm(range(len(queries)), desc="queries processed")
    records = [SeqRecord(BioSeq(tgt.seq), tgt.uid) for tgt in queries]
    query2hits = {}
    for query, hits in E.map(parallel_blast(db_pth, evalue), records):
        query2hits[query.id] = hits
        progress.update(1)
    progress.close()

    # Stage 2: transfer the GO terms of each hit's subject to the query.
    query2preds = {}
    progress = tqdm(range(len(query2hits)), desc="queries processed")
    for qid, hits in query2hits.items():
        progress.update(1)
        by_term = query2preds[qid] = {}
        for hsp in hits:
            for go in seqid2go[hsp.sseqid]:
                by_term.setdefault(go, []).append(hsp)
    progress.close()
    return query2preds
# Biopython is required; report a readable message instead of a traceback.
try:
    from Bio import SeqIO
    from Bio.Seq import Seq as BioSeq
except ImportError:
    print("This program requires the Biopython library")
    sys.exit(0)

# Mandatory arguments. A missing argument (IndexError) or a non-integer
# minimum length (ValueError) prints the usage text.
try:
    fasta_file = sys.argv[1]  # Input fasta file
    min_length = int(sys.argv[2])  # Minimum length of sequence
    result_file = sys.argv[3]  # Output fasta file
except (IndexError, ValueError):
    print(__doc__)
    sys.exit(0)

# Optional argument: the replacement sequence for too-short records.
try:
    replace_by = sys.argv[4]  # String to replace with
except IndexError:
    replace_by = "A"
    print("No replace string entered, using 'A'")

# Copy every record to the output; records shorter than min_length keep their
# header but have their sequence replaced by `replace_by`. Both files are
# closed on exit, including when parsing fails part-way through.
with open(fasta_file) as in_handle, open(result_file, "w") as out_handle:
    for seq in SeqIO.parse(in_handle, 'fasta'):
        if len(seq.seq) < min_length:
            seq.seq = BioSeq(replace_by)
        SeqIO.write([seq], out_handle, "fasta")
def to_fasta(self, out_file):
    """Write every sequence yielded by iterating ``self`` to ``out_file`` as FASTA.

    Iterating ``self`` must yield ``(uid, sequence, annotations)`` triples;
    the annotations are not written.

    Args:
        out_file: Path of the FASTA file to create (truncated if it exists).
    """
    sequences = [SeqRecord(BioSeq(seq), unipid) for unipid, seq, _annots in self]
    # Use a context manager so the file is flushed and closed on return
    # instead of whenever the handle happens to be garbage-collected.
    with open(out_file, 'w+') as handle:
        SeqIO.write(sequences, handle, "fasta")
def to_fasta(seq_map, out_file):
    """Write the sequences in ``seq_map`` to ``out_file`` in FASTA format.

    Args:
        seq_map: Mapping of sequence uid -> sequence string.
        out_file: Path of the FASTA file to create (truncated if it exists).
    """
    sequences = [SeqRecord(BioSeq(seq), unipid) for unipid, seq in seq_map.items()]
    # Use a context manager so the file is flushed and closed on return
    # instead of whenever the handle happens to be garbage-collected.
    with open(out_file, 'w+') as handle:
        SeqIO.write(sequences, handle, "fasta")