Exemple #1
0
 def blastp(self, seq1, seq2, evalue=10e6):
     """Run a pairwise ``blastp`` of ``seq1`` against ``seq2`` and return one HSP.

     Both sequences are written as single-record FASTA files under
     ``tmp_dir``; ``blastp`` is then invoked with ``-outfmt 6`` and its
     tab-separated output file is parsed into ``HSP`` objects.

     Args:
         seq1: query object exposing ``uid`` and ``seq``.
         seq2: subject object exposing ``uid`` and ``seq``.
         evalue: value passed to ``blastp -evalue``.

     Returns:
         The ``HSP`` with the smallest ``evalue`` among the reported hits.
         That HSP is also upserted into ``self.collection`` keyed on
         ``hsp.uid``. When BLAST fails or reports no hit, a placeholder
         ``HSP`` is returned instead and nothing is stored.
     """
     import subprocess

     def _no_hit():
         # Placeholder record used when there is nothing to report.
         return HSP([
             seq1.uid, seq2.uid, 0., 0., 0., 0., 0., 0., 0., 0.,
             evalue * 10, 10e-6
         ])

     query_pth = os.path.join(tmp_dir, "%s.seq" % seq1.uid)
     subject_pth = os.path.join(tmp_dir, "%s.seq" % seq2.uid)
     output_pth = os.path.join(tmp_dir, "%s_%s.out" % (seq1.uid, seq2.uid))

     # Close (and therefore flush) both FASTA files before BLAST reads them.
     with open(query_pth, 'w+') as handle:
         SeqIO.write(SeqRecord(BioSeq(seq1.seq), seq1.uid), handle, "fasta")
     with open(subject_pth, 'w+') as handle:
         SeqIO.write(SeqRecord(BioSeq(seq2.seq), seq2.uid), handle, "fasta")

     # Argument list instead of a shell string: uids end up in the paths and
     # must not be interpreted by a shell.
     cmd = [
         "blastp", "-query", query_pth, "-subject", subject_pth,
         "-outfmt", "6", "-out", output_pth, "-evalue", str(evalue),
     ]
     try:
         status = subprocess.call(cmd,
                                  stdout=subprocess.DEVNULL,
                                  stderr=subprocess.DEVNULL)
     except OSError:
         # Executable missing / not runnable: same outcome as a failed run.
         status = 127
     if status != 0:
         print("BLAST failed unexpectedly (%s, %s)" % (seq1.uid, seq2.uid))
         return _no_hit()

     assert os.path.exists(output_pth)
     with open(output_pth, 'r') as f:
         hits = [HSP(line.split('\t')) for line in f]
     if not hits:
         return _no_hit()

     hsp = hits[np.argmin([h.evalue for h in hits])]
     try:
         self.collection.update_one({"_id": hsp.uid},
                                    {"$set": vars(hsp)},
                                    upsert=True)
     except DuplicateKeyError:
         # Best effort: a concurrent upsert of the same id is not an error.
         pass
     return hsp
Exemple #2
0
def prepare_blast(sequences):
    """Write ``sequences`` to a FASTA file and run ``makeblastdb`` on it.

    Args:
        sequences: mapping of sequence id -> sequence string.

    Returns:
        Path of the FASTA file under ``tmp_dir``; its name embeds today's
        date formatted as ``%m-%d-%Y``.
    """
    timestamp = datetime.date.today().strftime("%m-%d-%Y")
    blastdb_pth = os.path.join(tmp_dir, 'blast-%s' % timestamp)
    records = [SeqRecord(BioSeq(seq), uid) for uid, seq in sequences.items()]
    # The file must be closed (flushed) before makeblastdb reads it.
    with open(blastdb_pth, 'w+') as handle:
        SeqIO.write(records, handle, "fasta")
    os.system("makeblastdb -in %s -dbtype prot" % blastdb_pth)
    return blastdb_pth
Exemple #3
0
def compute_blast_parallel(uid2seq, db_pth, collection, evalue):
    """BLAST every sequence in ``uid2seq`` and upsert each resulting HSP.

    The work is dispatched through ``E.map`` using the callable returned by
    ``parallel_blast(db_pth, evalue)``. Each result is a ``(record, hits)``
    pair; every hit is upserted into ``collection`` keyed on ``hsp.uid``.
    """
    progress = tqdm(range(len(uid2seq)), desc="sequences processed")
    records = []
    for uid, seq in uid2seq.items():
        records.append(SeqRecord(BioSeq(seq), uid))

    results = E.map(parallel_blast(db_pth, evalue), records)
    for _record, found in results:
        for hit in found:
            collection.update_one({"_id": hit.uid}, {"$set": vars(hit)},
                                  upsert=True)
        progress.update(1)
    progress.close()
Exemple #4
0
 def Sequences(self):
     '''
     Split the stored sequence into its component sequences.

     Two ways of encoding multiple sequences in one seqrecord are handled:
      - annotation key: when ``self.annotations`` has a
        'multiple-sequences' entry, it is a list of strings of the form
        "label:start:end".
          - "label" is the label of the sequence (also used as the chain
            label in the structure section)
          - "start" and "end" are the slice bounds of that sequence inside
            the global sequence.
      - separator: otherwise the sequence data is split on one of the
        separators "#", ",", ";". If several of them occur in the data,
        the last one in that list is used.

     Returns a list of (label, seq) tuples, where seq is a Bio.Seq.Seq
     instance; the label is '' for separator-based splits. When nothing
     could be split, the whole sequence is returned as the only element.
     '''
     seq = self.seqrecord.seq
     seq_list = []
     if 'multiple-sequences' in self.annotations:
         for label_start_end in self.annotations['multiple-sequences']:
             label, start, end = label_start_end.split(':')
             subseq = BioSeq(seq.data[int(start):int(end)],
                             alphabet=seq.alphabet)
             seq_list.append((label, subseq))
     else:
         separator = ''
         for sep in ('#', ',', ';'):
             # Membership test, not rfind(): rfind returns -1 (truthy) when
             # the separator is absent and 0 (falsy) when it is at index 0.
             if sep in seq.data:
                 separator = sep
         if separator:
             for seqdata in seq.data.split(separator):
                 # Consecutive separators yield '' pieces; skip them.
                 if seqdata:
                     subseq = BioSeq(seqdata, alphabet=seq.alphabet)
                     seq_list.append(('', subseq))
     # No split happened: return the whole sequence.
     if not seq_list:
         seq_list.append(('', seq))
     return seq_list
Exemple #5
0
def get_seqs_cdr_labeller(cdr_labeller_dir):
    """Build ``Seq`` objects from the files in ``cdr_labeller_dir``.

    Reads ``cdr_details.txt`` (tab-separated, loaded with pandas) and
    ``shm_details.txt``. In the latter, lines starting with "Read_name"
    set the current gene and read id; every other line adds a
    ``{"loc", "mut"}`` entry to the current read id.

    Returns a list with one ``Seq`` per row of ``cdr_details.txt``. The
    junction key passed to ``Seq`` is chosen from the module-level
    ``args.is_aa``.
    """
    from Bio.Seq import Seq as BioSeq
    from Bio.Alphabet import generic_dna

    import pandas as pd
    from collections import defaultdict

    cdr_table = pd.read_csv(cdr_labeller_dir + "/cdr_details.txt", sep="\t")

    muts_by_read = defaultdict(list)

    with open(cdr_labeller_dir + "/shm_details.txt") as shm_file:
        next(shm_file)  # Skip header
        for raw in shm_file:
            fields = raw.split("\t")
            if not raw.startswith("Read_name"):
                # Mutation line: belongs to the most recent read header.
                position, target = fields[1], fields[3]
                muts_by_read[read_id].append(
                    {"loc": gene + position, "mut": target})
                continue
            # Read header line: remember gene and read id for what follows.
            gene = fields[4].split(":")[1]
            read_id = ":".join(fields[0].split(":")[1:])

    junc_query = 'junc_aa' if args.is_aa else 'junc_nt'

    output = []
    for _, row in cdr_table.iterrows():
        cdr3_nt = BioSeq(row["CDR3_nucls"], generic_dna)
        translated = str(cdr3_nt.translate())
        read_id = row["Read_name"]
        record = {
            'seq_id': read_id,
            'v_gene': {'full': row["V_hit"]},
            'j_gene': {'full': row["J_hit"]},
            'junc_aa': translated,
            'junc_nt': row['CDR3_nucls'],
            'var_muts_nt': {'muts': muts_by_read[read_id]},
        }
        output.append(Seq(record, junc_query))
    return output
Exemple #6
0
 def getSeqRecord(self):
     """
     Build a ``BioSeqRecord`` from this object's accessors.

     The record is populated as follows:
         id          - ``self.Accession()``
         seq         - ``BioSeq`` built from ``self.Sequence()`` and an
                       instance of ``self.alphabetClass()``
         name        - ``self.Name()``
         description - ``self.Description()``
         features    - ``self.features`` (assigned after construction)
         annotations - ``self.annotations`` (assigned after construction)
     """
     accession = self.Accession()
     sequence = BioSeq(self.Sequence(), self.alphabetClass()())
     record = BioSeqRecord(id=accession,
                           seq=sequence,
                           name=self.Name(),
                           description=self.Description())
     record.features = self.features
     record.annotations = self.annotations
     return record
Exemple #7
0
def predict_blast_parallel(queries, seqid2go, db_pth, evalue):
    """BLAST all ``queries`` and group the resulting HSPs per query.

    The searches are dispatched through ``E.map`` using the callable
    returned by ``parallel_blast(db_pth, evalue)``. For every hit, the
    entries of ``seqid2go[hsp.sseqid]`` are used as grouping keys.

    Returns a dict ``{query id: {go: [hsp, ...]}}``; queries without hits
    map to an empty dict.
    """
    progress = tqdm(range(len(queries)), desc="queries processed")
    records = [SeqRecord(BioSeq(tgt.seq), tgt.uid) for tgt in queries]
    query2hits = {}
    for record, found in E.map(parallel_blast(db_pth, evalue), records):
        query2hits[record.id] = found
        progress.update(1)
    progress.close()

    query2preds = {}
    progress = tqdm(range(len(query2hits)), desc="queries processed")
    for qid, found in query2hits.items():
        progress.update(1)
        by_term = query2preds[qid] = {}
        if len(found) != 0:
            for hit in found:
                for go in seqid2go[hit.sseqid]:
                    by_term.setdefault(go, []).append(hit)
    progress.close()
    return query2preds
Exemple #8
0
# Biopython is required; report and stop if it is not installed.
try:
    from Bio import SeqIO
    from Bio.Seq import Seq as BioSeq
except ImportError:
    print("This program requires the Biopython library")
    sys.exit(0)

# Positional arguments: input FASTA, minimum length, output FASTA.
# Missing or non-integer arguments print the module usage text.
try:
    fasta_file = sys.argv[1]  # Input fasta file
    min_length = int(sys.argv[2])  # Minimum length of sequence
    result_file = sys.argv[3]  # Output fasta file
except (IndexError, ValueError):
    print(__doc__)
    sys.exit(0)

# Optional fourth argument: replacement sequence for too-short records.
try:
    replace_by = sys.argv[4]  # String to replace with
except IndexError:
    replace_by = "A"
    print("No replace string entered, using 'A'")

# Copy every record to the output; records shorter than min_length have
# their sequence replaced by `replace_by`. Both files are closed on exit.
with open(fasta_file) as fasta_handle, open(result_file, "w") as f:
    fasta_sequences = SeqIO.parse(fasta_handle, 'fasta')
    for seq in fasta_sequences:
        if len(seq.seq) < min_length:
            seq.seq = BioSeq(replace_by)
        SeqIO.write([seq], f, "fasta")
Exemple #9
0
 def to_fasta(self, out_file):
     """Write the sequences yielded by iterating ``self`` to a FASTA file.

     Iteration yields ``(id, sequence, annotations)`` triples; only the id
     and the sequence are written. ``out_file`` is created or truncated.
     """
     sequences = [
         SeqRecord(BioSeq(seq), unipid) for unipid, seq, _annots in self
     ]
     # Context manager so the handle is flushed and closed on return.
     with open(out_file, 'w+') as handle:
         SeqIO.write(sequences, handle, "fasta")
Exemple #10
0
 def to_fasta(seq_map, out_file):
     """Write a mapping of id -> sequence string to a FASTA file.

     ``out_file`` is created or truncated; one record is written per entry
     of ``seq_map``, in the mapping's iteration order.
     """
     sequences = [
         SeqRecord(BioSeq(seq), unipid) for unipid, seq in seq_map.items()
     ]
     # Context manager so the handle is flushed and closed on return.
     with open(out_file, 'w+') as handle:
         SeqIO.write(sequences, handle, "fasta")