def _parse_qresult(self):
    """Parse query results (PRIVATE).

    Consumes ``self.xml_iter`` and yields one ``QueryResult`` for every
    ``protein`` element whose ``end`` event is seen.

    Yields:
        QueryResult built from the de-duplicated hits of the element. Its
        ``description`` is the ``name`` attribute of the ``xref`` element
        and every key/value pair of ``self._meta`` is set as an attribute.
    """
    for event, elem in self.xml_iter:
        if event != "end" or elem.tag != self.NS + "protein":
            continue

        # store the query sequence
        query_seq = elem.find(self.NS + "sequence").text

        # store the query id and description
        xref = elem.find(self.NS + "xref")
        query_id = xref.attrib["id"]
        query_desc = xref.attrib["name"]

        # interproscan results contain duplicate hits rather than a single
        # hit with multiple hsps. The hsps of a duplicate hit are appended to
        # the first hit seen with that id; the id -> hit index avoids
        # rescanning hit_list for every parsed hit.
        hit_list = []
        hits_by_id = {}
        matches_elem = elem.find(self.NS + "matches")
        for hit_new in self._parse_hit(matches_elem, query_id, query_seq):
            known_hit = hits_by_id.get(hit_new.id)
            if known_hit is None:
                hits_by_id[hit_new.id] = hit_new
                hit_list.append(hit_new)
            else:
                known_hit.hsps.extend(hit_new.hsps)

        # create qresult and assign attributes
        qresult = QueryResult(hit_list, query_id)
        qresult.description = query_desc
        for key, value in self._meta.items():
            setattr(qresult, key, value)
        yield qresult
def parse_qresult(self):
    """Parse a HMMER2 query block.

    Yields ``self.qresult`` once for every block whose first line starts
    with ``Query``. Iteration stops as soon as ``self.read_next()`` is falsy
    or the line read does not start with ``Query``.
    """
    while self.read_next():
        # anything other than a query header ends the iteration
        if not self.line.startswith('Query'):
            return

        _key, query_id = self.parse_key_value()
        self.qresult = QueryResult(id=query_id)
        description = None

        # optional header lines, up to the line starting with 'Scores'
        while self.read_next():
            if self.line.startswith('Scores'):
                break
            if self.line.startswith('Accession'):
                self.qresult.accession = self.parse_key_value()[1]
            if self.line.startswith('Description'):
                description = self.parse_key_value()[1]

        hit_placeholders = self.parse_hits()
        if len(hit_placeholders) > 0:
            self.parse_hsps(hit_placeholders)
            self.parse_hsp_alignments()

        # move on to the next 'Query' line (or a falsy line) and push it
        # onto the buffer so the outer loop reads it again
        while not self.line.startswith('Query'):
            self.read_next()
            if not self.line:
                break
        self.buf.append(self.line)

        # the description is applied last, after hits have been parsed
        if description is not None:
            self.qresult.description = description
        yield self.qresult
def _parse_qresult(self):
    """Parses a HMMER3 query block.

    Starting from the first line beginning with ``Query:``, yields one
    ``QueryResult`` per query block. A falsy ``self.line`` is treated as end
    of input, as the loops over ``read_forward`` in this method already do.

    Yields:
        QueryResult carrying ``seq_len``, ``program``, ``version`` and
        ``target``, plus ``accession`` and ``description`` when those lines
        are present in the block.

    Raises:
        ValueError: if the input ends before the ``Scores for`` line of a
            query is reached (previously this looped forever).
    """
    self._read_until(lambda line: line.startswith('Query:'))

    while self.line:
        # get query id and length; lines that do not match the query header
        # pattern are skipped instead of failing on ``None.group``
        regx = re.search(_QRE_ID_LEN, self.line)
        while regx is None:
            self.line = read_forward(self.handle)
            if not self.line:
                # input exhausted without another query header
                return
            regx = re.search(_QRE_ID_LEN, self.line)
        qid = regx.group(1).strip()

        # store qresult attributes
        qresult_attrs = {
            'seq_len': int(regx.group(2)),
            'program': self._meta.get('program'),
            'version': self._meta.get('version'),
            'target': self._meta.get('target'),
        }

        # get description and accession, if they exist
        qdesc = '<unknown description>'  # placeholder
        while not self.line.startswith('Scores for '):
            self.line = read_forward(self.handle)
            if not self.line:
                raise ValueError(
                    "Unexpected end of input while reading the header of "
                    "query %r" % qid)
            if self.line.startswith('Accession:'):
                acc = self.line.strip().split(' ', 1)[1]
                qresult_attrs['accession'] = acc.strip()
            elif self.line.startswith('Description:'):
                qdesc = self.line.strip().split(' ', 1)[1].strip()
                qresult_attrs['description'] = qdesc

        # parse the query hits
        hit_list = []
        while self.line and '//' not in self.line:
            hit_list = self._parse_hit(qid, qdesc)
            # read through the statistics summary
            # TODO: parse and store this information?
            if self.line.startswith('Internal pipeline'):
                while self.line and '//' not in self.line:
                    self.line = read_forward(self.handle)

        # create qresult, set its attributes and yield
        qresult = QueryResult(id=qid, hits=hit_list)
        for attr, value in qresult_attrs.items():
            setattr(qresult, attr, value)
        yield qresult
        self.line = read_forward(self.handle)

        # Skip line beginning with '# Alignment of', which are output
        # when running phmmer with the '-A' flag.
        if self.line.startswith('# Alignment of'):
            self.line = self.handle.readline()

        # HMMER >= 3.1 outputs '[ok]' at the end of all results file,
        # which means we can break the main loop when we see the line
        if '[ok]' in self.line:
            break
def _parse_qresult(self):
    """Parses a HMMER3 query block.

    Starting from the first line beginning with ``Query:``, yields one
    ``QueryResult`` per query block. A falsy ``self.line`` is treated as end
    of input, as the loops over ``read_forward`` in this method already do.

    Yields:
        QueryResult carrying ``seq_len``, ``program``, ``version`` and
        ``target``, plus ``accession`` and ``description`` when those lines
        are present in the block.

    Raises:
        ValueError: if the input ends before the ``Scores for`` line of a
            query is reached (previously this looped forever).
    """
    self._read_until(lambda line: line.startswith('Query:'))

    while self.line:
        # get query id and length; lines between query blocks that do not
        # match the header pattern (e.g. a trailing status line) are skipped
        # instead of failing on ``None.group``
        regx = re.search(_QRE_ID_LEN, self.line)
        while regx is None:
            self.line = read_forward(self.handle)
            if not self.line:
                # input exhausted without another query header
                return
            regx = re.search(_QRE_ID_LEN, self.line)
        qid = regx.group(1).strip()

        # store qresult attributes
        qresult_attrs = {
            'seq_len': int(regx.group(2)),
            'program': self._meta.get('program'),
            'version': self._meta.get('version'),
            'target': self._meta.get('target'),
        }

        # get description and accession, if they exist
        while not self.line.startswith('Scores for '):
            self.line = read_forward(self.handle)
            if not self.line:
                raise ValueError(
                    "Unexpected end of input while reading the header of "
                    "query %r" % qid)
            if self.line.startswith('Accession:'):
                acc = self.line.strip().split(' ', 1)[1]
                qresult_attrs['accession'] = acc.strip()
            elif self.line.startswith('Description:'):
                desc = self.line.strip().split(' ', 1)[1]
                qresult_attrs['description'] = desc.strip()

        # parse the query hits
        hit_list = []
        while self.line and '//' not in self.line:
            hit_list = self._parse_hit(qid)
            # read through the statistics summary
            # TODO: parse and store this information?
            if self.line.startswith('Internal pipeline'):
                while self.line and '//' not in self.line:
                    self.line = read_forward(self.handle)

        # create qresult, set its attributes and yield
        # not initializing hit_list directly to handle empty hits
        # (i.e. need to set its query description manually)
        qresult = QueryResult(id=qid)
        for hit in hit_list:
            if not hit:
                hit.query_description = qresult.description
            qresult.append(hit)
        # attributes are applied after the hits have been appended
        for attr, value in qresult_attrs.items():
            setattr(qresult, attr, value)
        yield qresult
        self.line = read_forward(self.handle)
def _create_qresult(self, hit_blocks):
    """Create the Biopython data structures from the parsed data (PRIVATE).

    Every entry of ``hit_blocks`` becomes one ``HSP`` holding a single
    ``HSPFragment``. HSPs sharing a ``hit_id`` are grouped into one ``Hit``,
    in order of first appearance.

    Returns:
        A one-element list containing the ``QueryResult`` for
        ``self.query_id``.
    """
    query_id = self.query_id
    is_included = True  # Should everything should be included?
    hits_by_id = OrderedDict()

    for output_index, block in enumerate(hit_blocks):
        hit_id = block['hit_id']

        frag = HSPFragment(hit_id, query_id)
        # frag.alphabet = generic_protein
        # truthy starts are shifted down by one; falsy ones are stored as-is
        query_start = block['query_start']
        frag.query_start = query_start - 1 if query_start else query_start
        frag.query_end = block['query_end']
        hit_start = block['hit_start']
        frag.hit_start = hit_start - 1 if hit_start else hit_start
        frag.hit_end = block['hit_end']
        frag.hit = block['hit_seq']
        frag.query = block['query_seq']

        hsp = HSP([frag])
        hsp.hit_id = hit_id
        hsp.output_index = output_index
        hsp.query_id = query_id
        hsp.hit_description = block['description']
        hsp.is_included = is_included
        # these attributes share their name with the block key
        for key in ('evalue', 'score', 'prob', 'hit_seq_len', 'text'):
            setattr(hsp, key, block[key])

        known_hit = hits_by_id.get(hit_id)
        if known_hit is not None:
            known_hit.append(hsp)
            continue

        # first HSP of this hit: hit-level values come from this block
        hit = Hit([hsp], hit_id)
        hit.description = block['description']
        hit.is_included = is_included
        hit.evalue = block['evalue']
        hit.score = block['score']
        hits_by_id[hit_id] = hit

    qresult = QueryResult(hits_by_id.values(), query_id)
    qresult.program = _PROGRAM
    qresult.seq_len = self.seq_len
    return [qresult]
def _create_qresult(self, hit_blocks):
    """Create the Biopython data structures from the parsed data (PRIVATE).

    Every entry of ``hit_blocks`` becomes one ``HSP`` holding a single
    ``HSPFragment``. HSPs sharing a ``hit_id`` are grouped into one ``Hit``,
    in order of first appearance.

    Returns:
        A one-element list containing the ``QueryResult`` for
        ``self.query_id``.
    """
    query_id = self.query_id
    is_included = True  # Should everything should be included?
    hits_by_id = OrderedDict()

    for output_index, block in enumerate(hit_blocks):
        hit_id = block["hit_id"]

        frag = HSPFragment(hit_id, query_id)
        frag.molecule_type = "protein"
        # start coordinates are stored one lower than reported in the block
        frag.query_start = block["query_start"] - 1
        frag.query_end = block["query_end"]
        frag.hit_start = block["hit_start"] - 1
        frag.hit_end = block["hit_end"]
        frag.hit = block["hit_seq"]
        frag.query = block["query_seq"]

        hsp = HSP([frag])
        hsp.hit_id = hit_id
        hsp.output_index = output_index
        hsp.query_id = query_id
        hsp.hit_description = block["description"]
        hsp.is_included = is_included
        # these attributes share their name with the block key
        for key in ("evalue", "score", "prob"):
            setattr(hsp, key, block[key])

        known_hit = hits_by_id.get(hit_id)
        if known_hit is not None:
            known_hit.append(hsp)
            continue

        # first HSP of this hit: hit-level values come from this block
        hit = Hit([hsp], hit_id)
        hit.description = block["description"]
        hit.is_included = is_included
        hit.evalue = block["evalue"]
        hit.score = block["score"]
        hits_by_id[hit_id] = hit

    qresult = QueryResult(hits_by_id.values(), query_id)
    qresult.program = _PROGRAM
    qresult.seq_len = self.seq_len
    return [qresult]
def _parse_commented_qresult(self):
    """Iterator returning `QueryResult` objects from a commented file.

    Repeatedly calls ``self._parse_comments()`` until it returns something
    falsy. For every comment block, each key/value pair of the comments is
    set as an attribute on every query result yielded for that block.

    The ``'fields'`` key is tested explicitly rather than by catching
    ``KeyError`` around the whole set-up, so a ``KeyError`` raised by
    ``self._parse_qresult()`` propagates unchanged instead of being reported
    as an ``AssertionError``.
    """
    while True:
        comments = self._parse_comments()
        if not comments:
            break

        if 'fields' in comments:
            self.fields = comments['fields']
            # iterator for the query results
            qres_iter = self._parse_qresult()
        else:
            # no fields means the query has no results: create an iterator
            # returning one empty qresult
            qres_iter = iter([QueryResult('')])

        for qresult in qres_iter:
            for key, value in comments.items():
                setattr(qresult, key, value)
            yield qresult
def _parse_qresult(self):
    """Generator function that returns QueryResult objects.

    Rows are read starting from ``self.line`` and grouped by query and hit
    ID. Objects for a row are only created after the following line has
    been examined, since that is when it is known whether the row closes its
    hit or its query. An empty line or a line starting with ``#`` ends the
    parsing.

    Yields:
        QueryResult for every run of consecutive rows sharing a query ID.
        Nothing is yielded when no row could be parsed (previously this case
        never terminated).
    """
    # state values, determines what to do for each line
    state_EOF = 0
    state_QRES_NEW = 1
    state_QRES_SAME = 3
    state_HIT_NEW = 2
    state_HIT_SAME = 4
    # dummy for the initial file state
    file_state = None
    # dummies for initial id caches
    prev_qid, prev_hid = None, None
    cur_qid, cur_hid = None, None
    # dummies for initial parsed value containers
    cur, prev = None, None
    hit_list, hsp_list = [], []

    while True:
        # store previous line's parsed values, for every line after the 1st
        if cur is not None:
            prev = cur
            prev_qid = cur_qid
            prev_hid = cur_hid
        # only parse the line if it's not EOF
        if self.line and not self.line.startswith('#'):
            cur = self._parse_row()
            cur_qid = cur['qresult']['id']
            cur_hid = cur['hit']['id']
        else:
            file_state = state_EOF
            # mock ID values since the line is empty
            cur_qid, cur_hid = None, None

        # get the state of hit and qresult
        if prev_qid != cur_qid:
            qres_state = state_QRES_NEW
        else:
            qres_state = state_QRES_SAME
        # new hits are hits with different ids or hits in a new qresult
        if prev_hid != cur_hid or qres_state == state_QRES_NEW:
            hit_state = state_HIT_NEW
        else:
            hit_state = state_HIT_SAME

        # start creating objects after the first line (i.e. prev is filled)
        if prev is not None:
            # each line is basically an HSP with one HSPFragment
            frag = HSPFragment(prev_hid, prev_qid)
            for attr, value in prev['frag'].items():
                setattr(frag, attr, value)
            hsp = HSP([frag])
            for attr, value in prev['hsp'].items():
                setattr(hsp, attr, value)
            hsp_list.append(hsp)

            # create hit object when we've finished parsing all its hsps
            # i.e. when hit state is state_HIT_NEW
            if hit_state == state_HIT_NEW:
                hit = Hit(hsp_list)
                for attr, value in prev['hit'].items():
                    setattr(hit, attr, value)
                hit_list.append(hit)
                hsp_list = []

            # create qresult and yield if we're at a new qresult or EOF
            if qres_state == state_QRES_NEW or file_state == state_EOF:
                qresult = QueryResult(hit_list, prev_qid)
                for attr, value in prev['qresult'].items():
                    setattr(qresult, attr, value)
                yield qresult
                hit_list = []

        # stop at EOF whether or not a row was ever parsed; checking this
        # outside the block above keeps empty input from looping forever
        if file_state == state_EOF:
            break

        self.line = self.handle.readline()
def _parse_qresult(self):
    """Yield QueryResult objects parsed from consecutive alignment blocks.

    Each pass of the loop first calls ``self.read_until`` to move to the
    next line starting with ``self._ALN_MARK``. Objects for a block are only
    created once the following block (or a falsy ``self.line``, treated as
    end of input) has been seen, since that is when it is known whether the
    block closes its hit or its query.

    Yields:
        QueryResult for every run of consecutive blocks sharing a query ID.
        Nothing is yielded when no block could be parsed (previously this
        case never terminated).
    """
    # state values
    state_EOF = 0
    state_QRES_NEW = 1
    state_QRES_SAME = 3
    state_HIT_NEW = 2
    state_HIT_SAME = 4
    # initial dummies
    file_state = None
    cur_qid, cur_hid = None, None
    prev_qid, prev_hid = None, None
    cur, prev = None, None
    hit_list, hsp_list = [], []
    # if the file has c4 alignments, use that as the alignment mark
    if self.has_c4_alignment:
        self._ALN_MARK = 'C4 Alignment:'

    while True:
        self.read_until(lambda line: line.startswith(self._ALN_MARK))
        if cur is not None:
            prev = cur
            prev_qid = cur_qid
            prev_hid = cur_hid
        # only parse the result row if it's not EOF
        if self.line:
            assert self.line.startswith(self._ALN_MARK), self.line
            # create temp dicts for storing parsed values
            header = {'qresult': {}, 'hit': {}, 'hsp': {}}
            # if the file has c4 alignments, try to parse the header
            if self.has_c4_alignment:
                self.read_until(
                    lambda line: line.strip().startswith('Query:'))
                header = self._parse_alignment_header()
            # parse the block contents
            cur = self.parse_alignment_block(header)
            cur_qid = cur['qresult']['id']
            cur_hid = cur['hit']['id']
        else:
            # only reachable with a falsy line; the original ``elif`` also
            # tested for '-- completed ', which could never be true here
            file_state = state_EOF
            cur_qid, cur_hid = None, None

        # get the state of hit and qresult
        if prev_qid != cur_qid:
            qres_state = state_QRES_NEW
        else:
            qres_state = state_QRES_SAME
        # new hits are hits with different ids or hits in a new query
        if prev_hid != cur_hid or qres_state == state_QRES_NEW:
            hit_state = state_HIT_NEW
        else:
            hit_state = state_HIT_SAME

        if prev is not None:
            hsp = _create_hsp(prev_hid, prev_qid, prev['hsp'])
            hsp_list.append(hsp)

            if hit_state == state_HIT_NEW:
                hit = Hit(hsp_list)
                for attr, value in prev['hit'].items():
                    setattr(hit, attr, value)
                hit_list.append(hit)
                hsp_list = []

            if qres_state == state_QRES_NEW or file_state == state_EOF:
                qresult = QueryResult(id=prev_qid)
                for hit in hit_list:
                    # not using append since Exonerate may separate the
                    # same hit if it has different strands
                    qresult.absorb(hit)
                for attr, value in prev['qresult'].items():
                    setattr(qresult, attr, value)
                yield qresult
                hit_list = []

        # stop at EOF whether or not a block was ever parsed; checking this
        # outside the block above keeps empty input from looping forever
        if file_state == state_EOF:
            break

        # only readline() here if we're not parsing C4 alignments
        # C4 alignments readline() is handled by its parse_alignment_block
        # function
        if not self.has_c4_alignment:
            self.line = self.handle.readline()
def __iter__(self):
    """Iterate over ``self.blast_iter``, yielding one QueryResult per record.

    Every alignment of a record becomes a ``Hit`` and every HSP of an
    alignment becomes an ``HSP`` with a single ``HSPFragment``. Alignment
    columns in which the query or the hit character is a space are dropped
    from the stored sequences and similarity line.

    ``frag.alphabet`` is only assigned for the programs listed below. For any
    other program it is left untouched, rather than raising on an unbound
    name or reusing the alphabet chosen for an earlier record.
    """
    for rec in self.blast_iter:
        # set attributes to SearchIO's
        # get id and desc
        if rec.query.startswith('>'):
            rec.query = rec.query[1:]
        qid, _, qdesc = rec.query.partition(' ')
        qdesc = qdesc.replace('\n', '').replace('\r', '')

        qresult = QueryResult(id=qid)
        qresult.program = rec.application.lower()
        qresult.target = rec.database
        qresult.seq_len = rec.query_letters
        qresult.version = rec.version

        # determine alphabet based on program
        if qresult.program == 'blastn':
            alphabet = generic_dna
        elif qresult.program in ['blastp', 'blastx', 'tblastn', 'tblastx']:
            alphabet = generic_protein
        else:
            alphabet = None
        # frame used when the HSP carries no frame value at that index
        if qresult.program in ('blastp', 'tblastn'):
            default_frame = 0
        else:
            default_frame = 1

        # iterate over the 'alignments' (hits) and the hit table
        for aln in rec.alignments:
            # get id and desc
            if aln.title.startswith('> '):
                aln.title = aln.title[2:]
            elif aln.title.startswith('>'):
                aln.title = aln.title[1:]
            hid, _, hdesc = aln.title.partition(' ')
            hdesc = hdesc.replace('\n', '').replace('\r', '')

            # iterate over the hsps and group them in a list
            hsp_list = []
            for bhsp in aln.hsps:
                frag = HSPFragment(hid, qid)
                if alphabet is not None:
                    frag.alphabet = alphabet
                # set alignment length
                frag.aln_span = bhsp.identities[1]
                # set frames
                try:
                    frag.query_frame = int(bhsp.frame[0])
                except IndexError:
                    frag.query_frame = default_frame
                try:
                    frag.hit_frame = int(bhsp.frame[1])
                except IndexError:
                    frag.hit_frame = default_frame
                # set query coordinates
                frag.query_start = min(bhsp.query_start, bhsp.query_end) - 1
                frag.query_end = max(bhsp.query_start, bhsp.query_end)
                # set hit coordinates
                frag.hit_start = min(bhsp.sbjct_start, bhsp.sbjct_end) - 1
                frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)
                # set query, hit sequences and its annotation; characters
                # are collected in lists and joined once
                qchars, hchars, mchars = [], [], []
                for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                    qchar, hchar, mchar = seqtrio
                    if qchar == ' ' or hchar == ' ':
                        assert all(' ' == x for x in seqtrio)
                    else:
                        qchars.append(qchar)
                        hchars.append(hchar)
                        mchars.append(mchar)
                frag.query, frag.hit = ''.join(qchars), ''.join(hchars)
                frag.aln_annotation['similarity'] = ''.join(mchars)

                # create HSP object with the fragment
                hsp = HSP([frag])
                hsp.evalue = bhsp.expect
                hsp.bitscore = bhsp.bits
                hsp.bitscore_raw = bhsp.score
                # set gap
                try:
                    hsp.gap_num = bhsp.gaps[0]
                except IndexError:
                    hsp.gap_num = 0
                # set identity
                hsp.ident_num = bhsp.identities[0]
                hsp.pos_num = bhsp.positives[0]
                if hsp.pos_num is None:
                    hsp.pos_num = hsp[0].aln_span

                hsp_list.append(hsp)

            hit = Hit(hsp_list)
            hit.seq_len = aln.length
            hit.description = hdesc
            qresult.append(hit)

        # the description is set after all hits have been appended
        qresult.description = qdesc
        yield qresult
def _parse_qresult(self):
    """Parse query result (PRIVATE).

    Reads line by line from ``self.line`` / ``self.handle`` and yields the
    value of ``_set_qresult_hits(qresult, hit_rows)`` each time a query is
    complete: when the next query starts, or when ``>>>///`` or an empty
    line is reached.
    """
    # query currently being filled, and the rows of its hit table
    qresult = None
    hit_rows = []

    while True:
        line = self.line
        stripped = line.strip()

        if line.startswith("The best scores are:"):
            # one line before the hit table
            hit_rows = self.__parse_hit_table()

        elif stripped == ">>>///" or not line:
            # the end of a query or the file altogether
            yield _set_qresult_hits(qresult, hit_rows)
            break

        elif not line.startswith(">>>") and ">>>" in line:
            # the beginning of a new query;
            # if qresult is filled, yield it first
            if qresult is not None:
                yield _set_qresult_hits(qresult, hit_rows)
            match = re.search(_RE_ID_DESC_SEQLEN, self.line)
            query_id = match.group(1)
            desc = match.group(2)
            seq_len = match.group(3)
            qresult = QueryResult(id=query_id)
            qresult.seq_len = int(seq_len)
            # get target from the next line: its second space-separated,
            # non-empty token
            self.line = self.handle.readline()
            tokens = [x for x in self.line.split(" ") if x]
            qresult.target = tokens[1].strip()
            if desc is not None:
                qresult.description = desc
            # set values from preamble
            for key, value in self._preamble.items():
                setattr(qresult, key, value)

        elif line.startswith(">>>") and stripped != ">>><<<":
            # the beginning of the query info and its hits + hsps
            assert self.line[3:].startswith(qresult.id), self.line
            for hit, strand in self._parse_hit(query_id):
                # HACK: re-set desc, for hsp hit and query description
                hit.description = hit.description
                hit.query_description = qresult.description
                if hit.id not in qresult:
                    # first time this hit is seen for the query
                    qresult.append(hit)
                else:
                    # otherwise, it might be the same hit with a different
                    # strand: make sure strand is different and then append
                    # hsp to existing hit
                    for hsp in hit.hsps:
                        assert strand != hsp.query_strand
                        qresult[hit.id].append(hsp)

        # any other line needs no action
        self.line = self.handle.readline()
def _parse_qresult(self):
    """Parse a HMMER3 query block (PRIVATE).

    Starting from the first line beginning with ``Query:``, yields one
    ``QueryResult`` per query block. A falsy ``self.line`` is treated as end
    of input, as the loops over ``read_forward`` in this method already do.

    Yields:
        QueryResult carrying ``seq_len``, ``program``, ``version`` and
        ``target``, plus ``accession`` and ``description`` when those lines
        are present in the block.

    Raises:
        ValueError: if the input ends before the ``Scores for`` line of a
            query is reached (previously this looped forever).
    """
    self._read_until(lambda line: line.startswith("Query:"))

    while self.line:
        # skip forward to the next line matching the query header pattern,
        # giving up once the input is exhausted (previously this spun
        # forever on a falsy line)
        regx = re.search(_QRE_ID_LEN, self.line)
        while not regx:
            self.line = read_forward(self.handle)
            if not self.line:
                return
            regx = re.search(_QRE_ID_LEN, self.line)

        # get query id and length
        qid = regx.group(1).strip()
        # store qresult attributes
        qresult_attrs = {
            "seq_len": int(regx.group(2)),
            "program": self._meta.get("program"),
            "version": self._meta.get("version"),
            "target": self._meta.get("target"),
        }

        # get description and accession, if they exist
        qdesc = "<unknown description>"  # placeholder
        while not self.line.startswith("Scores for "):
            self.line = read_forward(self.handle)
            if not self.line:
                raise ValueError(
                    "Unexpected end of input while reading the header of "
                    "query %r" % qid
                )
            if self.line.startswith("Accession:"):
                acc = self.line.strip().split(" ", 1)[1]
                qresult_attrs["accession"] = acc.strip()
            elif self.line.startswith("Description:"):
                qdesc = self.line.strip().split(" ", 1)[1].strip()
                qresult_attrs["description"] = qdesc

        # parse the query hits
        hit_list = []
        while self.line and "//" not in self.line:
            hit_list = self._parse_hit(qid, qdesc)
            # read through the statistics summary
            # TODO: parse and store this information?
            if self.line.startswith("Internal pipeline"):
                while self.line and "//" not in self.line:
                    self.line = read_forward(self.handle)

        # create qresult, set its attributes and yield
        qresult = QueryResult(id=qid, hits=hit_list)
        for attr, value in qresult_attrs.items():
            setattr(qresult, attr, value)
        yield qresult
        self.line = read_forward(self.handle)

        # Skip line beginning with '# Alignment of', which are output
        # when running phmmer with the '-A' flag.
        if self.line.startswith("#"):
            self.line = self.handle.readline()

        # HMMER >= 3.1 outputs '[ok]' at the end of all results file,
        # which means we can break the main loop when we see the line
        if "[ok]" in self.line:
            break
def __iter__(self):
    """Iterate over BlastTextParser, yields query results.

    Every alignment of a record becomes a ``Hit`` and every HSP of an
    alignment becomes an ``HSP`` with a single ``HSPFragment``. Alignment
    columns in which the query or the hit character is a space are dropped
    from the stored sequences and similarity line.

    ``frag.molecule_type`` is only assigned for the programs listed below.
    For any other program it is left untouched, rather than raising on an
    unbound name or reusing the value chosen for an earlier record.
    """
    for rec in self.blast_iter:
        # set attributes to SearchIO's
        # get id and desc
        if rec.query.startswith(">"):
            rec.query = rec.query[1:]
        qid, _, qdesc = rec.query.partition(" ")
        qdesc = qdesc.replace("\n", "").replace("\r", "")

        qresult = QueryResult(id=qid)
        qresult.program = rec.application.lower()
        qresult.target = rec.database
        qresult.seq_len = rec.query_letters
        qresult.version = rec.version

        # determine molecule_type based on program
        if qresult.program == "blastn":
            molecule_type = "DNA"
        elif qresult.program in ["blastp", "blastx", "tblastn", "tblastx"]:
            molecule_type = "protein"
        else:
            molecule_type = None
        # frame used when the HSP carries no frame value at that index
        if qresult.program in ("blastp", "tblastn"):
            default_frame = 0
        else:
            default_frame = 1

        # iterate over the 'alignments' (hits) and the hit table
        for aln in rec.alignments:
            # get id and desc
            if aln.title.startswith("> "):
                aln.title = aln.title[2:]
            elif aln.title.startswith(">"):
                aln.title = aln.title[1:]
            hid, _, hdesc = aln.title.partition(" ")
            hdesc = hdesc.replace("\n", "").replace("\r", "")

            # iterate over the hsps and group them in a list
            hsp_list = []
            for bhsp in aln.hsps:
                frag = HSPFragment(hid, qid)
                if molecule_type is not None:
                    frag.molecule_type = molecule_type
                # set alignment length
                frag.aln_span = bhsp.identities[1]
                # set frames
                try:
                    frag.query_frame = int(bhsp.frame[0])
                except IndexError:
                    frag.query_frame = default_frame
                try:
                    frag.hit_frame = int(bhsp.frame[1])
                except IndexError:
                    frag.hit_frame = default_frame
                # set query coordinates
                frag.query_start = min(bhsp.query_start, bhsp.query_end) - 1
                frag.query_end = max(bhsp.query_start, bhsp.query_end)
                # set hit coordinates
                frag.hit_start = min(bhsp.sbjct_start, bhsp.sbjct_end) - 1
                frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)
                # set query, hit sequences and its annotation; characters
                # are collected in lists and joined once
                qchars, hchars, mchars = [], [], []
                for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                    qchar, hchar, mchar = seqtrio
                    if qchar == " " or hchar == " ":
                        assert all(" " == x for x in seqtrio)
                    else:
                        qchars.append(qchar)
                        hchars.append(hchar)
                        mchars.append(mchar)
                frag.query, frag.hit = "".join(qchars), "".join(hchars)
                frag.aln_annotation["similarity"] = "".join(mchars)

                # create HSP object with the fragment
                hsp = HSP([frag])
                hsp.evalue = bhsp.expect
                hsp.bitscore = bhsp.bits
                hsp.bitscore_raw = bhsp.score
                # set gap
                try:
                    hsp.gap_num = bhsp.gaps[0]
                except IndexError:
                    hsp.gap_num = 0
                # set identity
                hsp.ident_num = bhsp.identities[0]
                hsp.pos_num = bhsp.positives[0]
                if hsp.pos_num is None:
                    hsp.pos_num = hsp[0].aln_span

                hsp_list.append(hsp)

            hit = Hit(hsp_list)
            hit.seq_len = aln.length
            hit.description = hdesc
            qresult.append(hit)

        # the description is set after all hits have been appended
        qresult.description = qdesc
        yield qresult
def _parse_qresult(self):
    """Generator function that returns QueryResult objects.

    Rows are read starting from ``self.line`` and grouped by query ID; every
    row becomes its own ``Hit`` with one ``HSP`` and one ``HSPFragment``.
    Objects for a row are only created after the following line has been
    examined, since that is when it is known whether the row closes its
    query. An empty line ends the parsing.

    Yields:
        QueryResult for every run of consecutive rows sharing a query ID.
        Nothing is yielded when no row could be parsed (previously this case
        never terminated).
    """
    # state values, determines what to do for each line
    state_EOF = 0
    state_QRES_NEW = 1
    state_QRES_SAME = 3
    # initial value dummies
    file_state = None
    prev_qid, cur_qid = None, None
    cur, prev = None, None
    # container for Hit objects, used to create QueryResult
    hit_list = []

    while True:
        # store previous line's parsed values for all lines after the first
        if cur is not None:
            prev = cur
            prev_qid = cur_qid
        # only parse the result row if it's not EOF
        if self.line:
            cur = self._parse_row()
            cur_qid = cur['qresult']['id']
        else:
            file_state = state_EOF
            # mock value for cur_qid, since we have nothing to parse
            cur_qid = None

        if prev_qid != cur_qid:
            qres_state = state_QRES_NEW
        else:
            qres_state = state_QRES_SAME

        if prev is not None:
            # since domain tab formats only have 1 Hit per line
            # we always create HSPFragment, HSP, and Hit per line
            prev_hid = prev['hit']['id']

            # create fragment and HSP and set their attributes
            frag = HSPFragment(prev_hid, prev_qid)
            for attr, value in prev['frag'].items():
                setattr(frag, attr, value)
            hsp = HSP([frag])
            for attr, value in prev['hsp'].items():
                setattr(hsp, attr, value)

            # create Hit and set its attributes
            hit = Hit([hsp])
            for attr, value in prev['hit'].items():
                setattr(hit, attr, value)
            hit_list.append(hit)

            # create qresult and yield if we're at a new qresult or at EOF
            if qres_state == state_QRES_NEW or file_state == state_EOF:
                qresult = QueryResult(prev_qid, hits=hit_list)
                for attr, value in prev['qresult'].items():
                    setattr(qresult, attr, value)
                yield qresult
                hit_list = []

        # stop at EOF whether or not a row was ever parsed; checking this
        # outside the block above keeps empty input from looping forever
        if file_state == state_EOF:
            break

        self.line = self.handle.readline()
def _parse_qresult(self):
    """Return QueryResult objects (PRIVATE).

    Rows are read starting from ``self.line`` and grouped by query ID; every
    row becomes its own ``Hit`` with one ``HSP`` and one ``HSPFragment``.
    Objects for a row are only created after the following line has been
    examined, since that is when it is known whether the row closes its
    query. An empty line or a line starting with ``#`` ends the parsing.

    Yields:
        QueryResult for every run of consecutive rows sharing a query ID.
        Nothing is yielded when no row could be parsed (previously this case
        never terminated).
    """
    # state values, determines what to do for each line
    state_EOF = 0
    state_QRES_NEW = 1
    state_QRES_SAME = 3
    # initial value dummies
    file_state = None
    prev_qid, cur_qid = None, None
    cur, prev = None, None
    # container for Hit objects, used to create QueryResult
    hit_list = []

    while True:
        # store previous line's parsed values for all lines after the first
        if cur is not None:
            prev = cur
            prev_qid = cur_qid
        # only parse the result row if it's not EOF
        # NOTE: we are not parsing the extra '#' lines appended to the end
        # of hmmer31b1 tabular results since storing them in qresult
        # objects means we can not do a single-pass parsing
        if self.line and not self.line.startswith('#'):
            cur = self._parse_row()
            cur_qid = cur['qresult']['id']
        else:
            file_state = state_EOF
            # mock value for cur_qid, since we have nothing to parse
            cur_qid = None

        if prev_qid != cur_qid:
            qres_state = state_QRES_NEW
        else:
            qres_state = state_QRES_SAME

        if prev is not None:
            # since domain tab formats only have 1 Hit per line
            # we always create HSPFragment, HSP, and Hit per line
            prev_hid = prev['hit']['id']

            # create fragment and HSP and set their attributes
            frag = HSPFragment(prev_hid, prev_qid)
            for attr, value in prev['frag'].items():
                setattr(frag, attr, value)
            hsp = HSP([frag])
            for attr, value in prev['hsp'].items():
                setattr(hsp, attr, value)

            # create Hit and set its attributes
            hit = Hit([hsp])
            for attr, value in prev['hit'].items():
                setattr(hit, attr, value)
            hit_list.append(hit)

            # create qresult and yield if we're at a new qresult or at EOF
            if qres_state == state_QRES_NEW or file_state == state_EOF:
                qresult = QueryResult(hit_list, prev_qid)
                for attr, value in prev['qresult'].items():
                    setattr(qresult, attr, value)
                yield qresult
                hit_list = []

        # stop at EOF whether or not a row was ever parsed; checking this
        # outside the block above keeps empty input from looping forever
        if file_state == state_EOF:
            break

        self.line = self.handle.readline()
def _parse_qresult(self):
    """Parse query results (PRIVATE).

    Consumes ``self.xml_iter`` and yields one ``QueryResult`` for every
    ``Iteration`` element whose ``end`` event is seen. Query ID, description
    and length fall back to ``self._fallback`` when the element does not
    provide them.

    Hits whose ID was already seen in the same query are renamed to their
    ``blast_id`` and a ``BiopythonParserWarning`` is issued. Values under
    ``Iteration_stat/Statistics`` are set on the result when present; a
    missing ``Statistics`` child is skipped rather than raising.
    """
    # parse the queries
    for event, qresult_elem in self.xml_iter:
        # </Iteration> marks the end of a single query
        # which means we can process it
        if event != 'end' or qresult_elem.tag != 'Iteration':
            continue

        # we'll use the following schema
        # <!ELEMENT Iteration (
        #        Iteration_iter-num,
        #        Iteration_query-ID?,
        #        Iteration_query-def?,
        #        Iteration_query-len?,
        #        Iteration_hits?,
        #        Iteration_stat?,
        #        Iteration_message?)>

        # assign query attributes with fallbacks
        query_id = qresult_elem.findtext('Iteration_query-ID')
        if query_id is None:
            query_id = self._fallback['id']

        query_desc = qresult_elem.findtext('Iteration_query-def')
        if query_desc is None:
            query_desc = self._fallback['description']

        query_len = qresult_elem.findtext('Iteration_query-len')
        if query_len is None:
            query_len = self._fallback['len']

        blast_query_id = query_id
        # handle blast searches against databases with Blast's IDs
        # 'Query_' marks the beginning of a BLAST+-generated ID,
        # 'lcl|' marks the beginning of a BLAST legacy-generated ID
        if not self._use_raw_query_ids and \
                query_id.startswith(('Query_', 'lcl|')):
            # the Blast-generated ID stays in blast_query_id; the first word
            # of the description becomes the ID, the remainder (possibly
            # empty) the description
            query_id, _, query_desc = query_desc.partition(' ')

        hit_list = []
        # hit IDs seen so far; a set keeps the duplicate check constant-time
        seen_ids = set()
        hits_elem = qresult_elem.find('Iteration_hits')
        for hit in self._parse_hit(hits_elem, query_id):
            if not hit:
                continue
            # need to keep track of hit IDs, since there could be duplicates
            if hit.id in seen_ids:
                warnings.warn(
                    "Renaming hit ID %r to a BLAST-generated ID "
                    "%r since the ID was already matched "
                    "by your query %r. Your BLAST database "
                    "may contain duplicate entries."
                    % (hit.id, hit.blast_id, query_id),
                    BiopythonParserWarning)
                # fallback to Blast-generated IDs, if the ID is already
                # present and restore the desc, too
                hit.description = '%s %s' % (hit.id, hit.description)
                hit.id = hit.blast_id
                # and change the hit_id of the HSPs contained
                for hsp in hit:
                    hsp.hit_id = hit.blast_id
            else:
                seen_ids.add(hit.id)

            hit_list.append(hit)

        # create qresult and assign its attributes
        qresult = QueryResult(hit_list, query_id)
        qresult.description = query_desc
        qresult.seq_len = int(query_len)
        qresult.blast_id = blast_query_id
        for key, value in self._meta.items():
            setattr(qresult, key, value)

        # statistics are stored in Iteration_stat's 'grandchildren' with the
        # following DTD
        # <!ELEMENT Statistics (
        #        Statistics_db-num,
        #        Statistics_db-len,
        #        Statistics_hsp-len,
        #        Statistics_eff-space,
        #        Statistics_kappa,
        #        Statistics_lambda,
        #        Statistics_entropy)>
        stat_elem = None
        stat_iter_elem = qresult_elem.find('Iteration_stat')
        if stat_iter_elem is not None:
            stat_elem = stat_iter_elem.find('Statistics')

        if stat_elem is not None:
            for key, val_info in _ELEM_QRESULT_OPT.items():
                value = stat_elem.findtext(key)
                if value is None:
                    continue
                caster = val_info[1]
                # recast only if value is not intended to be str
                if caster is not str:
                    value = caster(value)
                setattr(qresult, val_info[0], value)

        # delete element after we finish parsing it
        qresult_elem.clear()
        yield qresult
def _parse_qresult(self):
    """Yield QueryResult objects (PRIVATE).

    Rows are read starting from ``self.line`` and grouped by query name
    (``qname``) and hit name (``tname``). Objects for a row are only created
    after the following line has been examined, since that is when it is
    known whether the row closes its hit or its query. An empty line ends
    the parsing.

    Yields:
        QueryResult for every run of consecutive rows sharing a query name.
        Nothing is yielded when no row could be parsed (previously this case
        never terminated).
    """
    # state values, determines what to do for each line
    state_EOF = 0
    state_QRES_NEW = 1
    state_QRES_SAME = 3
    state_HIT_NEW = 2
    state_HIT_SAME = 4
    # initial dummy values
    file_state = None
    cur_qid, cur_hid = None, None
    prev_qid, prev_hid = None, None
    cur, prev = None, None
    hit_list, hsp_list = [], []

    while True:
        # store previous line's parsed values for all lines after the first
        if cur is not None:
            prev = cur
            prev_qid = cur_qid
            prev_hid = cur_hid
        # only parse the result row if it's not EOF
        if self.line:
            cur = self._parse_row()
            cur_qid = cur["qname"]
            cur_hid = cur["tname"]
        else:
            file_state = state_EOF
            # mock values, since we have nothing to parse
            cur_qid, cur_hid = None, None

        # get the state of hit and qresult
        if prev_qid != cur_qid:
            qres_state = state_QRES_NEW
        else:
            qres_state = state_QRES_SAME
        # new hits are hits with different ids or hits in a new qresult
        if prev_hid != cur_hid or qres_state == state_QRES_NEW:
            hit_state = state_HIT_NEW
        else:
            hit_state = state_HIT_SAME

        if prev is not None:
            # create fragment and HSP and set their attributes
            hsp = _create_hsp(prev_hid, prev_qid, prev)
            hsp_list.append(hsp)

            if hit_state == state_HIT_NEW:
                # create Hit and set its attributes
                hit = Hit(hsp_list)
                hit.seq_len = prev["tsize"]
                hit_list.append(hit)
                hsp_list = []

            # create qresult and yield if we're at a new qresult or at EOF
            if qres_state == state_QRES_NEW or file_state == state_EOF:
                qresult = QueryResult(id=prev_qid)
                for hit in hit_list:
                    qresult.absorb(hit)
                qresult.seq_len = prev["qsize"]
                yield qresult
                hit_list = []

        # stop at EOF whether or not a row was ever parsed; checking this
        # outside the block above keeps empty input from looping forever
        if file_state == state_EOF:
            break

        self.line = self.handle.readline()
def _parse_qresult(self):
    """Yield QueryResult objects built from consecutive result rows.

    The row currently held in ``self.line`` is the first one consumed;
    further rows are read (and stripped) from ``self.handle``.  An empty
    line or a line starting with ``#`` ends the run of rows.

    Objects are always built for the *previous* row once the current row
    (or the end of the run) tells us whether a boundary was crossed:

    * every row becomes one ``HSP`` holding a single ``HSPFragment``;
    * consecutive rows with the same hit id inside one query are grouped
      into one ``Hit``;
    * consecutive rows with the same query id are grouped into one
      ``QueryResult``.

    If the parser is not positioned on a result row when the generator
    starts (empty line or comment line), it finishes without yielding
    anything instead of reading on with a stale end-of-input state.
    """
    cur, prev = None, None
    cur_qid, cur_hid = None, None
    prev_qid, prev_hid = None, None
    at_end = False
    hit_list, hsp_list = [], []

    while True:
        # remember the previously parsed row if we are past the first one
        if cur is not None:
            prev = cur
            prev_qid = cur_qid
            prev_hid = cur_hid
        # only parse the line if it is neither EOF nor a comment line
        if self.line and not self.line.startswith('#'):
            cur = self._parse_result_row()
            cur_qid = self._get_id(cur['qresult'])
            cur_hid = self._get_id(cur['hit'])
        else:
            at_end = True
            # mock ids so that the last row is flushed as a boundary
            cur_qid, cur_hid = None, None

        # a new hit starts on a different hit id or on a new query
        new_qresult = prev_qid != cur_qid
        new_hit = prev_hid != cur_hid or new_qresult

        # objects are created for the previously parsed row, so nothing
        # is done while handling the first parsed row (prev is None)
        if prev is not None:
            # every row is one HSP with one fragment
            frag = HSPFragment(prev_hid, prev_qid)
            for attr, value in prev['frag'].items():
                # adjust coordinates to Python ranges: start becomes
                # min(start, end) - 1 and end becomes max(start, end).
                # NOTE: both start and end must be present in the parsed
                # values, otherwise a KeyError is raised here.
                for seq_type in ('query', 'hit'):
                    if attr == seq_type + '_start':
                        value = min(value, prev['frag'][seq_type + '_end']) - 1
                    elif attr == seq_type + '_end':
                        value = max(value, prev['frag'][seq_type + '_start'])
                setattr(frag, attr, value)
            # frame and strand are derived after all parsed fragment
            # values have been set
            for seq_type in ('hit', 'query'):
                frame = self._get_frag_frame(frag, seq_type, prev['frag'])
                setattr(frag, '%s_frame' % seq_type, frame)
                strand = self._get_frag_strand(frag, seq_type, prev['frag'])
                setattr(frag, '%s_strand' % seq_type, strand)

            hsp = HSP([frag])
            for attr, value in prev['hsp'].items():
                setattr(hsp, attr, value)
            hsp_list.append(hsp)

            # close the hit when the next row belongs to another hit
            if new_hit:
                hit = Hit(hsp_list)
                for attr, value in prev['hit'].items():
                    if attr != 'id_all':
                        setattr(hit, attr, value)
                    else:
                        # the first id is skipped; only the remaining
                        # ones are stored as alternative ids
                        setattr(hit, '_id_alt', value[1:])
                hit_list.append(hit)
                hsp_list = []

            # close the query result on a new query or at the end
            if new_qresult or at_end:
                qresult = QueryResult(hit_list, prev_qid)
                for attr, value in prev['qresult'].items():
                    setattr(qresult, attr, value)
                yield qresult
                hit_list = []

        # leave at the end of the rows even when no row was ever parsed;
        # otherwise the loop would keep reading with nothing to flush
        if at_end:
            break

        self.line = self.handle.readline().strip()
def test_store_bio_searchio_blast_record(self):
    """Exercise SimilarityLoader with hand-built SearchIO query results.

    Builds the ontology terms and features the loader looks up, checks
    that one SimilarityLoader construction raises ImportingError, stores
    two QueryResult objects through store_bio_searchio_query_result and
    finally runs the remove_analysis command.
    """
    # placeholder "null" records
    null_db, _ = Db.objects.get_or_create(name="null")
    null_cv, _ = Cv.objects.get_or_create(name="null")
    null_dbxref, _ = Dbxref.objects.get_or_create(accession="null", db=null_db)
    null_cvterm, _ = Cvterm.objects.get_or_create(
        name="null",
        cv=null_cv,
        dbxref=null_dbxref,
        is_obsolete=0,
        is_relationshiptype=0,
    )
    null_pub, _ = Pub.objects.get_or_create(
        uniquename="null", type=null_cvterm, is_obsolete=False
    )

    # organisms
    mouse = Organism.objects.create(genus="Mus", species="musculus")
    multispecies, _ = Organism.objects.get_or_create(
        abbreviation="multispecies",
        genus="multispecies",
        species="multispecies",
        common_name="multispecies",
    )

    def add_term(name, cv, dbxref, is_relationshiptype=0):
        # every term in this test is created non-obsolete
        return Cvterm.objects.create(
            name=name,
            cv=cv,
            dbxref=dbxref,
            is_obsolete=0,
            is_relationshiptype=is_relationshiptype,
        )

    # ontology terms
    so_db = Db.objects.create(name="SO")
    sequence_cv = Cv.objects.create(name="sequence")
    ro_db = Db.objects.create(name="RO")
    relationship_cv = Cv.objects.create(name="relationship")

    polypeptide_dbxref = Dbxref.objects.create(accession="123456", db=so_db)
    protein_match_dbxref = Dbxref.objects.create(accession="7890", db=so_db)
    polypeptide_term = add_term("polypeptide", sequence_cv, polypeptide_dbxref)
    protein_match_term = add_term(
        "protein_match", sequence_cv, protein_match_dbxref
    )
    add_term(
        "match_part",
        sequence_cv,
        Dbxref.objects.create(accession="1234567", db=so_db),
    )
    add_term(
        "contained in",
        relationship_cv,
        Dbxref.objects.create(accession="12345678", db=ro_db),
        is_relationshiptype=1,
    )
    add_term(
        "in similarity relationship with",
        relationship_cv,
        Dbxref.objects.create(accession="12345679", db=ro_db),
        is_relationshiptype=1,
    )
    translation_term = add_term(
        "translation_of",
        sequence_cv,
        Dbxref.objects.create(accession="22345679", db=ro_db),
        is_relationshiptype=1,
    )
    mrna_term = add_term(
        "mRNA",
        sequence_cv,
        Dbxref.objects.create(accession="223456", db=so_db),
    )

    pfam_db = Db.objects.create(name="PFAM")
    pfam_cv = Cv.objects.create(name="PFAM")
    kinase_term = add_term(
        "kinase", pfam_cv, Dbxref.objects.create(accession="123", db=pfam_db)
    )

    # features: five proteins, each with an mRNA named "<protein>m"
    feature_db = Db.objects.create(name="FASTA_SOURCE")
    protein_specs = (
        ("feat1", mouse, polypeptide_term),
        ("feat2", multispecies, protein_match_term),
        ("feat3", multispecies, protein_match_term),
        ("feat4", mouse, polypeptide_term),
        ("feat5", multispecies, protein_match_term),
    )
    protein_names = [spec[0] for spec in protein_specs]
    mrna_names = [name + "m" for name in protein_names]
    feature_dbxrefs = {
        accession: Dbxref.objects.create(db=feature_db, accession=accession)
        for accession in protein_names + mrna_names
    }

    proteins = {}
    for uniquename, organism, term in protein_specs:
        proteins[uniquename] = Feature.objects.create(
            organism=organism,
            uniquename=uniquename,
            is_analysis=False,
            type_id=term.cvterm_id,
            is_obsolete=False,
            dbxref=feature_dbxrefs[uniquename],
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )

    transcripts = {}
    for uniquename, organism, _term in protein_specs:
        mrna_name = uniquename + "m"
        transcripts[uniquename] = Feature.objects.create(
            organism=organism,
            uniquename=mrna_name,
            is_analysis=False,
            type=mrna_term,
            is_obsolete=False,
            dbxref=feature_dbxrefs[mrna_name],
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )

    for uniquename in protein_names:
        FeatureRelationship.objects.create(
            subject=transcripts[uniquename],
            object=proteins[uniquename],
            type=translation_term,
            rank=0,
        )

    FeatureCvterm.objects.create(
        feature=proteins["feat3"],
        cvterm=kinase_term,
        pub=null_pub,
        is_not=False,
        rank=0,
    )

    # first query result: two hits, each with one fully scored HSP
    fragment_a = HSPFragment("feat1", "feat2")
    fragment_a.query_start = 110
    fragment_a.query_end = 1100
    fragment_a.aln_span = 990
    fragment_a.hit_start = 100
    fragment_a.hit_end = 1000
    hsp_a = HSP([fragment_a])
    hsp_a.query_id = "feat1"
    hsp_a.hit_id = "feat2"
    hsp_a.bitscore = 1234.0
    hsp_a.bitscore_raw = 1234
    hsp_a.evalue = 0.0
    hsp_a.ident_num = 82
    hit_a = Hit([hsp_a])
    hit_a.accession = "5050"
    hit_a.seq_len = 2000

    fragment_b = HSPFragment("feat1", "feat3")
    fragment_b.query_start = 210
    fragment_b.query_end = 2100
    fragment_b.aln_span = 1890
    fragment_b.hit_start = 200
    fragment_b.hit_end = 2000
    hsp_b = HSP([fragment_b])
    hsp_b.query_id = "feat1"
    hsp_b.hit_id = "feat3"
    hsp_b.bitscore = 234.0
    hsp_b.bitscore_raw = 234
    hsp_b.evalue = 0.0
    hsp_b.ident_num = 72
    hit_b = Hit([hsp_b])
    hit_b.accession = "500"
    hit_b.seq_len = 4000

    scored_result = QueryResult([hit_a, hit_b], "feat1")
    scored_result.seq_len = 3000
    scored_result.blast_id = "feat1"

    # second query result: ids carried in the descriptions, and an HSP
    # without bitscore, bitscore_raw, evalue and ident_num
    fragment_c = HSPFragment("feat4_desc", "feat5_desc")
    fragment_c.query_start = 210
    fragment_c.query_end = 2100
    fragment_c.aln_span = 1890
    fragment_c.hit_start = 200
    fragment_c.hit_end = 2000
    hsp_c = HSP([fragment_c])
    hsp_c.query_id = "feat4_desc"
    hsp_c.query_description = "test id=feat4"
    hsp_c.hit_id = "feat5_desc"
    hsp_c.hit_description = "test id=feat5"
    hit_c = Hit([hsp_c])
    hit_c.seq_len = 4000

    unscored_result = QueryResult([hit_c], "feat4_desc")
    unscored_result.seq_len = 3000
    unscored_result.blast_id = "feat4_desc"

    # loader arguments shared by the failing and the working construction
    shared_loader_args = {
        "filename": "similarity.file",
        "algorithm": "smith-waterman",
        "description": "command-line example",
        "so_query": "polypeptide",
        "so_subject": "protein_match",
        "org_subject": "multispecies multispecies",
    }

    # this construction is expected to be rejected
    with self.assertRaises(ImportingError):
        SimilarityLoader(
            program="blastp",
            input_format="blast-xml",
            programversion="2.2.31+",
            org_query="H**o sapiens",
            **shared_loader_args
        )

    loader = SimilarityLoader(
        program="interproscan",
        input_format="interproscan-xml",
        programversion="5",
        org_query="Mus musculus",
        **shared_loader_args
    )
    loader.store_bio_searchio_query_result(scored_result)
    loader.store_bio_searchio_query_result(unscored_result)

    analysis = Analysis.objects.get(sourcename="similarity.file")
    self.assertEqual("interproscan", analysis.program)

    featureloc = Featureloc.objects.get(srcfeature=proteins["feat3"])
    analysisfeature = Analysisfeature.objects.get(
        analysis=analysis, feature_id=featureloc.feature_id
    )
    self.assertEqual(234.0, analysisfeature.rawscore)

    # the remove_analysis command must delete the stored analysis
    stored = Analysis.objects.filter(sourcename="similarity.file")
    self.assertTrue(stored.exists())
    call_command("remove_analysis", "--name=similarity.file", "--verbosity=0")
    self.assertFalse(
        Analysis.objects.filter(sourcename="similarity.file").exists()
    )