def parse_blast_output (ref_protein_id, species, blast): ''' @return: Dictionary where key is reference species exon_id, and the value is list of corresponding alignments ''' logger = Logger.Instance() containers_logger = logger.get_logger('containers') dc = DirectoryCrawler() if blast == "blastn": blast_file = "{0}/{1}.blastout".format(dc.get_blastn_path(ref_protein_id), species) else: blast_file = "{0}/{1}.blastout".format(dc.get_tblastn_path(ref_protein_id), species) if not os.path.isfile(blast_file): containers_logger.error ("{0}, {1}, {2}, no blastout file".format(ref_protein_id, species, blast)) return None file_handle = open(blast_file, 'r') # parse blastn output try: blastn_record = NCBIXML.read(file_handle) except ValueError: containers_logger.error("%s,%s,%s,No hits found" % (ref_protein_id, species, blast)) return None exon_dict = {} exon_pattern = re.compile("(\d+)\|(\d+)\|(ENS\w+)\|(ENS\w+)\|([-]*1)") for alignment in blastn_record.alignments: (blast_info, exon_info) = alignment.title.split() pattern_match = re.match(exon_pattern, exon_info) ref_exon_id = pattern_match.groups()[3] exon_start = int (pattern_match.groups()[0]) exon_end = int(pattern_match.groups()[1]) # limit alignments to 10 hsps num_of_hsps = 0 for hsp in alignment.hsps: # limit! if blast == "blastn": (query_frame, hit_frame) = hsp.frame if query_frame == -1 or hit_frame == -1: continue if num_of_hsps == 5: break num_of_hsps += 1 exon = Exon(blast, ref_exon_id, ref_protein_id, species) if type(hsp.gaps) is int: gaps = hsp.gaps elif type(hsp.gaps) is tuple: if not hsp.gaps[0]: gaps = 0 exon.set_alignment_info ( hsp.identities, hsp.positives, gaps, hsp.sbjct_start, hsp.sbjct_start + len(hsp.sbjct) -1, hsp.query_start, hsp.query_start + len(hsp.sbjct) -1, len(hsp.sbjct), hsp.sbjct, hsp.query, hsp.score) if not ref_exon_id in exon_dict: exon_dict[ref_exon_id] = [exon] else: exon_dict[ref_exon_id].append(exon) # means we covered the whole exon if len(hsp.sbjct) == abs(exon_end-exon_start)+1 and len(hsp.sbjct) == hsp.identities: break file_handle.close() return exon_dict
def parse_SW_output (ref_protein_id, species, sw_type): ''' Parses the output from the SW# command line application. (suitable for version as it was distributed on May 1st, 2012) @param sw_type: sw_exon/sw_gene @return: dictionary of alignment exons. The keys are referent exon IDs, and values are lists of all the alignment exons which correspond to the certain reference exon ''' logger = Logger.Instance() containers_logger = logger.get_logger('containers') dc = DirectoryCrawler() # determine the swout file path if sw_type.lower() == "sw_gene": swout_file_path = dc.get_SW_gene_path(ref_protein_id) elif sw_type.lower() == "sw_exon": swout_file_path = dc.get_SW_exon_path(ref_protein_id) else: raise KeyError ("There is no known swout path for type %s" % sw_type) swout_file_path += "/%s.swout" % species if not os.path.isfile(swout_file_path): containers_logger.error ("{0}, {1}, {2}, no swout file".format(ref_protein_id, species, sw_type)) return False swout_file = open(swout_file_path, 'r') # status boolean variables parsing_query_seq = True # patterns for matching header_pattern = re.compile ("Name: >(\d+)\|(\d+)\|(ENS\w+)\|(ENS\w+)\|([-]*1)") #Intervals: 1207047 1207087 30 69 (+) strand intervals_pattern = re.compile ("Intervals:\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+\([+-]\)\s+strand") #Identity: 31/41 (75.6%) identity_pattern = re.compile ("Identity:\s+(\d+)/(\d+).*") #Similarity: 40/41 (97.6%) similarity_pattern = re.compile ("Similarity:\s+(\d+)/.*") #Gaps: 1/41 (2.4%) gaps_pattern = re.compile ("Gaps:\s+(\d+)/\d+.*") #Score: 2828.000 score_pattern = re.compile ("Score:.*") # sequence pattern sequence_pattern = re.compile ("\s*(\d+)\s+([ATCGN-]+)\s+(\d+).*") exon_dict = {} ref_exon_id = "" identities = 0 positives = 0 gaps = 0 score = 0. sbjct_start = 0 sbjct_end = 0 query_start = 0 query_end = 0 length = 0 query_sequence = "" sbjct_sequence = "" exon = Exon(sw_type, "", ref_protein_id, species) for line in swout_file.readlines(): line = line.strip() header_match = re.match(header_pattern, line) if header_match: #add the current exon and start a new one if ref_exon_id: exon.set_alignment_info(int(identities), int(positives), int(gaps), int(sbjct_start), int(sbjct_end), int(query_start), int(query_end), int(length), sbjct_sequence, query_sequence, float(score)) if ref_exon_id in exon_dict: exon_dict[ref_exon_id].append(exon) else: exon_dict[ref_exon_id] = [exon] ref_exon_id = header_match.groups()[3] exon = Exon(sw_type, ref_exon_id, ref_protein_id, species) parsing_query_seq = True query_sequence = "" sbjct_sequence = "" # intervals intervals_match = re.match (intervals_pattern, line) if intervals_match: (query_start, query_end, sbjct_start, sbjct_end) = intervals_match.groups() # identities identity_match = re.match (identity_pattern, line) if identity_match: (identities, length) = identity_match.groups() # similarities similarity_match = re.match(similarity_pattern, line) if similarity_match: positives = similarity_match.groups()[0] # gaps gaps_match = re.match (gaps_pattern, line) if gaps_match: gaps = gaps_match.groups()[0] score_match = re.match(score_pattern, line) if score_match: score = line.split()[-1] # sequence sequence_match = re.match (sequence_pattern, line) if sequence_match: sequence_to_append = sequence_match.groups()[1].strip() if parsing_query_seq: query_sequence += sequence_to_append parsing_query_seq = False else: sbjct_sequence += sequence_to_append parsing_query_seq = True exon.set_alignment_info(int(identities), int(positives), int(gaps), int(sbjct_start), int(sbjct_end), int(query_start), int(query_end), int(length), sbjct_sequence, query_sequence, float(score)) if query_sequence: if ref_exon_id in exon_dict: exon_dict[ref_exon_id].append(exon) else: exon_dict[ref_exon_id] = [exon] return exon_dict