def filter_func(row):
    """Blank out hits in a results row that are too short relative to the query.

    The row's ``query_record`` field is parsed as a single FASTA record; a hit
    is kept only when the span of its sequence range is at least 25% of the
    query length. Hits equal to ``'NA'`` or ``None``, and hits whose range
    lookup raises ``KeyError``, become ``None``.

    :param row: row object exposing ``query_record`` and positional ``iloc``
        slicing whose slices provide ``tolist()``.
    :return: list made of positions 0-5 of the row followed by one entry per
        position in ``5:-1`` (the hit description, or ``None`` when rejected).
    """
    query = SeqIO.read(StringIO(row.query_record), 'fasta')
    threshold = 0.25 * len(query)
    # NOTE(review): the leading slice (0:6) and the hit slice (5:-1) overlap at
    # position 5, so that column is emitted twice (raw, then screened).
    # Confirm against the caller whether 0:5 was intended.
    leading = row.iloc[0:6].tolist()
    candidates = row.iloc[5:-1].tolist()

    def _screen(raw_hit):
        # Return the hit description if the hit is long enough, else None.
        if raw_hit == 'NA' or raw_hit is None:
            return None
        record = SeqIO.read(StringIO('>' + raw_hit), 'fasta')
        _, symbol, ranges, _ = id_search(record.id, id_type='brute', verbose=0)
        try:
            bounds = ranges[symbol]
        except KeyError:
            return None
        span = abs(int(bounds[1]) - int(bounds[0]))
        return record.description if span >= threshold else None

    return leading + [_screen(raw_hit) for raw_hit in candidates]
def map_ranges(hit):
    """Convenience function for RBC.results_map().

    Replaces a result with a tuple of its description and locus.

    :param hit: object exposing a ``description`` attribute.
    :return: ``(description, id, start, end, strand)``, where the id and the
        three range items are taken from what ``id_search`` returns for the
        description.
    """
    _, hit_id, locus, _ = id_search(hit.description, verbose=False)
    start, end, strand = locus[0], locus[1], locus[2]
    return (hit.description, hit_id, start, end, strand)
def fasta(id_item, seq_range, database, source, indent, verbose):
    """Fetch a sub-sequence of one record from an indexed sequence file.

    The file is indexed with ``SeqIO.index``; each record is keyed by the
    first capture group of the regex that ``id_search`` returns for
    ``id_item`` (requested with ``regex_only=True``).

    :param id_item: identifier of the record to fetch.
    :param seq_range: indexable pair; items 0 and 1 are used as the slice
        start and stop applied to the record.
    :param database: passed to ``SeqIO.index`` as the file to index.
    :param source: passed to ``SeqIO.index`` as the file format.
    :param indent: forwarded to ``id_search``.
    :param verbose: forwarded to ``id_search``.
    :return: ``(seq, itemnotfound)``. When the record exists this is
        ``(sliced_record, None)``. When it does not, this is
        ``(None, id_item)``; previously the lookup raised ``KeyError`` before
        the "not found" marker could ever be returned.
    """
    regex = id_search(id_item, indent=indent, verbose=verbose, regex_only=True)

    def _record_key(identifier):
        # Key each record by the first group captured from its identifier.
        return regex.search(identifier).groups()[0]

    seqdict = SeqIO.index(database, source, key_function=_record_key)
    try:
        record = seqdict[id_item]
    except KeyError:
        # Report the missing identifier through the second return slot.
        return None, id_item
    return record[slice(seq_range[0], seq_range[1])], None
def fun(self, hit, stat, verbose=False):
    """Tell whether the first annotation of *hit* resolves to *stat*.

    The hit description is split on ``'|-|'``; the second field is scanned for
    ``|[<id>:...]|`` items and the first ``<id>`` is passed to ``id_search``
    with ``id_type='symbol'``. The second item of that result is compared
    with *stat*.

    :param hit: object exposing a ``description`` attribute.
    :param stat: value the resolved symbol is compared against.
    :param verbose: forwarded to ``id_search``.
    :return: ``True`` when the resolved symbol equals *stat*; ``False`` in
        every other case, including a description with no annotation field,
        no matching annotation item, or an empty id.
    """
    # Raw string: the pattern contains backslash escapes that are not valid
    # string escapes.
    pat = re.compile(r'\|\[(.*?):.*\]\|')  # regex for items in annotation
    try:
        top_anno = hit.description.split('|-|')[1]
        # [0] raises IndexError when the annotation holds no matching item;
        # treat that like any other description that cannot be unpacked.
        id_lst = pat.findall(top_anno)[0].strip()
    except (ValueError, IndexError):
        print(hit.description, indent=2)
        print('Could not unpack annotations!', indent=2)
        return False
    if not id_lst:
        return False
    _, hit_symbol, _, _ = id_search(id_lst, id_type='symbol', verbose=verbose)
    return bool(stat == hit_symbol)
def count_reciprocal_best_hits_from_pandas(pandas_df):
    """Count, per species, the hits whose annotation resolves to their query.

    For every ``target_species`` / ``query_name`` pair, the values in column
    positions ``5:-1`` are gathered, the non-``None`` ones are parsed as FASTA
    records, and each record's annotation field (the text after ``'|-|'``) is
    scanned for ``|[...]|`` items. The joined items are passed to
    ``id_search``; when the symbol it returns equals the query name, that
    query's count is incremented.

    :param pandas_df: DataFrame with ``target_species`` and ``query_name``
        columns; hit strings are read from column positions ``5:-1``.
    :return: dict mapping each species to a ``Counter`` of query names.
    """
    # Raw string: the pattern contains backslash escapes that are not valid
    # string escapes.
    pat = re.compile(r'\|\[(.*?)\]\|')  # regex for items in annotation
    species_counters = {}
    for species in pandas_df.target_species.unique():
        counter = species_counters[species] = Counter()
        species_results = pandas_df.loc[pandas_df['target_species'] == species]
        for query in species_results.query_name.unique():
            print(query)
            query_rows = species_results.loc[species_results['query_name'] == query]
            # .ix was removed from pandas; .iloc is the positional equivalent.
            query_results = query_rows.iloc[:, 5:-1]
            raw_hits = []
            for _, row in query_results.iterrows():
                raw_hits += row.tolist()
            # The hits are stored as header-less FASTA text, so restore the
            # '>' marker and let SeqIO parse them.
            fasta_text = '\n'.join('>' + raw for raw in raw_hits if raw is not None)
            for hit in SeqIO.parse(StringIO(fasta_text), 'fasta'):
                try:
                    annotations = hit.description.split('|-|')[1]
                except (ValueError, IndexError):
                    # A description without '|-|' has no second field, which
                    # surfaces as IndexError.
                    print(hit.description, indent=2)
                    print('Could not unpack annotations!', indent=2)
                    continue
                id_lst = ''.join(pat.findall(annotations))
                if not id_lst:
                    print('No annotations found for record {0} in species {1}, query {2}'.format(hit.name, species,
                                                                                                 query))
                    continue
                _, hit_symbol, _, _ = id_search(id_lst, id_type='symbol', verbose=0)
                if query == hit_symbol:
                    counter[query] += 1
    return species_counters
def count_reciprocal_best_hits(recblast_out):
    """Count, per species, the hits whose annotation resolves to their query.

    For each record under ``recblast_results``, the annotation field (the text
    after ``'|-|'`` in the description) is scanned for ``|[...]|`` items. The
    joined items are passed to ``id_search``; when the symbol it returns
    equals the query key, that query's count is incremented.

    :param recblast_out: mapping ``species -> query -> dict``; each inner dict
        is expected to hold an iterable of records under
        ``'recblast_results'``. Queries without that key are reported and
        skipped.
    :return: dict mapping each species to a ``Counter`` of query names.
    """
    # Raw string: the pattern contains backslash escapes that are not valid
    # string escapes.
    pat = re.compile(r'\|\[(.*?)\]\|')  # regex for items in annotation
    species_counters = {}
    for species, species_dict in recblast_out.items():
        counter = species_counters[species] = Counter()
        for query, query_dict in species_dict.items():
            try:
                rc_out = query_dict['recblast_results']
            except KeyError:
                print('No entries in recblast_results for query {0} in species {1}'.format(query, species))
                continue
            for hit in rc_out:
                try:
                    annotations = hit.description.split('|-|')[1]
                except (ValueError, IndexError):
                    print(hit.description, indent=2)
                    print('Could not unpack annotations!', indent=2)
                    continue
                id_lst = ''.join(pat.findall(annotations))
                if not id_lst:
                    print('No annotations found for record {0} in species {1}, query {2}'.format(hit.name, species,
                                                                                                 query))
                    continue
                _, hit_symbol, _, _ = id_search(id_lst, id_type='symbol', verbose=0)
                if query == hit_symbol:
                    counter[query] += 1
    return species_counters
def simple_struct(recblast_out, verbose=True):
    """Returns a nice diagram of queries, targets, and annotations.

    Structure of the result::

        master_dict:  species   -> species_dict
        species_dict: query     -> query_dict
        query_dict:   target_id -> list of annotations

    :param recblast_out: either a single mapping
        ``species -> query -> {'recblast_results': records}`` or a list of
        them. List items for which the recursive call raises
        ``AttributeError`` are treated as already-built structures and merged
        as they are.
    :param verbose: when true, print the structure. In this mode every
        annotation is also replaced by the second item that ``id_search``
        returns for it, so the flag affects the returned data as well as the
        output.
    :return: the nested dict described above.
    """
    master_dict = {}
    # Raw string: the pattern contains backslash escapes that are not valid
    # string escapes.
    pat = re.compile(r'\|\[(.*?)\]\|')  # regex for items in annotation
    if isinstance(recblast_out, list):
        sub_structs = []
        for rc in recblast_out:
            try:
                # NOTE(review): verbose is not forwarded, so nested calls
                # always run with the default (verbose=True). Kept as-is
                # because the flag also changes the returned annotations.
                sub_structs.append(simple_struct(rc))
            except AttributeError:
                sub_structs.append(rc)
        for subdict in sub_structs:
            for species, species_dict in subdict.items():
                if isinstance(species_dict, Exception):
                    continue
                comb_spec_dict = master_dict.setdefault(species, {})
                for query, query_dict in species_dict.items():
                    comb_query_dict = comb_spec_dict.setdefault(query, {})
                    for target_id, annotation_list in query_dict.items():
                        comb_anno_list = comb_query_dict.setdefault(target_id, [])
                        if isinstance(annotation_list, list):
                            comb_anno_list.extend(annotation_list)
                        else:
                            comb_anno_list.append(annotation_list)
        return master_dict

    # assert isinstance(recblast_out, RecBlastContainer), 'Item in recblast_out was not a RecBlastContainer object!'
    try:
        # Drop a stray '__dict__' entry if present. This mutates the input.
        recblast_out.__delitem__('__dict__')
    except KeyError:
        pass
    for species, rc_spec_rec in recblast_out.items():
        species_dict = master_dict.setdefault(species, {})
        for query, rc_rec in rc_spec_rec.items():
            # The query entry is created even when it ends up with no records.
            query_dict = species_dict.setdefault(query, {})
            try:
                rc_out = rc_rec['recblast_results']
            except KeyError:
                print('No entries in recblast_results for query {0} in species {1}'.format(query, species))
                continue
            for record in rc_out:
                try:
                    target_id, annotations = record.description.split('|-|')
                except ValueError:
                    # The description did not split into exactly two fields.
                    print(record.description, indent=2)
                    continue
                target_list = query_dict.setdefault(target_id, [])
                id_lst = pat.findall(annotations)
                if id_lst:
                    target_list += id_lst
                else:
                    print('No annotations found for record {0} in species {1}, query {2}'.format(record.name,
                                                                                                 species,
                                                                                                 query))
    if verbose:
        print('*******************************************')
        for species, species_dict in master_dict.items():
            print(species, indent=0)
            for query, query_dict in species_dict.items():
                print(query, indent=1)
                for target_id, annotation_list in query_dict.items():
                    print(target_id, indent=2)
                    resolved = []
                    for annotation in annotation_list:
                        _, item, _, _ = id_search(annotation, id_type='brute', verbose=0)
                        resolved.append(item)
                    # Rebinding an existing key while iterating is safe: the
                    # dict does not change size.
                    query_dict[target_id] = resolved
                    for annotation in resolved:
                        print(annotation, indent=3)
        print('*******************************************')
    return master_dict