def get_islands(species_name):
    """Group HGT hits for `species_name` that co-occur within 5kb on the
    same contig.

    Returns a list of lists of `(species, _id)` tuples, merged with
    `collapse_lists`.
    """
    islands = []
    species_hits_list = []
    all_hits = kv.get_collection('hits')
    # Bug fix: `species_hits` was referenced below but never defined, which
    # raised NameError on every call. Pull this species' per-gene hit map
    # from the hits collection.
    # NOTE(review): assumes the map lives under the 'hits' key, as read by
    # `pair_compare` — confirm against the schema written by `blast_to_db`.
    species_hits = all_hits.find_one({'species': species_name})['hits']
    # Add mongo_record for each gene that has at least one hit
    for query_id in species_hits:
        if species_hits[query_id]:
            species_hits_list.append(
                kv.get_mongo_record(species_name, query_id)
            )
    for entry_1 in species_hits_list:
        entry_recorded = False
        for entry_2 in species_hits_list:
            if entry_1 == entry_2:
                continue
            if entry_1['location']['contig'] != entry_2['location']['contig']:
                continue
            location_1 = entry_1['location']
            location_2 = entry_2['location']
            # Neighbors when the end of one CDS is within 5kb of the
            # start of the other
            if abs(location_1['end'] - location_2['start']) <= 5000:
                entry_recorded = True
                islands.append([
                    (entry_1['species'], str(entry_1['_id'])),
                    (entry_2['species'], str(entry_2['_id']))
                ])
        if not entry_recorded:
            # No nearby partner: record a singleton island
            islands.append([(entry_1['species'], str(entry_1['_id']))])
    return collapse_lists(islands)
def get_links(group=None, perc_identity='99'): hits_collection = kv.get_collection('hits') group_hits = None if not os.path.isdir('circos/links/'): os.makedirs('circos/links/') out_name = 'circos/links/all_links_{}.txt'.format(perc_identity) if group: groups = core_hgt_groups() group_hits = sorted(groups, key=len, reverse=True)[group - 1] out_name = 'circos/links/group{}_links_{}.txt'.format(group, perc_identity) with open(out_name, 'w+') as out_handle: for species in hits_collection.find(): print species try: all_hits = species['core_hits_{}'.format(perc_identity)] hits_to_write = None if group: hits_to_write = {gene:all_hits[gene] for gene in all_hits if (species['species'], gene) in group_hits} else: hits_to_write = all_hits for gene in hits_to_write: if hits_to_write[gene]: s1_record = kv.get_mongo_record(species['species'], gene) s1_strain = kv.parse_species_name(species['species']) for hit in hits_to_write[gene]: s2_record = kv.get_mongo_record(hit[0], hit[1]) s2_strain = kv.parse_species_name(hit[0]) out_handle.write('{0}kvc_{1} {2} {3} {4}kvc_{5} {6} {7}\n'.format( s1_strain[2], s1_record['location']['contig'], s1_record['location']['start'], s1_record['location']['end'], s2_strain[2], s2_record['location']['contig'], s2_record['location']['start'], s2_record['location']['end'], ) ) except KeyError: pass
def all_by_all(species_1, species_2): # results = fasta_blast(species_1, species_2) results = 'pairwise_blast/{}_{}-blast_results.xml'.format(species_1, species_2) if results: with open(results, 'r') as result_handle: blast_records = NCBIXML.parse(result_handle) hits_list = [] for blast_record in blast_records: qsp, qid = kv.fasta_id_parse(blast_record.query) query_record = kv.get_mongo_record(qsp, qid) for alignment in blast_record.alignments: asp, aid = kv.fasta_id_parse(alignment.hit_def) alignment_record = kv.get_mongo_record(asp, aid) for hsp in alignment.hsps: if hsp.align_length > 100: pident = float(hsp.positives)/float(hsp.align_length) length = hsp.align_length hits_list.append((query_record, alignment_record)) break break return hits_list else: print "Blast didn't work for some reason"
def pair_compare(species_1, species_2):
    """Tally how much of `species_1` hits `species_2`.

    Returns `(shared_CDS, shared_nt)`: the number of individual hits
    against `species_2` and the summed nucleotide span (end - start) of
    the hit CDSs.
    """
    shared_CDS = 0
    shared_nt = 0
    species_record = kv.get_collection('hits').find_one({'species': species_1})
    for gene_hits in species_record['hits'].values():
        if not gene_hits:
            continue
        for hit in gene_hits:
            if hit[0] != species_2:
                continue
            shared_CDS += 1
            hit_loc = kv.get_mongo_record(hit[0], hit[1])['location']
            shared_nt += hit_loc['end'] - hit_loc['start']
    return shared_CDS, shared_nt
def output_groups(output_file='default', min_group_size=2):
    """Write a CSV with one row per CDS in each HGT group.

    - `output_file`: destination path; 'default' resolves to
      '<db name>_groups.csv'.
    - `min_group_size`: groups with fewer members are skipped.
    """
    if output_file == 'default':
        # Bug fix: the format string had no placeholder, so
        # `kv.db.name` was silently dropped and every database wrote to
        # the same 'groups.csv'.
        output_file = '{}_groups.csv'.format(kv.db.name)
    df_index = [
        'groups',
        'species',
        'kvtag',
        'contig',
        'start',
        'end',
        'strand',
        'annotation',
        'dna_seq',
    ]
    df = pd.DataFrame()
    group_no = 0
    groups_list = get_groups()
    # Number groups largest-first so group 001 is the biggest
    groups_list.sort(key=len, reverse=True)
    for group in groups_list:
        if len(group) >= min_group_size:
            group_no += 1
            # Entry is `(species, id)`
            for entry in group:
                db_handle = kv.get_mongo_record(*entry)
                # Strip commas so annotations can't break the CSV
                annotation = db_handle['annotation'].replace(',', '')
                series = pd.Series(
                    [str(group_no).zfill(3),
                     db_handle['species'],
                     db_handle['kvtag'],
                     db_handle['location']['contig'],
                     db_handle['location']['start'],
                     db_handle['location']['end'],
                     db_handle['location']['strand'],
                     annotation,
                     db_handle['dna_seq']
                     ],
                    index=df_index,
                    name=db_handle['kvtag']
                )
                df = df.append(series)
    df.to_csv(output_file, columns=df_index)
def blast_to_db(db='core', perc_identity='99'):
    """Parse blast XML results for `db` at `perc_identity` and record the
    filtered hits on each species' document in the 'hits' collection.

    Hits are discarded when they are self-hits, same-genus hits, or 16S
    hits; everything else is stored as `(hit_name, hit_id)` tuples under
    the `<db>_hits_<perc_identity>` field.
    """
    blast_dir = 'blast_results/{}/'.format(db)
    for f in os.listdir(blast_dir):
        # Only process result files matching the requested identity threshold
        if f.endswith('{}_blast.xml'.format(perc_identity)):
            file_handle = 'blast_results/{}/{}'.format(db,f)
            with open(file_handle, 'r') as result_handle:
                blast_records = NCBIXML.parse(result_handle)
                # query_id -> list of (hit_species, hit_id) tuples
                hits_dict = {}
                for blast_record in blast_records:
                    # Query headers look like '<species>|<id>' with a leading
                    # '<Genus>_' prefix on the species name
                    query_parse = re.search(r'(\w+)\|(\w+)', blast_record.query)
                    query_genus_parse = re.match(r'([A-Za-z]+)_', blast_record.query)
                    query_genus = query_genus_parse.group(1)
                    query_name = query_parse.group(1)
                    query_id = query_parse.group(2)
                    hits_dict[query_id] = []
                    for alignment in blast_record.alignments:
                        hit_parse = re.search(r'(\w+)\|(\w+)', alignment.hit_def)
                        hit_genus_parse = re.match(r'([A-Za-z]+)_', alignment.hit_def)
                        hit_genus = hit_genus_parse.group(1)
                        hit_name = hit_parse.group(1)
                        hit_id = hit_parse.group(2)
                        if query_name == hit_name:
                            # self-hit: ignore
                            pass
                        elif query_genus == hit_genus:
                            # same genus is assumed vertical, not HGT: skip
                            print "Oops! {} and {} are the same genus, skipping...".format(query_name, hit_name)
                            pass
                        elif kv.get_mongo_record(hit_name, hit_id)['type'] == '16S':
                            print 'Skipping 16S hit'
                        else:
                            print '=======\nhit for {0} detected:\nspecies: {1}\n======='.format(query_name, hit_name)
                            hits_dict[query_id].append((hit_name, hit_id))
                # NOTE(review): this update uses `query_name` from the last
                # blast record in the file — it appears to assume every query
                # in one result file belongs to a single species; confirm.
                print 'Updataing mongoDB with hits'
                hits_collection = kv.get_collection('hits')
                # Store only queries that actually have surviving hits
                hits_collection.update_one(
                    {'species':query_name},
                    {'$set':{'{}_hits_{}'.format(db, perc_identity):{x:hits_dict[x] for x in hits_dict if hits_dict[x]}}},
                    upsert=True
                    )
def get_islands(species_name, perc_identity='99'):
    """
    Combine a species' core HGT hits that fall within 5kb of each other
    on the same contig.

    Returns a list of lists of `(species, _id)` tuples, merged with
    `collapse_lists`.
    """
    islands = []
    hit_records = []
    hits_collection = kv.get_collection('hits')
    gene_hits = hits_collection.find_one(
        {'species': species_name}
    )['core_hits_{}'.format(perc_identity)]
    # Fetch the full mongo record for every gene with at least one hit
    for gene_id in gene_hits:
        if gene_hits[gene_id]:
            hit_records.append(kv.get_mongo_record(species_name, gene_id))
    for record in hit_records:
        paired = False
        for other in hit_records:
            if record == other:
                continue
            if record['location']['contig'] != other['location']['contig']:
                continue
            # Neighbors when the end of one CDS is within 5kb of the
            # start of the other
            if abs(record['location']['end'] - other['location']['start']) <= 5000:
                paired = True
                islands.append([
                    (record['species'], str(record['_id'])),
                    (other['species'], str(other['_id']))
                ])
        if not paired:
            # No nearby partner found: record a singleton island
            islands.append([(record['species'], str(record['_id']))])
    return collapse_lists(islands)
def group_hits(core=False):
    """Build a species-by-group matrix of hit coverage and write it to
    'group_hits_other.csv'.

    - `core`: when True, only species in the 'core' collection are
      columns; otherwise species from 'other' are included too.

    Each cell accumulates, per species, the dna length of that species'
    group members plus a weighted length for additional group hits.
    """
    all_species = kv.get_collection('core').distinct('species')
    if not core:
        all_species.extend(kv.get_collection('other').distinct('species'))
    hits_db = kv.get_collection('hits')
    species_index = sorted(all_species)
    print species_index
    df = pd.DataFrame()
    # Groups ranked largest-first; group numbers below are 1-based ranks
    core_groups = sorted(core_hgt_groups(), key=len, reverse=True)
    for group in sorted(hits_db.distinct('group')):
        # `recorded` prevents double counting.
        # NOTE(review): it holds `(species, id)` tuples from the first loop
        # but bare fasta ids (`hit[1]`) from the second, so the two loops
        # deduplicate independently — confirm that is intended.
        recorded = []
        s = {sp:0.0 for sp in species_index}
        # Full dna length for every member of this group
        for hit in core_groups[group-1]:
            if not hit in recorded:
                s[hit[0]] += len(kv.get_mongo_record(*hit)['dna_seq'])
                recorded.append(hit)
        # Additional blast hits against the group: count identity-weighted
        # length (pident * align_length / 100) for strong hits only.
        # NOTE(review): assumes hit layout is (query, subject_fasta_id,
        # pident, align_length) — confirm against the 'group_hits' schema.
        for hit in hits_db.find_one({'group':group})['group_hits']:
            if float(hit[2]) > 90 and float(hit[3]) > 100:
                if hit[1] not in recorded:
                    s[kv.fasta_id_parse(hit[1])[0]] += float(hit[2])*float(hit[3])/100
                    recorded.append(hit[1])
        s = pd.Series(s, name='group_{}'.format(group))
        df['group_{}'.format(group)] = s
    df.to_csv('group_hits_other.csv')

# if __name__ == '__main__':
#     import os
#     kv.mongo_init('pacbio2')
#     os.chdir('/Users/KBLaptop/computation/kvasir/data/output/pacbio2/')
#     # group_hits(core=True)
#     # output_groups()
#     # core_hgt_stats()
#     output_hits_csv()
def output_groups(min_group_size=2):
    """
    Write a CSV with one row per CDS in each HGT group to
    '<db name>_groups.csv'.
    - Optional: set minimum number of CDS to be considered a group
    """
    output_file = '{}_groups.csv'.format(kv.db.name)
    columns = ['group','kvtag','contig','start','end','strand','annotation','dna_seq']
    df = pd.DataFrame()
    counter = 0
    groups = core_hgt_groups()
    # Number groups largest-first so group 001 is the biggest
    groups.sort(key=len, reverse=True)
    for members in groups:
        if len(members) < min_group_size:
            continue
        # Keep members of a group ordered by species name
        members.sort(key=lambda member: member[0])
        counter += 1
        for species, record_id in members:  # each member is `(species, id)`
            record = kv.get_mongo_record(species, record_id)
            # Strip commas so annotations can't break the CSV
            annotation = record['annotation'].replace(',', '')
            row = pd.Series(
                [str(counter).zfill(3),
                 record['kvtag'],
                 record['location']['contig'],
                 record['location']['start'],
                 record['location']['end'],
                 record['location']['strand'],
                 annotation,
                 record['dna_seq']
                 ],
                index=columns,
                name=record['species']
            )
            df = df.append(row)
    df.to_csv(output_file, columns=columns)
'-out', 'pairwise_blast/{}_blastdb'.format(species_2), '-title', os.path.basename(species_2), ] ).wait() indexed_blast = blast_one(s1, 'pairwise_blast/{}_blastdb'.format(species_2)) concatenated_subject = kv.concat_contigs(kv.get_collection(species_1)) xys = [] last_end = 0 for i in range(len(indexed_blast))[0::4]: # print indexed_blast[i:i+4] subject = concatenated_subject[ObjectId(kv.fasta_id_parse(indexed_blast[i])[1])] query = kv.get_mongo_record(*kv.fasta_id_parse(indexed_blast[i+1])) x1 = subject['location']['start'] if x1 <= last_end: x1 = last_end + 1 x2 = subject['location']['end'] last_end = x2 y = float(indexed_blast[i+2]) print [(x1-0.1, 0), (x1, y), (x2, y), (x2+0.1, 0)] xys.extend([(x1-0.1, 0), (x1, y), (x2, y), (x2+0.1, 0)]) xys.sort(key=lambda xy:xy[0]) x, y = zip(*xys)