def all_by_all(species_1, species_2):
    """Collect (query, subject) record pairs from a saved pairwise BLAST run.

    Reads ``pairwise_blast/<species_1>_<species_2>-blast_results.xml`` and,
    for every query in it, inspects only the first HSP of the first
    alignment. When that HSP's alignment length exceeds 100, the pair
    ``(query_record, alignment_record)`` is appended to the result, where
    both records come from ``kv.get_mongo_record``.

    Args:
        species_1: Name used as the first component of the result filename.
        species_2: Name used as the second component of the result filename.

    Returns:
        A list of ``(query_record, alignment_record)`` tuples, or ``None``
        (after printing a message) when the result file does not exist.
    """
    import os  # local import so the block does not rely on a module-level one

    # results = fasta_blast(species_1, species_2)
    results = 'pairwise_blast/{}_{}-blast_results.xml'.format(species_1, species_2)

    # The path string itself is always truthy, so test for the file instead;
    # otherwise the failure message below could never be reached.
    if not os.path.isfile(results):
        print("Blast didn't work for some reason")
        return None

    hits_list = []
    with open(results, 'r') as result_handle:
        for blast_record in NCBIXML.parse(result_handle):
            qsp, qid = kv.fasta_id_parse(blast_record.query)
            query_record = kv.get_mongo_record(qsp, qid)
            # NOTE(review): the nesting of the two `break`s is reconstructed
            # from a whitespace-mangled source; this assumes only the first
            # HSP of the first alignment is examined -- confirm.
            for alignment in blast_record.alignments:
                asp, aid = kv.fasta_id_parse(alignment.hit_def)
                alignment_record = kv.get_mongo_record(asp, aid)
                for hsp in alignment.hsps:
                    if hsp.align_length > 100:
                        hits_list.append((query_record, alignment_record))
                    break
                break
    return hits_list
def group_hits(core=False):
    """Write a per-species, per-group table of hit totals to CSV.

    Species names are read from the 'core' collection and, unless ``core``
    is true, also from the 'other' collection. For each distinct ``group``
    value in the 'hits' collection one column named ``group_<n>`` is built:

    * every not-yet-seen member of the matching entry of the size-ranked
      ``core_hgt_groups()`` list adds the length of its ``'dna_seq'`` to
      the species named by its first element;
    * every not-yet-seen entry of the group's ``'group_hits'`` whose third
      field is above 90 and fourth field is above 100 adds
      ``field3 * field4 / 100`` to the species parsed from its second field.

    The table is saved as ``group_hits_other.csv``; nothing is returned.
    """
    species_names = kv.get_collection('core').distinct('species')
    if not core:
        species_names.extend(kv.get_collection('other').distinct('species'))
    hits_collection = kv.get_collection('hits')
    species_index = sorted(species_names)
    print(species_index)

    table = pd.DataFrame()
    # Largest groups first; group numbers index into this list 1-based.
    ranked_core_groups = sorted(core_hgt_groups(), key=len, reverse=True)

    for group in sorted(hits_collection.distinct('group')):
        column = 'group_{}'.format(group)
        seen = []
        totals = dict.fromkeys(species_index, 0.0)

        for member in ranked_core_groups[group - 1]:
            if member in seen:
                continue
            totals[member[0]] += len(kv.get_mongo_record(*member)['dna_seq'])
            seen.append(member)

        group_doc = hits_collection.find_one({'group': group})
        for entry in group_doc['group_hits']:
            score = float(entry[2])
            if not (score > 90):
                continue
            span = float(entry[3])
            if not (span > 100):
                continue
            if entry[1] in seen:
                continue
            totals[kv.fasta_id_parse(entry[1])[0]] += score * span / 100
            seen.append(entry[1])

        table[column] = pd.Series(totals, name=column)

    table.to_csv('group_hits_other.csv')

# if __name__ == '__main__':
#     import os
#     kv.mongo_init('pacbio2')
#     os.chdir('/Users/KBLaptop/computation/kvasir/data/output/pacbio2/')
#     # group_hits(core=True)
#     # output_groups()
#     # core_hgt_stats()
#     output_hits_csv()
'-dbtype', 'nucl', '-out', 'pairwise_blast/{}_blastdb'.format(species_2), '-title', os.path.basename(species_2), ] ).wait() indexed_blast = blast_one(s1, 'pairwise_blast/{}_blastdb'.format(species_2)) concatenated_subject = kv.concat_contigs(kv.get_collection(species_1)) xys = [] last_end = 0 for i in range(len(indexed_blast))[0::4]: # print indexed_blast[i:i+4] subject = concatenated_subject[ObjectId(kv.fasta_id_parse(indexed_blast[i])[1])] query = kv.get_mongo_record(*kv.fasta_id_parse(indexed_blast[i+1])) x1 = subject['location']['start'] if x1 <= last_end: x1 = last_end + 1 x2 = subject['location']['end'] last_end = x2 y = float(indexed_blast[i+2]) print [(x1-0.1, 0), (x1, y), (x2, y), (x2+0.1, 0)] xys.extend([(x1-0.1, 0), (x1, y), (x2, y), (x2+0.1, 0)]) xys.sort(key=lambda xy:xy[0]) x, y = zip(*xys)