def all_by_all(species_1, species_2):
    """Collect (query, hit) record pairs from a saved pairwise BLAST result.

    Opens ``pairwise_blast/<species_1>_<species_2>-blast_results.xml``
    (relative to the current working directory) and parses it with
    ``NCBIXML.parse``.  For every BLAST record, only the first HSP of the
    first alignment is examined; the pair is kept when that HSP's
    ``align_length`` is greater than 100.

    Args:
        species_1: Name used as the first component of the results filename.
        species_2: Name used as the second component of the results filename.

    Returns:
        A list of ``(query_record, alignment_record)`` tuples, where each
        element is whatever ``kv.get_mongo_record`` returns for the parsed
        query / hit identifiers.  Returns ``None`` (after printing a
        message) when the results file cannot be opened.
    """
    # The BLAST run itself is expected to have happened already; see
    # fasta_blast(species_1, species_2) for producing this file.
    results = 'pairwise_blast/{}_{}-blast_results.xml'.format(species_1, species_2)

    # The path string is always truthy, so test the file itself: a failed
    # open is the only way to notice that BLAST produced no output.
    try:
        result_handle = open(results, 'r')
    except IOError:
        print("Blast didn't work for some reason")
        return None

    hits_list = []
    with result_handle:
        for blast_record in NCBIXML.parse(result_handle):
            qsp, qid = kv.fasta_id_parse(blast_record.query)
            query_record = kv.get_mongo_record(qsp, qid)
            for alignment in blast_record.alignments:
                asp, aid = kv.fasta_id_parse(alignment.hit_def)
                alignment_record = kv.get_mongo_record(asp, aid)
                for hsp in alignment.hsps:
                    if hsp.align_length > 100:
                        hits_list.append((query_record, alignment_record))
                    # Only the first HSP of an alignment is considered.
                    break
                # Only the first alignment of a record is considered.
                break
    return hits_list
Beispiel #2
0
def group_hits(core=False):
    """Write a per-species summary of every hit group to a CSV file.

    One column named ``group_<n>`` is produced for each distinct ``group``
    value in the ``hits`` collection, indexed by species name.  Each cell
    starts at 0.0 and accumulates:

    * the length of ``dna_seq`` for every distinct member of the matching
      entry in the length-sorted ``core_hgt_groups()`` list, and
    * ``hit[2] * hit[3] / 100`` for every distinct ``group_hits`` entry
      with ``hit[2] > 90`` and ``hit[3] > 100`` (presumably percent
      identity and alignment length -- confirm against the code that
      writes the ``hits`` collection).

    The table is saved as ``group_hits_other.csv`` in the current working
    directory.

    Args:
        core: When false (the default), species from the ``other``
            collection are included alongside those from ``core``.
    """
    species_names = kv.get_collection('core').distinct('species')
    if not core:
        species_names.extend(kv.get_collection('other').distinct('species'))

    hits_collection = kv.get_collection('hits')
    species_index = sorted(species_names)
    print(species_index)
    table = pd.DataFrame()

    # Largest groups first; group number n is looked up at position n - 1.
    ranked_groups = list(core_hgt_groups())
    ranked_groups.sort(key=len, reverse=True)

    for group in sorted(hits_collection.distinct('group')):
        seen = []
        totals = dict.fromkeys(species_index, 0.0)

        for member in ranked_groups[group - 1]:
            if member in seen:
                continue
            totals[member[0]] += len(kv.get_mongo_record(*member)['dna_seq'])
            seen.append(member)

        group_document = hits_collection.find_one({'group': group})
        for entry in group_document['group_hits']:
            # Thresholds are checked first so that rejected entries never
            # reach the duplicate lookup, as in a nested-if formulation.
            if not (float(entry[2]) > 90 and float(entry[3]) > 100):
                continue
            if entry[1] in seen:
                continue
            totals[kv.fasta_id_parse(entry[1])[0]] += float(entry[2]) * float(entry[3]) / 100
            seen.append(entry[1])

        column = 'group_{}'.format(group)
        table[column] = pd.Series(totals, name=column)

    table.to_csv('group_hits_other.csv')

# if __name__ == '__main__':
#     import os
#     kv.mongo_init('pacbio2')
#     os.chdir('/Users/KBLaptop/computation/kvasir/data/output/pacbio2/')
#     # group_hits(core=True)
#     # output_groups()
#     # core_hgt_stats()
#     output_hits_csv()
                '-dbtype', 'nucl',
                '-out', 'pairwise_blast/{}_blastdb'.format(species_2),
                '-title', os.path.basename(species_2),
            ]
        ).wait()

    indexed_blast = blast_one(s1, 'pairwise_blast/{}_blastdb'.format(species_2))
    concatenated_subject = kv.concat_contigs(kv.get_collection(species_1))
    
    xys = []
    last_end = 0
    
    for i in range(len(indexed_blast))[0::4]:
        # print indexed_blast[i:i+4]

        subject = concatenated_subject[ObjectId(kv.fasta_id_parse(indexed_blast[i])[1])]
        query = kv.get_mongo_record(*kv.fasta_id_parse(indexed_blast[i+1]))
        
        x1 = subject['location']['start']
        if x1 <= last_end:
            x1 = last_end + 1

        x2 = subject['location']['end']
        last_end = x2

        y = float(indexed_blast[i+2])
        print [(x1-0.1, 0), (x1, y), (x2, y), (x2+0.1, 0)]
        xys.extend([(x1-0.1, 0), (x1, y), (x2, y), (x2+0.1, 0)])

    xys.sort(key=lambda xy:xy[0])
    x, y = zip(*xys)