def get_islands(species_name):
    islands = []
    species_hits_list = []
    
    # Add mongo_record for each hit in any gene
    all_hits = kv.get_collection('hits')
    species_hits = all_hits.find_one({'species': species_name})['hits']  # hit dict for this species

    for query_id in species_hits:
        if species_hits[query_id]:
            species_hits_list.append(
                kv.get_mongo_record(species_name, query_id)
                )

    for entry_1 in species_hits_list:
        entry_recorded = False
        for entry_2 in species_hits_list:
            if entry_1 == entry_2:
                pass
            elif entry_1['location']['contig'] != entry_2['location']['contig']:
                pass
            else:
                location_1 = entry_1['location']
                location_2 = entry_2['location']
                if abs(location_1['end'] - location_2['start']) <= 5000:
                    entry_recorded = True
                    islands.append([
                        (entry_1['species'], str(entry_1['_id'])),
                        (entry_2['species'], str(entry_2['_id']))
                    ])
        if not entry_recorded:
            islands.append([(entry_1['species'], str(entry_1['_id']))])

    return collapse_lists(islands)
Example #2
def get_links(group=None, perc_identity='99'):
    hits_collection = kv.get_collection('hits')
    group_hits = None
    if not os.path.isdir('circos/links/'):
        os.makedirs('circos/links/')
    out_name = 'circos/links/all_links_{}.txt'.format(perc_identity)
    if group:
        groups = core_hgt_groups()
        group_hits = sorted(groups, key=len, reverse=True)[group - 1]
        out_name = 'circos/links/group{}_links_{}.txt'.format(group, perc_identity)
    
    with open(out_name, 'w+') as out_handle:
        for species in hits_collection.find():
            print species
            try:
                all_hits = species['core_hits_{}'.format(perc_identity)]
                hits_to_write = None
                if group:
                    hits_to_write = {gene:all_hits[gene] for gene in all_hits if (species['species'], gene) in group_hits}
                else:
                    hits_to_write = all_hits
                for gene in hits_to_write:
                    if hits_to_write[gene]:
                        s1_record = kv.get_mongo_record(species['species'], gene)
                        s1_strain = kv.parse_species_name(species['species'])
                        for hit in hits_to_write[gene]:
                            s2_record = kv.get_mongo_record(hit[0], hit[1])
                            s2_strain = kv.parse_species_name(hit[0])
                            out_handle.write('{0}kvc_{1} {2} {3} {4}kvc_{5} {6} {7}\n'.format(
                                s1_strain[2],
                                s1_record['location']['contig'],
                                s1_record['location']['start'],
                                s1_record['location']['end'],
                                s2_strain[2],
                                s2_record['location']['contig'],
                                s2_record['location']['start'],
                                s2_record['location']['end'],
                                )
                            )        
            except KeyError:
                pass
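
# Each record written by get_links() is a space-delimited circos link line of
# the form '<strain>kvc_<contig> <start> <end> <strain>kvc_<contig> <start> <end>',
# e.g. (illustrative values only):
# Bsu1kvc_contig004 12030 13500 Eco2kvc_contig011 44020 45490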
Example #3
def all_by_all(species_1, species_2):
    # results = fasta_blast(species_1, species_2)
    results = 'pairwise_blast/{}_{}-blast_results.xml'.format(species_1, species_2)
    if os.path.isfile(results):
        with open(results, 'r') as result_handle:
            blast_records = NCBIXML.parse(result_handle)
            hits_list = []
            for blast_record in blast_records:
                qsp, qid = kv.fasta_id_parse(blast_record.query)
                query_record = kv.get_mongo_record(qsp, qid)
                for alignment in blast_record.alignments:
                    asp, aid = kv.fasta_id_parse(alignment.hit_def)
                    alignment_record = kv.get_mongo_record(asp, aid)
                    for hsp in alignment.hsps:
                        if hsp.align_length > 100:
                            pident = float(hsp.positives)/float(hsp.align_length)
                            length = hsp.align_length
                            hits_list.append((query_record, alignment_record))
                        # note: only the first HSP of the first alignment is examined
                        break
                    break
            return hits_list
    else:
        print "Blast didn't work for some reason"
def pair_compare(species_1, species_2):    
    shared_CDS = 0
    shared_nt = 0

    s1_genes = kv.get_collection('hits').find_one({'species':species_1})
    
    for gene in s1_genes['hits']:
        if s1_genes['hits'][gene]:
            for hit in s1_genes['hits'][gene]:
                if hit[0] == species_2:
                    shared_CDS += 1
                    species_2_record = kv.get_mongo_record(hit[0],hit[1])
                    hit_loc = species_2_record['location']
                    shared_nt += hit_loc['end'] - hit_loc['start']
    return shared_CDS, shared_nt
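
# pair_compare() totals hits for one ordered pair of species; a hypothetical
# driver (not part of the original module) could apply it to every species
# pair found in the 'hits' collection:
from itertools import combinations

def compare_all_pairs():
    # Hypothetical usage sketch: run pair_compare() on every pair of species
    # present in the 'hits' collection and collect the (CDS, nt) totals.
    all_species = kv.get_collection('hits').distinct('species')
    results = {}
    for species_1, species_2 in combinations(sorted(all_species), 2):
        results[(species_1, species_2)] = pair_compare(species_1, species_2)
    return results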
def output_groups(output_file='default', min_group_size=2):
    if output_file == 'default':
        output_file = '{}_groups.csv'.format(kv.db.name)

    df_index = [
        'groups',
        'species',
        'kvtag',
        'contig',
        'start',
        'end',
        'strand',
        'annotation',
        'dna_seq',
    ]
    df = pd.DataFrame()
    group_no = 0
    groups_list = get_groups()
    groups_list.sort(key=len, reverse=True)

    for group in groups_list:
        if len(group) >= min_group_size:
            group_no += 1
            # Entry is `(species, id)`
            for entry in group:
                db_handle = kv.get_mongo_record(*entry)

                annotation = db_handle['annotation'].replace(',', '')  # prevents CSV screw-up
                series = pd.Series(
                    [str(group_no).zfill(3),
                    db_handle['species'],
                    db_handle['kvtag'],
                    db_handle['location']['contig'],
                    db_handle['location']['start'],
                    db_handle['location']['end'],
                    db_handle['location']['strand'],
                    annotation,
                    db_handle['dna_seq']
                    ],
                    index=df_index,
                    name=db_handle['kvtag']
                )
                df = df.append(series)
    df.to_csv(output_file, columns=df_index)
Example #6
def blast_to_db(db='core', perc_identity='99'):
    blast_dir = 'blast_results/{}/'.format(db)
    for f in os.listdir(blast_dir):
        if f.endswith('{}_blast.xml'.format(perc_identity)):
            file_handle = 'blast_results/{}/{}'.format(db,f)
            with open(file_handle, 'r') as result_handle:
                blast_records = NCBIXML.parse(result_handle)
                hits_dict = {}
                for blast_record in blast_records:
                    query_parse = re.search(r'(\w+)\|(\w+)', blast_record.query)
                    query_genus_parse = re.match(r'([A-Za-z]+)_', blast_record.query)
                    query_genus = query_genus_parse.group(1)
                    query_name = query_parse.group(1)
                    query_id = query_parse.group(2)

                    hits_dict[query_id] = []

                    for alignment in blast_record.alignments:
                        hit_parse = re.search(r'(\w+)\|(\w+)', alignment.hit_def)
                        hit_genus_parse = re.match(r'([A-Za-z]+)_', alignment.hit_def)
                        hit_genus = hit_genus_parse.group(1)

                        hit_name = hit_parse.group(1)
                        hit_id = hit_parse.group(2)
                        if query_name == hit_name:
                            pass
                        elif query_genus == hit_genus:
                            print "Oops! {} and {} are the same genus, skipping...".format(query_name, hit_name)
                            pass
                        elif kv.get_mongo_record(hit_name, hit_id)['type'] == '16S':
                            print 'Skipping 16S hit'
                        else:
                            print '=======\nhit for {0} detected:\nspecies: {1}\n======='.format(query_name, hit_name)
                            hits_dict[query_id].append((hit_name, hit_id))
                    
                print 'Updating mongoDB with hits'
                hits_collection = kv.get_collection('hits')
                hits_collection.update_one(
                    {'species':query_name},
                    {'$set':{'{}_hits_{}'.format(db, perc_identity):{x:hits_dict[x] for x in hits_dict if hits_dict[x]}}},
                    upsert=True
                    ) 
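
# The update_one() call above leaves each species document in the 'hits'
# collection with a '<db>_hits_<perc_identity>' field mapping query ids to hit
# tuples (queries with no hits are dropped by the dict comprehension). This is
# the structure that get_islands() and get_links() read back. Roughly, with
# placeholder values only:
example_hits_doc = {
    'species': 'ExampleSpecies',                      # placeholder name
    'core_hits_99': {                                 # db='core', perc_identity='99'
        'query_id_1': [('OtherSpecies', 'hit_id_a'),
                       ('ThirdSpecies', 'hit_id_b')],
    },
}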
Example #7
def get_islands(species_name, perc_identity='99'):
    """
    For each species, combines HGT hits co-occurring within 5kb of eachother
    Returns list of lists of `(species, _id)` tuples
    """
        
    islands = []
    species_hits_list = []
    
    # Add mongo_record for each hit in any gene
    all_hits = kv.get_collection('hits')
    species_hits = all_hits.find_one({'species':species_name})['core_hits_{}'.format(perc_identity)]

    
    for query_id in species_hits:
        if species_hits[query_id]:
            species_hits_list.append(
                kv.get_mongo_record(species_name, query_id)
                )

    for entry_1 in species_hits_list:
        entry_recorded = False
        for entry_2 in species_hits_list:
            if entry_1 == entry_2:
                pass
            elif entry_1['location']['contig'] != entry_2['location']['contig']:
                pass
            else:
                location_1 = entry_1['location']
                location_2 = entry_2['location']
                if abs(location_1['end'] - location_2['start']) <= 5000:
                    entry_recorded = True
                    islands.append([
                        (entry_1['species'], str(entry_1['_id'])),
                        (entry_2['species'], str(entry_2['_id']))
                    ])
        if not entry_recorded:
            islands.append([(entry_1['species'], str(entry_1['_id']))])

    return collapse_lists(islands)
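
# collapse_lists() is not included in these examples; judging from its use here
# it merges any islands that share a `(species, _id)` tuple, so overlapping hit
# pairs end up in a single list. A minimal sketch under that assumption:
def collapse_lists(list_of_lists):
    # Hypothetical sketch: repeatedly merge lists that share at least one
    # element, so transitively-overlapping islands collapse into one.
    sets = [set(l) for l in list_of_lists]
    merged = []
    while sets:
        current = sets.pop()
        changed = True
        while changed:
            changed = False
            for other in sets[:]:
                if current & other:
                    current |= other
                    sets.remove(other)
                    changed = True
        merged.append(sorted(current))
    return merged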
Example #8
def group_hits(core=False):
    all_species = kv.get_collection('core').distinct('species')
    if not core:
        all_species.extend(kv.get_collection('other').distinct('species'))
    

    hits_db = kv.get_collection('hits')
    species_index = sorted(all_species)
    print species_index
    df = pd.DataFrame()
    core_groups = sorted(core_hgt_groups(), key=len, reverse=True)


    for group in sorted(hits_db.distinct('group')):
        recorded = []
        s = {sp:0.0 for sp in species_index}
        for hit in core_groups[group-1]:
            if hit not in recorded:
                s[hit[0]] += len(kv.get_mongo_record(*hit)['dna_seq'])
                recorded.append(hit)
        
        for hit in hits_db.find_one({'group':group})['group_hits']:
            if float(hit[2]) > 90 and float(hit[3]) > 100:
                if hit[1] not in recorded:
                    s[kv.fasta_id_parse(hit[1])[0]] += float(hit[2])*float(hit[3])/100
                    recorded.append(hit[1])
                
        s = pd.Series(s, name='group_{}'.format(group))
        df['group_{}'.format(group)] = s

    df.to_csv('group_hits_other.csv')
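
# The CSV written by group_hits() has one row per species and one group_N
# column per HGT group; each cell holds that species' summed matched sequence
# length for the group (full gene length for recorded core hits, plus an
# identity-weighted alignment length for the supplementary 'group_hits' matches).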

# if __name__ == '__main__':
#     import os
#     kv.mongo_init('pacbio2')
#     os.chdir('/Users/KBLaptop/computation/kvasir/data/output/pacbio2/')
#     # group_hits(core=True)
#     # output_groups()
#     # core_hgt_stats()
#     output_hits_csv()
Example #9
def output_groups(min_group_size=2):
    """
    Writes a .csv file with information for each CDS in an HGT group

    - Optional: set minimum number of CDS to be considered a group
    """ 
    output_file = '{}_groups.csv'.format(kv.db.name)
    df_index = ['group','kvtag','contig','start','end','strand','annotation','dna_seq']
    df = pd.DataFrame()
    group_no = 0
    groups_list = core_hgt_groups()
    groups_list.sort(key=len, reverse=True)

    for group in groups_list:
        if len(group) >= min_group_size:
            group.sort(key=lambda entry:entry[0])
            group_no += 1
            for entry in group: # Entry is `(species, id)`
                
                db_handle = kv.get_mongo_record(*entry)
                annotation = db_handle['annotation'].replace(',','') # prevents CSV screw-up
                series = pd.Series(
                    [str(group_no).zfill(3),
                    db_handle['kvtag'],
                    db_handle['location']['contig'],
                    db_handle['location']['start'],
                    db_handle['location']['end'],
                    db_handle['location']['strand'],
                    annotation,
                    db_handle['dna_seq']
                    ],
                    index=df_index,
                    name=db_handle['species']
                )
                df = df.append(series)
    df.to_csv(output_file, columns=df_index)
Example #10
                '-out', 'pairwise_blast/{}_blastdb'.format(species_2),
                '-title', os.path.basename(species_2),
            ]
        ).wait()

    indexed_blast = blast_one(s1, 'pairwise_blast/{}_blastdb'.format(species_2))
    concatenated_subject = kv.concat_contigs(kv.get_collection(species_1))
    
    xys = []
    last_end = 0
    
    for i in range(0, len(indexed_blast), 4):
        # print indexed_blast[i:i+4]

        subject = concatenated_subject[ObjectId(kv.fasta_id_parse(indexed_blast[i])[1])]
        query = kv.get_mongo_record(*kv.fasta_id_parse(indexed_blast[i+1]))
        
        x1 = subject['location']['start']
        if x1 <= last_end:
            x1 = last_end + 1

        x2 = subject['location']['end']
        last_end = x2

        y = float(indexed_blast[i+2])
        print [(x1-0.1, 0), (x1, y), (x2, y), (x2+0.1, 0)]
        xys.extend([(x1-0.1, 0), (x1, y), (x2, y), (x2+0.1, 0)])

    xys.sort(key=lambda xy:xy[0])
    x, y = zip(*xys)