def get_islands(species_name):
    """
    Combine a species' HGT hits that lie within 5 kb of each other into islands.

    Reads the `hits` collection document for `species_name`, fetches the
    gene record of every query that has at least one hit, and pairs up
    records on the same contig whose end/start coordinates are at most
    5000 apart.  Records without such a neighbour become single-member
    islands.

    Returns the result of `collapse_lists` applied to the list of
    `(species, _id)` tuple lists.
    """
    islands = []
    all_hits = kv.get_collection('hits')
    # The original body read `species_hits` without ever assigning it
    # (NameError).  The per-species document keeps its hits under 'hits',
    # the same key `pair_compare` and `get_groups` read.
    species_hits = all_hits.find_one({'species': species_name})['hits']

    # Gene records for every query that has at least one hit
    species_hits_list = [
        kv.get_mongo_record(species_name, query_id)
        for query_id in species_hits
        if species_hits[query_id]
    ]

    for entry_1 in species_hits_list:
        entry_recorded = False
        for entry_2 in species_hits_list:
            if entry_1 == entry_2:
                continue
            location_1 = entry_1['location']
            location_2 = entry_2['location']
            if location_1['contig'] != location_2['contig']:
                continue
            if abs(location_1['end'] - location_2['start']) <= 5000:
                entry_recorded = True
                islands.append([
                    (entry_1['species'], str(entry_1['_id'])),
                    (entry_2['species'], str(entry_2['_id'])),
                ])
        # A hit with no neighbour still forms an island of its own
        if not entry_recorded:
            islands.append([(entry_1['species'], str(entry_1['_id']))])

    return collapse_lists(islands)
def output_compare_matrix(): groups = get_groups() all_species = get_tree()[1] print all_species cds_df = pd.DataFrame(data={n:0 for n in all_species}, index=all_species, columns=all_species) nt_df = pd.DataFrame(data={n:0 for n in all_species}, index=all_species, columns=all_species) groups_df = pd.DataFrame(data={n:0 for n in all_species}, index=all_species, columns=all_species) for pair in combinations(all_species, 2): print "====\nComparing {} and {}".format(pair[0], pair[1]) shared_cds, shared_nt = pair_compare(pair[0],pair[1]) shared_groups = 0 if kv.get_genus(pair[0]) == kv.get_genus(pair[1]): print "Oops! They're the same genus... moving on\n====" continue elif shared_cds == 0: print "Oops! Looks like they don't share anything... moving on\n====" continue else: for group in get_groups(): if [x for x in group if x[0] == pair[0] and any(y[0] == pair[1] for y in group)]: shared_groups +=1 cds_df[pair[0]][pair[1]] = cds_df[pair[1]][pair[0]] = shared_cds nt_df[pair[0]][pair[1]] = nt_df[pair[1]][pair[0]] = shared_nt groups_df[pair[0]][pair[1]] = groups_df[pair[1]][pair[0]] = shared_groups print "shared cds: {}\nshared nt: {}\nshared groups: {}\n====".format(shared_cds, shared_nt, shared_groups) print nt_df cds_df.to_csv('cds_matrix.csv'.format(kv.db.name)) nt_df.to_csv('nt_matrix.csv'.format(kv.db.name)) groups_df.to_csv('groups_matrix.csv' .format(kv.db.name))
def core_hgt_blast(perc_identity='99'): """ Blasts all core genomes against core db - Set `perc_identity` if desired (default = 99) """ if not os.path.isdir('blast_results/core/'): os.makedirs('blast_results/core/') for species in kv.get_collection('core').distinct('species'): query_fasta = 'blast_results/core/{}_tmp.fna'.format(species) with open(query_fasta, 'w+') as query_handle: for query in kv.get_collection('core').find({'species':species}): if query['type'] == 'gene': query_handle.write('>{0}|{1}\n{2}\n'.format( query['species'], query['_id'], query['dna_seq'] ) ) print 'Blasting {0}'.format(species) out = Popen( ['blastn', '-query', query_fasta, '-db', 'blast_databases/core', '-outfmt', '5', '-out', 'blast_results/core/{}_{}_blast.xml'.format(species, perc_identity), '-perc_identity', perc_identity ], stdout=PIPE ).communicate()[0] os.remove(query_fasta)
def ssu_fasta(): with open('16s.fna', 'w+') as out_handle: for species in kv.get_collection('16S').distinct('species'): ssu = kv.get_collection('16S').find_one({'species':species}) if ssu: out_handle.write(kv.make_gene_fasta(ssu, to_file=False)) else: print species
def other_blast():
    """
    Blast every core HGT group against the `other` BLAST database and store
    the results in the `hits` collection.

    Groups from `core_hgt_groups()` are numbered from 1 in order of
    decreasing size; one document `{'group': n, 'group_hits': [...]}` is
    inserted per group.
    """
    groups_list = core_hgt_groups()
    # Largest group first, so group number 1 is the biggest group
    groups_list.sort(key=len, reverse=True)
    for i in range(len(groups_list)):
        group_hits = []
        # presumably writes the group's sequences to 'tmp.fna', which is
        # blasted on the next line — TODO confirm against kv.make_id_list_fasta
        kv.make_id_list_fasta(groups_list[i], 'core')
        results = blast_vs_db('tmp.fna', 'blast_databases/other')
        hits_collection = kv.get_collection('hits')
        if results:
            for j in range(len(results)):
                group_hits.append(results[j])
        # NOTE(review): the original indentation was lost; this insert is
        # assumed to run for every group (with an empty list when there are
        # no results) rather than only inside `if results:` — confirm.
        hits_collection.insert_one({'group':(i+1), 'group_hits':group_hits})
def make_indexed_fasta(species):
    """
    Write `fastas/<species>_indexed.fna` from the contig-indexed records of
    the `species` collection.

    `kv.index_contigs` is called twice on the collection; the first result
    is iterated in full (its ids are collected but not used afterwards) and
    the second supplies the records written to the fasta.  Nothing is
    written when the fasta file already exists.
    """
    out_dir = 'fastas/'
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    first_pass = kv.index_contigs(kv.get_collection(species))
    second_pass = kv.index_contigs(kv.get_collection(species))
    fasta = 'fastas/{}_indexed.fna'.format(species)

    # Walk the first result completely, as the original loop did
    id_list = [record['_id'] for record in first_pass]

    if os.path.isfile(fasta):
        return

    with open(fasta, 'w+') as output_handle:
        for record in second_pass:
            species_label = record['species'].replace(' ', '_')
            output_handle.write(
                ">{}|{}\n{}\n".format(species_label, record['_id'], record['dna_seq'])
            )
def pair_compare(species_1, species_2):
    """
    Count what `species_1` shares with `species_2` according to the `hits`
    collection.

    Walks every hit stored under 'hits' in the `species_1` document; each
    hit whose first element equals `species_2` adds one to the CDS count and
    adds `end - start` of the hit gene's location to the nucleotide total.

    Returns the tuple `(shared_CDS, shared_nt)`.
    """
    cds_count = 0
    nt_total = 0
    hits_by_gene = kv.get_collection('hits').find_one({'species': species_1})['hits']

    for gene_hits in hits_by_gene.values():
        if not gene_hits:
            continue
        for hit in gene_hits:
            if hit[0] != species_2:
                continue
            cds_count += 1
            hit_loc = kv.get_mongo_record(hit[0], hit[1])['location']
            nt_total += hit_loc['end'] - hit_loc['start']

    return cds_count, nt_total
def get_groups():
    """
    Build multi-species groups of genes as a list of lists of
    `(species, _id)` tuples.

    For every document in the `hits` collection, each island from
    `get_islands()` is extended with the hits of all of its genes (looked up
    under the document's 'hits' key).  Because islands are built per species
    the result is redundant, so the groups are merged with `collapse_lists`
    before being returned.
    """
    groups = []
    for species_doc in kv.get_collection('hits').find():
        for island in get_islands(species_doc['species']):
            # Gather the hits first: the island cannot grow while it is
            # being iterated
            linked = set()
            for member in island:
                for hit in species_doc['hits'][member[1]]:
                    linked.add((hit[0], hit[1]))
            island.update(linked)
            groups.append(list(island))

    return [list(group) for group in collapse_lists(groups)]
def get_gc(some_gbk):
    """
    Get GC content for contigs in genbank file, output file for circos line plot

    GC is computed with `SeqUtils.GC` on the slice `[b-500:b+499]` around
    every 1000th position `b` (starting at 500) of each record.  The points
    are written to `circos/GC/gc_<name>.txt`, where `<name>` is the file's
    base name without its last 13 characters, preceded by a commented
    header with min, max, average and standard deviation.

    Returns the tuple `(min, max, average, std)` of the GC values.
    Raises ValueError when the file yields no GC windows (no records, or
    only records of 500 bases or fewer).
    """
    gc_points = []
    with open(some_gbk, 'r') as file_handle:
        for record in SeqIO.parse(file_handle, 'gb'):
            for b in range(500, len(record), 1000):
                gc_cont = SeqUtils.GC(record.seq[b-500:b+499])
                gc_points.append((record.name, b-500, b+499, gc_cont))

    if not gc_points:
        # Previously surfaced as an opaque TypeError / min() ValueError
        raise ValueError('no GC windows could be computed from {}'.format(some_gbk))

    # The name depends only on the file, so parse it once rather than per record
    sp_strain = kv.parse_genbank_name(some_gbk)[2]

    # Create the output directory if needed, as get_karyotype does for its own
    out_dir = 'circos/GC/'
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    values = [point[3] for point in gc_points]
    stats = (min(values), max(values), np.average(values), np.std(values))

    with open('circos/GC/gc_{}.txt'.format(os.path.basename(some_gbk)[:-13]), 'w+') as out_handle:
        out_handle.write("# Min: {}\n# Max: {}\n# Avg, Std: {}, {}\n".format(
            stats[0], stats[1], stats[2], stats[3]))
        for point in gc_points:
            out_handle.write('{0}{1} {2} {3} {4}\n'.format(
                sp_strain, point[0], point[1], point[2], point[3]))

    return stats
def core_hgt_groups(perc_identity='99'):
    """
    Returns multispecies groups of genes as list of lists.

    - Starts with species islands `get_islands()`
    - For each island, if a hit from that island is in another island,
      group them together
    - `perc_identity` selects which `core_hits_<perc_identity>` entry of
      each species document is used, both for building the islands and
      for looking up their hits
    """
    all_hits = kv.get_collection('hits')
    hits_key = 'core_hits_{}'.format(perc_identity)
    groups_list = []

    for s in all_hits.distinct('species'):
        s_hits = all_hits.find_one({'species': s})
        # Forward perc_identity: islands built from the default 99% hits
        # may contain genes that have no entry under another identity's key.
        current_species_islands = get_islands(s_hits['species'], perc_identity=perc_identity)

        # each sublist represents one island...
        for island in current_species_islands:
            if not island:  # many lists are empty, skip those
                continue
            hit_set = set()  # container for hits
            for gene_id in island:
                # Pull each hit id tuple for this gene
                for hit in s_hits[hits_key][gene_id[1]]:
                    hit_set.add((hit[0], hit[1]))
            # add id tuples for hits to the island...
            island.update(hit_set)
            # ...and add the (now multi-species) island to groups_list
            groups_list.append(list(island))

    # Since each species' islands are built independently, there's a lot of
    # redundancy.  Collapse lists that contain shared elements and deduplicate.
    return [list(group) for group in collapse_lists(groups_list)]
def get_karyotype(some_gbk):
    """
    Convert Genbank file into Karyotype file for Circos

    - Each contig is a "chromosome"
    - format: 'chr - ID LABEL START END COLOR'

    One random RGB colour is drawn per file and used for all of its
    contigs.  Contigs are written in file order until the first one whose
    length is not above 1000, at which point writing stops.
    """
    contigs = []
    sp_name = None
    with open(some_gbk, 'r') as file_handle:
        for record in SeqIO.parse(file_handle, 'gb'):
            sp_name = kv.parse_genbank_name(some_gbk)
            contigs.append((record.name, len(record)))
    sp_strain = sp_name[2]

    karyotype_dir = 'circos/karyotypes/'
    if not os.path.isdir(karyotype_dir):
        os.makedirs(karyotype_dir)

    out_path = 'circos/karyotypes/karyotype_{}.txt'.format(os.path.basename(some_gbk)[:-13])
    line_template = 'chr - {0}{1} {2} {3} {4} {5},{6},{7}\n'
    with open(out_path, 'w+') as karyotype:
        red = np.random.randint(0, 255)
        green = np.random.randint(0, 255)
        blue = np.random.randint(0, 255)
        for contig_name, contig_length in contigs:
            if contig_length > 1000:
                karyotype.write(line_template.format(
                    sp_strain, contig_name, contig_name, '1', contig_length,
                    red, green, blue))
            else:
                # Stop at the first short contig
                break
def core_hgt_stats(perc_identity='99'):
    """
    Write per-species HGT statistics to `stats.csv`.

    For every species in the `core` collection one row is written with:
      - Total_CDS: number of records the species has in `core`
      - HGT_CDS:   number of genes with at least one hit under
                   `core_hits_<perc_identity>`
      - Islands:   number of islands from `get_islands` at the same
                   `perc_identity`

    Returns nothing.
    """
    collection = kv.get_collection('core')
    df_index = ['Total_CDS', 'HGT_CDS', 'Islands']
    df = pd.DataFrame()

    for species in collection.distinct('species'):
        hits = kv.get_collection('hits').find_one({'species': species})[
            'core_hits_{}'.format(perc_identity)]
        series = pd.Series(
            [
                sum(1 for _ in collection.find({'species': species})),
                sum(1 for gene in hits if hits[gene]),
                # Forward perc_identity so the island count matches the
                # hit count above instead of always using the 99% hits.
                len(get_islands(species, perc_identity=perc_identity)),
            ],
            name=species,
            index=df_index,
        )
        df = df.append(series)

    df.to_csv('stats.csv', columns=df_index)
def get_tree(core=False, newick=False): all_species = kv.get_collection('core').distinct('species') if core: pass else: all_species.extend(kv.get_collection('other').distinct('species')) t = tree.nj(dm) print t.ascii_art() tips = [] for node in t.tips(): print node.name, node.length tips.append(node.name.replace(' ', '_')) if newick: n = tree.nj(dm, result_constructor=str) print n else: return (t, tips)
def output_distance_matrix(core=False, to_file=True):
    """
    Compute the pairwise 16S distance matrix for all species.

    - `core=True` restricts the species to the `core` collection; otherwise
      species from `other` are included as well.
    - With `to_file=True` the matrix is written to `distance_matrix.csv`
      and nothing is returned; otherwise the DataFrame is returned.
    """
    # The original line ended in a stray ')' and did not parse
    all_species = kv.get_collection('core').distinct('species')
    if not core:
        all_species.extend(kv.get_collection('other').distinct('species'))

    distance_matrix = pd.DataFrame(data={n: 0.0 for n in all_species}, index=all_species)

    for pair in combinations_with_replacement(all_species, 2):
        distance = get_16S_distance(pair[0], pair[1])
        # .loc[row, column] writes into the frame itself; chained
        # `df[a][b] = x` may write to a temporary copy instead.
        distance_matrix.loc[pair[1], pair[0]] = distance
        distance_matrix.loc[pair[0], pair[1]] = distance

    if to_file:
        distance_matrix.to_csv('distance_matrix.csv')
    else:
        return distance_matrix
def make_species_fasta(species):
    """
    Write every record of the `species` collection to `fastas/<species>.fna`.

    Headers have the form `>species|_id`, with spaces in the species name
    replaced by underscores.  Nothing is written when the fasta file
    already exists.
    """
    if not os.path.isdir('fastas'):
        os.makedirs('fastas')

    fasta = 'fastas/{}.fna'.format(species)
    if os.path.isfile(fasta):
        return

    with open(fasta, 'w+') as output_handle:
        for record in kv.get_collection(species).find():
            species_label = record['species'].replace(' ', '_')
            output_handle.write(
                ">{}|{}\n{}\n".format(species_label, record['_id'], record['dna_seq'])
            )
def output_all_16S(): print "Making fasta of all 16S in database {}".format(kv.db.name) with open('{}_16S.fna'.format(kv.db.name), 'w+') as output_handle: for record in kv.get_collection('16S').find(): output_handle.write( '>{0}\n{1}\n'.format( record['species'], record['dna_seq'], ) )
def blast_to_db(db='core', perc_identity='99'):
    """
    Parse BLAST XML results and store the accepted hits in the `hits`
    collection.

    Reads every file in `blast_results/<db>/` whose name ends with
    `<perc_identity>_blast.xml`.  For each query, hits are kept unless they
    come from the same species, the same genus, or a record of type '16S'.
    The kept hits are stored under the key `<db>_hits_<perc_identity>` of
    the query species' document (created if missing).
    """
    blast_dir = 'blast_results/{}/'.format(db)
    for f in os.listdir(blast_dir):
        if f.endswith('{}_blast.xml'.format(perc_identity)):
            # Despite its name this is the path of the result file, not a handle
            file_handle = 'blast_results/{}/{}'.format(db,f)
            with open(file_handle, 'r') as result_handle:
                blast_records = NCBIXML.parse(result_handle)
                # query id -> list of (hit species, hit id)
                hits_dict = {}
                for blast_record in blast_records:
                    # Headers are expected to look like `name|id`, with the
                    # genus being the leading letters before the first '_'
                    query_parse = re.search(r'(\w+)\|(\w+)', blast_record.query)
                    query_genus_parse = re.match(r'([A-Za-z]+)_', blast_record.query)
                    query_genus = query_genus_parse.group(1)
                    query_name = query_parse.group(1)
                    query_id = query_parse.group(2)
                    hits_dict[query_id] = []
                    for alignment in blast_record.alignments:
                        hit_parse = re.search(r'(\w+)\|(\w+)', alignment.hit_def)
                        hit_genus_parse = re.match(r'([A-Za-z]+)_', alignment.hit_def)
                        hit_genus = hit_genus_parse.group(1)
                        hit_name = hit_parse.group(1)
                        hit_id = hit_parse.group(2)
                        if query_name == hit_name:
                            # Hit within the same species: ignored
                            pass
                        elif query_genus == hit_genus:
                            print "Oops! {} and {} are the same genus, skipping...".format(query_name, hit_name)
                            pass
                        elif kv.get_mongo_record(hit_name, hit_id)['type'] == '16S':
                            print 'Skipping 16S hit'
                        else:
                            print '=======\nhit for {0} detected:\nspecies: {1}\n======='.format(query_name, hit_name)
                            hits_dict[query_id].append((hit_name, hit_id))
                # NOTE(review): the original indentation was lost; the update
                # is assumed to run once per result file, after all of its
                # records have been read, using the last `query_name` seen —
                # confirm.  A file without records would leave `query_name`
                # unbound here.
                print 'Updataing mongoDB with hits'
                hits_collection = kv.get_collection('hits')
                # Only queries with at least one kept hit are stored
                hits_collection.update_one(
                    {'species':query_name},
                    {'$set':{'{}_hits_{}'.format(db, perc_identity):{x:hits_dict[x] for x in hits_dict if hits_dict[x]}}},
                    upsert=True
                    )
def get_distance_matrix(core=False, to_file=True):
    """
    Compute the pairwise 16S distance matrix for species that have a 16S
    record.

    - `core=True` restricts the species to the `core` collection; otherwise
      species from `other` are included as well.
    - Cells keep their initial 0.0 when `get_16S_distance` returns a falsy
      value.
    - With `to_file=True` the matrix is written to `distance_matrix.csv`
      and nothing is returned; otherwise the DataFrame is returned.
    """
    species_pool = kv.get_collection('core').distinct('species')
    if not core:
        species_pool.extend(kv.get_collection('other').distinct('species'))

    # Keep only the species for which a 16S record exists
    ssu_species = []
    for name in species_pool:
        if kv.db['16S'].find_one({'species': name}):
            ssu_species.append(name)

    distance_matrix = pd.DataFrame(
        data=dict((name, 0.0) for name in ssu_species),
        index=ssu_species,
        columns=ssu_species,
    )

    for first, second in combinations_with_replacement(ssu_species, 2):
        distance = get_16S_distance(first, second)
        if not distance:
            continue
        distance_matrix[first][second] = distance
        distance_matrix[second][first] = distance

    if not to_file:
        return distance_matrix
    distance_matrix.to_csv('distance_matrix.csv')
def get_links(group=None, perc_identity='99'): hits_collection = kv.get_collection('hits') group_hits = None if not os.path.isdir('circos/links/'): os.makedirs('circos/links/') out_name = 'circos/links/all_links_{}.txt'.format(perc_identity) if group: groups = core_hgt_groups() group_hits = sorted(groups, key=len, reverse=True)[group - 1] out_name = 'circos/links/group{}_links_{}.txt'.format(group, perc_identity) with open(out_name, 'w+') as out_handle: for species in hits_collection.find(): print species try: all_hits = species['core_hits_{}'.format(perc_identity)] hits_to_write = None if group: hits_to_write = {gene:all_hits[gene] for gene in all_hits if (species['species'], gene) in group_hits} else: hits_to_write = all_hits for gene in hits_to_write: if hits_to_write[gene]: s1_record = kv.get_mongo_record(species['species'], gene) s1_strain = kv.parse_species_name(species['species']) for hit in hits_to_write[gene]: s2_record = kv.get_mongo_record(hit[0], hit[1]) s2_strain = kv.parse_species_name(hit[0]) out_handle.write('{0}kvc_{1} {2} {3} {4}kvc_{5} {6} {7}\n'.format( s1_strain[2], s1_record['location']['contig'], s1_record['location']['start'], s1_record['location']['end'], s2_strain[2], s2_record['location']['contig'], s2_record['location']['start'], s2_record['location']['end'], ) ) except KeyError: pass
def get_islands(species_name, perc_identity='99'):
    """
    For each species, combines HGT hits co-occurring within 5kb of each other

    Hits are read from the `core_hits_<perc_identity>` entry of the species'
    document in the `hits` collection.  Two hit genes are paired when they
    are on the same contig and the end of one is at most 5000 from the
    start of the other; a gene with no partner forms an island by itself.

    Returns the result of `collapse_lists` applied to the list of lists of
    `(species, _id)` tuples.
    """
    hits_collection = kv.get_collection('hits')
    species_doc = hits_collection.find_one({'species': species_name})
    species_hits = species_doc['core_hits_{}'.format(perc_identity)]

    # Mongo record of every gene that has at least one hit
    hit_records = [
        kv.get_mongo_record(species_name, query_id)
        for query_id in species_hits
        if species_hits[query_id]
    ]

    def tag(record):
        # `(species, _id)` identifier used in the island lists
        return (record['species'], str(record['_id']))

    islands = []
    for first in hit_records:
        paired = False
        for second in hit_records:
            if first == second:
                continue
            if first['location']['contig'] != second['location']['contig']:
                continue
            gap = abs(first['location']['end'] - second['location']['start'])
            if gap <= 5000:
                paired = True
                islands.append([tag(first), tag(second)])
        if not paired:
            islands.append([tag(first)])

    return collapse_lists(islands)
def group_hits(core=False): all_species = kv.get_collection('core').distinct('species') if not core: all_species.extend(kv.get_collection('other').distinct('species')) hits_db = kv.get_collection('hits') species_index = sorted(all_species) print species_index df = pd.DataFrame() core_groups = sorted(core_hgt_groups(), key=len, reverse=True) for group in sorted(hits_db.distinct('group')): recorded = [] s = {sp:0.0 for sp in species_index} for hit in core_groups[group-1]: if not hit in recorded: s[hit[0]] += len(kv.get_mongo_record(*hit)['dna_seq']) recorded.append(hit) for hit in hits_db.find_one({'group':group})['group_hits']: if float(hit[2]) > 90 and float(hit[3]) > 100: if hit[1] not in recorded: s[kv.fasta_id_parse(hit[1])[0]] += float(hit[2])*float(hit[3])/100 recorded.append(hit[1]) s = pd.Series(s, name='group_{}'.format(group)) df['group_{}'.format(group)] = s df.to_csv('group_hits_other.csv') # if __name__ == '__main__': # import os # kv.mongo_init('pacbio2') # os.chdir('/Users/KBLaptop/computation/kvasir/data/output/pacbio2/') # # group_hits(core=True) # # output_groups() # # core_hgt_stats() # output_hits_csv()
def get_tree(core=False, newick=False): core_collection = kv.get_collection('core') all_species = core_collection.distinct('species') if core: pass else: other_collection = kv.get_collection('other') all_species.extend(other_collection.distinct('species')) ssu_species = [n for n in all_species if kv.db['16S'].find_one({'species':n})] dm = DistanceMatrix(get_distance_matrix(core=core, to_file=False), ssu_species) t = tree.nj(dm) print t.ascii_art() tips = [] for node in t.tips(): print node.name, node.length tips.append(node.name.replace(' ', '_')) if newick: n = tree.nj(dm, result_constructor=str) print n else: return (t, tips)
def all_by_all(species_1, species_2):
    """
    Read the pairwise BLAST XML result for two species and return the
    accepted hits.

    The result file is expected at
    `pairwise_blast/<species_1>_<species_2>-blast_results.xml`.

    Returns a list of `(query_record, alignment_record)` tuples, both
    fetched with `kv.get_mongo_record`.
    """
    # results = fasta_blast(species_1, species_2)
    results = 'pairwise_blast/{}_{}-blast_results.xml'.format(species_1, species_2)
    # NOTE(review): `results` is always a non-empty string here, so this
    # test is always true and the `else` branch below cannot run.
    if results:
        with open(results, 'r') as result_handle:
            blast_records = NCBIXML.parse(result_handle)
            hits_list = []
            for blast_record in blast_records:
                qsp, qid = kv.fasta_id_parse(blast_record.query)
                query_record = kv.get_mongo_record(qsp, qid)
                for alignment in blast_record.alignments:
                    asp, aid = kv.fasta_id_parse(alignment.hit_def)
                    alignment_record = kv.get_mongo_record(asp, aid)
                    for hsp in alignment.hsps:
                        if hsp.align_length > 100:
                            # pident and length are computed but not used afterwards
                            pident = float(hsp.positives)/float(hsp.align_length)
                            length = hsp.align_length
                            hits_list.append((query_record, alignment_record))
                            # NOTE(review): the original indentation was lost;
                            # this break is assumed to end the HSP loop at the
                            # first HSP longer than 100 — confirm.
                            break
                    # NOTE(review): assumed to end the alignment loop after the
                    # first alignment of each query — confirm.
                    break
            return hits_list
    else:
        print "Blast didn't work for some reason"
def output_groups_by_species(min_group_size=2):
    """
    Write `groups_by_species.csv`, a presence matrix of species per group.

    Rows are labelled '1'..'N' for the N groups from `get_groups()` and
    columns are the names from `kv.get_species_collections()`.  Groups are
    visited from largest to smallest; each group with at least
    `min_group_size` members marks its species with 1 in the next row.
    """
    all_species = kv.get_species_collections()
    groups_list = get_groups()
    groups_list.sort(key=len, reverse=True)

    row_labels = [str(number + 1) for number in range(0, len(groups_list))]
    groups_df = pd.DataFrame(data=dict((n, 0) for n in all_species), index=row_labels)

    group_no = 0
    for group in groups_list:
        if len(group) < min_group_size:
            continue
        group_no += 1
        row = group_no - 1
        species_in_group = [member[0] for member in group]
        for species in species_in_group:
            groups_df[species][row] = 1

    groups_df.to_csv('groups_by_species.csv')
def output_groups(output_file='default', min_group_size=2):
    """
    Write a CSV with one row per gene of every group from `get_groups()`.

    - `output_file`: path of the CSV; 'default' means `groups.csv`
    - `min_group_size`: groups with fewer members are left out

    Groups are numbered from 1 in order of decreasing size (zero-padded to
    three digits in the `groups` column).
    """
    if output_file == 'default':
        # The original called .format(kv.db.name) on this literal, which has
        # no placeholder, so the call had no effect on the file name.
        output_file = 'groups.csv'

    df_index = [
        'groups',
        'species',
        'kvtag',
        'contig',
        'start',
        'end',
        'strand',
        'annotation',
        'dna_seq',
    ]
    df = pd.DataFrame()
    group_no = 0
    groups_list = get_groups()
    groups_list.sort(key=len, reverse=True)

    for group in groups_list:
        if len(group) < min_group_size:
            continue
        group_no += 1
        # Entry is `(species, id)`
        for entry in group:
            db_handle = kv.get_mongo_record(*entry)
            # Commas are stripped so the annotation cannot break the CSV
            annotation = db_handle['annotation'].replace(',', '')
            series = pd.Series(
                [
                    str(group_no).zfill(3),
                    db_handle['species'],
                    db_handle['kvtag'],
                    db_handle['location']['contig'],
                    db_handle['location']['start'],
                    db_handle['location']['end'],
                    db_handle['location']['strand'],
                    annotation,
                    db_handle['dna_seq'],
                ],
                index=df_index,
                name=db_handle['kvtag'],
            )
            df = df.append(series)

    df.to_csv(output_file, columns=df_index)
def add_ssu(supp_file): # df = pd.read_csv(supp_file) # print df.columns # new_df = pd.DataFrame() # # for i in range(len(df['Strain'])): # # print df['Strain'][i].replace(' ', '_').replace('.', '') # strain = pd.Series([df['Strain'][i].replace(' ', '_').replace('.', '') for i in range(len(df['Strain']))], name='strain') # ssus = df['sequences of the 16s rRNA genes'] # ssu = pd.Series([ssus[i].replace(r'\n', '') if not pd.isnull(ssus[i]) else None for i in range(len(ssus))], name='16S') # new_df['strain'] = strain # new_df['16S'] = ssu # new_df.to_csv('ssu.csv') ssu_df = pd.read_csv(supp_file) for i in range(len(ssu_df['strain'])): print ssu_df['strain'][i], ssu_df['16S'][i] if not pd.isnull(ssu_df['16S'][i]): gene_record = { 'species':ssu_df['strain'][i], 'location':{ 'contig':None, 'start':None, 'end':None, 'strand':None, }, 'annotation':'Small subunit ribosomal RNA', 'dna_seq':ssu_df['16S'][i], 'kvtag':None, 'type':'16S' } print "adding 16S gene!" kv.get_collection('16S').remove({'species':ssu_df['strain'][i]}) print kv.get_collection('16S').find_one({'species':ssu_df['strain'][i]}) kv.get_collection('16S').insert_one(gene_record) print kv.get_collection('16S').find_one({'species':ssu_df['strain'][i]})
def output_groups(min_group_size=2):
    """
    Writes a .csv file with information for each CDS in an HGT group

    - Optional: set minimum number of CDS to be considered a group

    The file is named `<db name>_groups.csv`.  Groups come from
    `core_hgt_groups()`, are numbered from 1 in order of decreasing size,
    and their members are sorted by species before being written.  Rows are
    labelled with the species name.
    """
    output_file = '{}_groups.csv'.format(kv.db.name)
    df_index = ['group','kvtag','contig','start','end','strand','annotation','dna_seq']
    df = pd.DataFrame()

    groups_list = core_hgt_groups()
    groups_list.sort(key=len, reverse=True)

    group_no = 0
    for group in groups_list:
        if len(group) < min_group_size:
            continue
        group.sort(key=lambda member: member[0])
        group_no += 1
        for member in group:  # member is `(species, id)`
            record = kv.get_mongo_record(*member)
            # Stripping commas keeps the annotation from breaking the CSV
            annotation = record['annotation'].replace(',','')

            row = [str(group_no).zfill(3), record['kvtag']]
            row.extend(record['location'][field]
                       for field in ('contig', 'start', 'end', 'strand'))
            row.extend([annotation, record['dna_seq']])

            df = df.append(pd.Series(row, index=df_index, name=record['species']))

    df.to_csv(output_file, columns=df_index)
# NOTE(review): this loop repeats the loop body of `other_blast` and reads
# `groups_list`, which is not assigned anywhere visible before it; it looks
# like the tail of a function whose `def` line is missing — confirm.
for i in range(len(groups_list)):
    group_hits = []
    kv.make_id_list_fasta(groups_list[i], 'core')
    results = blast_vs_db('tmp.fna', 'blast_databases/other')
    hits_collection = kv.get_collection('hits')
    if results:
        for j in range(len(results)):
            group_hits.append(results[j])
    # NOTE(review): the original indentation was lost; the insert is assumed
    # to run for every group rather than only inside `if results:` — confirm.
    hits_collection.insert_one({'group':(i+1), 'group_hits':group_hits})

# Script entry point: rebuilds the BLAST databases, clears the `hits`
# collection, then blasts and stores hits at 90, 95 and 99 percent identity
# before blasting the groups against the `other` database.
if __name__ == '__main__':
    import sys
    # Database name and working directory are hard-coded; the commented
    # lines below take them from the command line instead.
    kv.mongo_init('pacbio2')
    os.chdir('/Users/KBLaptop/computation/kvasir/data/output/pacbio2/')
    # kv.mongo_init(sys.argv[1])
    # os.chdir('output/{}/'.format(sys.argv[1]))
    make_blast_db('core')
    make_blast_db('other')
    hits_reset()
    hgt_blast(perc_identity='90')
    hgt_blast(perc_identity='95')
    hgt_blast(perc_identity='99')
    blast_to_db(perc_identity='90')
    blast_to_db(perc_identity='95')
    blast_to_db(perc_identity='99')
    other_blast()
def make_blast_db(source, name=None, remove_source=True): """ Produces BLAST database from `source` Optional - provide name (defaults to `source`) Set remove_source=False to keep fasta file (if created) Source types: - fasta file (use path, must end with `.fna`) - Mongo collection (use name of collection) - list of dicts containing at least keys `species`, `_id`, `dna_seq` - Mongo cursor eg. `collection.find({'key':value})` """ # If there's no directory for blast db's, create one if not os.path.isdir('blast_databases/'): os.makedirs('blast_databases/') output_fasta = None if os.path.isfile(source): # Input is fasta file? if source.endswith('.fna'): output_fasta = source if not name: name = os.path.basename(source)[:-4] remove_source = False else: print "Not a valid file type, use .fna" else: output_fasta = '{0}_all.fasta'.format(kv.db.name) genes = None with open(output_fasta, 'w+') as output_handle: if source in kv.get_collections(): genes = kv.get_collection(source).find() if not name: name = source elif type(source) == list: genes = source elif type(source) == Cursor: genes = source for gene in genes: output_handle.write('>{0}|{1}\n{2}\n'.format( gene['species'], gene['_id'], gene['dna_seq'], ) ) while not name: name = str(raw_input("enter name for BLAST database: ")) # calls makeblastdb from shell print "making a database!" Popen( ['makeblastdb', '-in', output_fasta, '-dbtype', 'nucl', '-out', 'blast_databases/{0}'.format(name), '-title', name, ] ).wait() # waits for this operation to terminate before moving on if remove_source: os.remove(output_fasta)
def hits_reset():
    """Remove the `hits` collection by calling `kv.remove_collection('hits')`."""
    kv.remove_collection('hits')