def main(args):
    print_abfinder_start()
    db = mongodb.get_db(args.db, args.ip, args.port, args.user, args.password)
    make_directories(args)
    standards = get_standards(args)
    print_standards_info(standards)
    collections = mongodb.get_collections(db, args.collection, prefix=args.collection_prefix)
    print_collections_info(collections)
    for collection in collections:
        indexed = False
        print_single_collection(collection)
        if args.remove_padding:
            print_remove_padding()
            mongodb.remove_padding(db, collection)
        seq_files = get_sequences(db, collection, args.temp_dir, args)
        for standard in standards:
            print_single_standard(standard)
            scores = run_jobs(seq_files, standard, args)
            if args.output_dir:
                make_figure(standard.id, scores, collection, args)
            if args.update:
                if not indexed:
                    mongodb.index(db, collection, 'seq_id')
                    indexed = True
                update_db(db, standard.id, scores, collection, args)
        clean_up(seq_files)
def main(args):
    db = mongodb.get_db(args.db, args.ip, args.port, args.user, args.password)
    print_method(args.method)
    pairs = get_collection_pairs(db, args)
    index_collections(db, pairs)
    prev1 = None
    scores = {}
    cscores = {}
    for pair in pairs:
        s1, s2 = pair
        curr1 = s1
        if prev1 != curr1:
            print_collection_info(s1)
            s1_all_vgenes = get_vgenes(db, s1, args.chain)
        print_pair_info(s1, s2)
        s1_vgenes, s2_vgenes = get_vgenes(db, s2, args.chain, prev_data=s1_all_vgenes)
        logger.info('')
        logger.info('Calculating similarities...')
        median, counts, bins, similarities = calculate_similarities(s1_vgenes, s2_vgenes, args)
        write_output(s1, s2, median, counts, bins, similarities, args)
        scores = update_scores(s1, s2, median, scores)
        if args.control_similarity:
            logger.info('')
            logger.info('Calculating control similarities...')
            cmedian, ccounts, cbins, csimilarities = calculate_control_similarities(s1_vgenes, s2_vgenes, args)
            write_output(s1, s2, cmedian, ccounts, cbins, csimilarities, args)
            cscores = update_scores(s1, s2, cmedian, cscores)
        prev1 = s1
    print_final_results(scores)
    print_final_results(cscores, control=True)
def main(args):
    _print_start_info(args)
    if args.sleep:
        countdown(args)
    for d in [args.output, args.temp_dir]:
        make_dir(d)
    if args.consensus and args.germs:
        germs = parse_germs(args.germs)
    else:
        germs = args.germs
    # check whether JSON files have been passed
    if args.json is not None and all([args.db is None, args.collection is None]):
        if os.path.isfile(args.json) and args.json.endswith('.json'):
            collections = [args.json, ]
        else:
            collections = list_files(args.json, extension='json')
        db = None
        sample_names = [os.path.basename(c).replace('.json', '') for c in collections]
    # check whether MINIMAL files have been passed
    elif args.minimal_input is not None and all([args.db is None, args.collection is None]):
        if os.path.isfile(args.minimal_input) and args.minimal_input.endswith('.txt'):
            collections = [args.minimal_input, ]
        else:
            collections = list_files(args.minimal_input, extension='txt')
        db = None
        sample_names = [os.path.basename(c).replace('.txt', '') for c in collections]
    # otherwise, get sequences from MongoDB
    else:
        db = mongodb.get_db(args.db, args.ip, args.port, args.user, args.password)
        collections = mongodb.get_collections(db, collection=args.collection)
        sample_names = collections
    for collection, sample_name in zip(collections, sample_names):
        collection_start = time.time()
        print_collection_info(collection, sample_name)
        if args.non_redundant:
            seqs = get_seqs(db, collection, args, make_seq_db=False)
            # sort-based deduplication (an illustrative sketch follows this function)
            unique_file = unix_sort_unique(seqs, args)
            write_nr_output(collection, unique_file, collection_start, args)
        else:
            seq_db_path = get_seqs(db, collection, args)
            initial_clusters = initial_clustering(seq_db_path, args)
            if args.min_seqs == 1:
                singletons = [ic for ic in initial_clusters if ic.size == 1]
                initial_clusters = [ic for ic in initial_clusters if ic.size > 1]
                logger.info('{} clusters contained only a single sequence. Processing singletons...'.format(len(singletons)))
                singleton_consentroids = process_singleton_clusters(singletons, seq_db_path, args)
                logger.info('')
            else:
                singleton_consentroids = []
            consentroids = process_initial_clusters(initial_clusters, seq_db_path, args)
            consentroids += singleton_consentroids
            sequences, sizes = zip(*consentroids)
            write_output(sample_name, sequences, sizes, collection_start, args)
            for ic in initial_clusters:
                ic.cleanup()
            remove_sqlite_db(args)
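# Illustrative sketch only: the non-redundant path in main() above calls a
# unix_sort_unique() helper that is defined elsewhere in this module. The
# standalone function below shows one plausible way to do sort-based
# deduplication with coreutils `sort -u`; the function name, arguments, and
# file handling here are hypothetical and are not claims about the real helper.
import subprocess
import tempfile

def _sort_unique_sketch(sequences, temp_dir=None):
    '''Write one sequence per line, deduplicate with `sort -u`, and return the output path.'''
    in_file = tempfile.NamedTemporaryFile(mode='w', dir=temp_dir, delete=False, suffix='.txt')
    in_file.write('\n'.join(sequences))
    in_file.close()
    out_path = in_file.name + '.unique'
    with open(out_path, 'w') as out_handle:
        subprocess.run(['sort', '-u', in_file.name], stdout=out_handle, check=True)
    return out_path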
def main(args):
    db = mongodb.get_db(args.db, ip=args.ip, port=args.port, user=args.user, password=args.password)
    for collection in mongodb.get_collections(db):
        print_collection_info(collection)
        seqs = query(db, collection, args.chain)
        if len(seqs) == 0:
            continue
        germline_plot(seqs, 'V', collection, args.output, args.var_plot, args.species, args.chain)
        if args.chain == 'heavy':
            germline_plot(seqs, 'D', collection, args.output, args.div_plot, args.species, args.chain)
        germline_plot(seqs, 'J', collection, args.output, args.join_plot, args.species, args.chain)
        cdr3_plot(seqs, collection, args.cdr3_plot, args.chain, args.output)
        vj_heatmap(seqs, collection, args.heatmap, args.species, args.chain, args.output)
def update(db, collection, data, standard, version, args):
    db = mongodb.get_db(args.db, args.ip, args.port, args.user, args.password)
    coll = db[collection]
    score = data[0]
    ids = data[1]
    mab_id_field = 'mab_identity_aa' if args.is_aa else 'mab_identity_nt'
    if int(version.split('.')[0]) < 3:
        # legacy update() call, used for MongoDB servers older than 3.0
        result = coll.update({'seq_id': {'$in': ids}},
                             {'$set': {'{}.{}'.format(mab_id_field, standard.lower()): float(score)}},
                             multi=True)
    else:
        result = coll.update_many({'seq_id': {'$in': ids}},
                                  {'$set': {'{}.{}'.format(mab_id_field, standard.lower()): float(score)}})
    if args.debug:
        print('matched: {}'.format(result.matched_count))
        print('modified: {}'.format(result.modified_count))
def update_db(db, standard, scores, collection, args):
    db = mongodb.get_db(args.db, args.ip, args.port, args.user, args.password)
    print_index_info()
    mongodb.index(db, collection, ['seq_id'])
    print_update_info()
    start = time.time()
    conn = mongodb.get_connection(args.ip, args.port, args.user, args.password)
    mongo_version = conn.server_info()['version']
    standard = standard.replace('.', '_')
    # group sequences by identity score, then dispatch each batch of groups
    # to threaded update() calls (a bulk_write-based alternative is sketched
    # after this function)
    g = scores.groupby('identity')
    groups = regroup(g.groups)
    for i in range(0, len(groups), args.update_threads):
        tlist = []
        for group in groups[i:i + args.update_threads]:
            t = Thread(target=update, args=(db, collection, group, standard, mongo_version, args))
            t.start()
            tlist.append(t)
        for t in tlist:
            t.join()
        progbar.progress_bar(i + args.update_threads, len(groups))
    # if platform.system().lower() == 'darwin' or args.debug or args.single_process_update:
    #     for i, group in enumerate(groups):
    #         update(db, collection, group, standard, mongo_version, args)
    #         progbar.progress_bar(i, len(groups))
    # else:
    #     p = mp.Pool(processes=25)
    #     async_results = []
    #     for group in groups:
    #         async_results.append(p.apply_async(update, args=(db, collection, group, standard, mongo_version, args)))
    #     monitor_update(async_results)
    #     p.close()
    #     p.join()
    print('')
    run_time = time.time() - start
    logger.info('Updating took {} seconds. ({} sequences per second)'.format(round(run_time, 2),
                                                                              round(len(scores) / run_time, 1)))
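# Illustrative sketch only (not called by update_db above): the per-group
# updates that update_db dispatches to update() could instead be batched into
# a single unordered bulk_write() call per collection. This assumes the
# regrouped data matches what update() consumes -- (score, list_of_seq_ids)
# pairs -- and the id_field/standard values below are hypothetical examples.
from pymongo import UpdateMany

def _bulk_update_sketch(coll, groups, standard, id_field='mab_identity_nt'):
    '''Batch all per-score identity updates into one unordered bulk_write() request.'''
    requests = [UpdateMany({'seq_id': {'$in': ids}},
                           {'$set': {'{}.{}'.format(id_field, standard): float(score)}})
                for score, ids in groups]
    result = coll.bulk_write(requests, ordered=False)
    return result.matched_count, result.modified_count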