def main_loop():
    # Connect to MongoDB and fetch the list of live records to crawl.
    db_connect()
    record_list = get_live_record_list(92613)
    info_col = get_info_col(db.mongo_client)
    print("Record list count: {}".format(len(record_list)))
    for record in record_list:
        # Skip records that have already been crawled.
        if not info_col.count_documents({"rid": record.rid}):
            print("Start crawling record {}".format(record.rid))
            try:
                save_record(record.rid)
            except Exception as e:
                print("Failed to crawl record, rid is {}: {}".format(record.rid, e))
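# Note: main_loop() above tests for an already-crawled record with
# info_col.count_documents({"rid": record.rid}) on every iteration, so an index
# on "rid" keeps that existence check from scanning the whole collection.
# A minimal sketch, assuming the same collection handle returned by
# get_info_col(); this helper is not part of the original module.
def ensure_rid_index(info_col):
    # create_index is a no-op if an index on "rid" already exists.
    info_col.create_index("rid")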
def db_ssea_import(ssea_dir, matrix_dir, name, host):
    sample_sets_json_file = os.path.join(ssea_dir, 'sample_set.json')
    ss_name = json.loads(open(sample_sets_json_file).read())['name']
    colls = db_connect(name, host)
    ss = colls['sample_sets']
    ss_check = ss.find_one({'name': ss_name})
    if ss_check is not None:
        logging.info('Sample set \'%s\' already in database' % ss_name)
    else:
        logging.info('importing sample set \'%s\' to %s database on mongo server: %s' % (ss_name, name, host))
        # logging.debug("Importing sample_set file")
        # get ss_id by checking current number of sample sets in database
        ss_id = str(ss.count())
        _ssea_path = ssea.__path__[0]
        _merge_path = os.path.join(_ssea_path, 'utils/mongo_ssea_printJSON.py')
        p1 = subprocess.Popen(['python', _merge_path, ssea_dir, matrix_dir, ss_id, '-s'],
                              stdout=subprocess.PIPE)
        p2 = subprocess.Popen(['mongoimport', '-c', 'sample_sets', '--host', host, '-d', name, '--upsert'],
                              stdin=p1.stdout)
        p1.wait()
        p2.wait()
        # mark the new sample set as 'TMP' until every collection finishes importing
        ss.update({'_id': int(ss_id)}, {'$set': {'name': 'TMP'}})
        # logging.info("Importing config file")
        p1 = subprocess.Popen(['python', _merge_path, ssea_dir, matrix_dir, ss_id, '-c'],
                              stdout=subprocess.PIPE)
        p2 = subprocess.Popen(['mongoimport', '-c', 'configs', '--host', host, '-d', name, '--upsert'],
                              stdin=p1.stdout)
        p1.wait()
        p2.wait()
        # logging.info("Importing results file")
        p1 = subprocess.Popen(['python', _merge_path, ssea_dir, matrix_dir, ss_id, '-r'],
                              stdout=subprocess.PIPE)
        p2 = subprocess.Popen(['mongoimport', '-c', 'results', '--host', host, '-d', name, '--upsert'],
                              stdin=p1.stdout)
        p1.wait()
        p2.wait()
        # logging.info("Importing hists file")
        p1 = subprocess.Popen(['python', _merge_path, ssea_dir, matrix_dir, ss_id, '--hist'],
                              stdout=subprocess.PIPE)
        p2 = subprocess.Popen(['mongoimport', '-c', 'hists', '--host', host, '-d', name, '--upsert'],
                              stdin=p1.stdout)
        p1.wait()
        p2.wait()
        # logging.info("Creating merge collection")
        _merge_path = os.path.join(_ssea_path, 'utils/mongo_merge_printJSON.py')
        p1 = subprocess.Popen(['python', _merge_path, '--ss_id', ss_id, '--host', host, '--name', name],
                              stdout=subprocess.PIPE)
        p2 = subprocess.Popen(['mongoimport', '-c', 'merged', '--host', host, '-d', name, '--upsert'],
                              stdin=p1.stdout)
        p1.wait()
        p2.wait()
        # restore the real sample set name now that the import is complete
        ss.update({'_id': int(ss_id)}, {'$set': {'name': ss_name}})
        logging.info("Finished importing \'%s\'" % ss_name)
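# db_ssea_import() keeps the sample set named 'TMP' until all five collections
# (sample_sets, configs, results, hists, merged) have been loaded, then renames
# it; db_repair() below uses that 'TMP' marker to detect half-finished imports.
# A hypothetical usage sketch: the directories are placeholders (not from the
# original module), while the database name and host match the defaults exposed
# by the command-line interface further down.
def _example_import():
    db_ssea_import(ssea_dir='/path/to/ssea_output',
                   matrix_dir='/path/to/expression_matrix',
                   name='compendia',
                   host='localhost:27017')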
def db_delete_ss(name, host, ss_id):
    colls = db_connect(name, host)
    sample_sets = colls['sample_sets']
    configs = colls['configs']
    results = colls['results']
    hists = colls['hists']
    merged = colls['merged']
    ss_name = sample_sets.find_one({'_id': ss_id})['name']
    # remove the sample set and all documents associated with it
    logging.info('Removing sample set \'%s\'' % ss_name)
    sample_sets.remove({'_id': ss_id})
    configs.remove({'_id': ss_id})
    hists.remove({'_id': ss_id})
    results.remove({'ss_id': ss_id})
    merged.remove({'ss_id': ss_id})
def db_repair(name, host):
    colls = db_connect(name, host)
    sample_sets = colls['sample_sets']
    configs = colls['configs']
    results = colls['results']
    hists = colls['hists']
    merged = colls['merged']
    # check for incomplete imports (sample sets still named 'TMP')
    logging.info('Removing incomplete imports')
    tmps = sample_sets.find({'name': 'TMP'})
    tmp_ids = []
    for ss in tmps:
        tmp_ids.append(ss['_id'])
    sample_sets.remove({'_id': {'$in': tmp_ids}})
    configs.remove({'_id': {'$in': tmp_ids}})
    hists.remove({'_id': {'$in': tmp_ids}})
    results.remove({'ss_id': {'$in': tmp_ids}})
    merged.remove({'ss_id': {'$in': tmp_ids}})
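# Hypothetical maintenance sketch; the database name and host are the defaults
# exposed by the command-line interface below, and the sample-set _id is a
# placeholder value, not taken from the original module.
def _example_maintenance():
    # Drop any sample sets left in the 'TMP' state by an interrupted import.
    db_repair(name='compendia', host='localhost:27017')
    # Remove one specific sample set (and its configs, results, hists and
    # merged documents) by its integer _id.
    db_delete_ss(name='compendia', host='localhost:27017', ss_id=3)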
def main(argv=None):
    '''Command line options.'''
    # Setup argument parser
    parser = argparse.ArgumentParser()
    # Add command line parameters
    parser.add_argument("-n", "--name", dest='name', default='compendia',
                        help='name for ssea run (will be name of database)')
    parser.add_argument("--host", dest='host', default='localhost:27017',
                        help='name of mongodb server to connect to')
    parser.add_argument("--ss_id", dest='ss_id',
                        help='mongo _id number of sample set being merged')
    # Process arguments
    args = parser.parse_args()
    # setup logging
    level = logging.DEBUG
    logging.basicConfig(level=level,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    colls = db_connect(args.name, args.host)
    transcripts = colls['transcripts']
    results = colls['results']
    # parse through transcript metadata and build dict to be used during merge
    trans_dict = collections.defaultdict(lambda: {})
    logging.info('Parsing through transcript metadata to prepare merge')
    tot = transcripts.find().count()
    i = 0
    for x in transcripts.find():
        i += 1
        if (i % 50000) == 0:
            logging.debug('Finished %d/%d' % (i, tot))
        key = x['_id']
        # create a dict placeholder for this _id element
        id_dict = {}
        for field in fields_trans:
            id_dict[field] = x[field]
        # create a combined locus and strand field
        locus = x['locus']
        strand = x['strand']
        new_loc = locus + '(' + strand + ')'
        id_dict['loc_strand'] = new_loc
        trans_dict[key] = id_dict
    ss_id = int(args.ss_id)
    # print merged json
    tot = results.find({'ss_id': ss_id}).count()
    logging.info('Merging transcript metadata and results fields (%d total merged documents)' % tot)
    fields_results.append('_id')
    for x in results.find({'ss_id': ss_id}):
        # create another dict placeholder to be printed as JSON
        dict = {}
        for field in fields_results:
            if field == '_id':
                dict[field] = str(x[field])
            else:
                dict[field] = x[field]
        id = x['t_id']
        trans_meta = trans_dict[id]
        for key in trans_meta.iterkeys():
            dict[key] = trans_meta[key]
        print json.dumps(dict)
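# The merge script above targets Python 2 and an older PyMongo (print
# statement, dict.iterkeys(), cursor.count()). A rough Python 3 sketch of the
# final merge loop, assuming the same fields_results list (with '_id' already
# appended) and the trans_dict mapping built above; this is an illustrative
# equivalent, not part of the original code.
def _print_merged_docs_py3(results, ss_id, fields_results, trans_dict):
    # count_documents replaces the removed cursor.count()
    tot = results.count_documents({'ss_id': ss_id})
    logging.info('Merging %d documents', tot)
    for x in results.find({'ss_id': ss_id}):
        doc = {field: (str(x[field]) if field == '_id' else x[field])
               for field in fields_results}
        # fold the pre-built transcript metadata into the result document
        doc.update(trans_dict[x['t_id']])
        print(json.dumps(doc))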