import logging
import os
import subprocess

import pymongo

import ssea

# fields_results and fields_trans are module-level lists of report and
# transcript field names, defined elsewhere in this module.
# ensure_index is the PyMongo 2.x API (replaced by create_index in 3.x).


def db_index(name, host):
    colls = db_connect(name, host)
    # index sample_sets collection
    logging.info('indexing sample_sets collection')
    ss = colls['sample_sets']
    ss.ensure_index('name')
    # index results collection
    logging.info('indexing results collection')
    results = colls['results']
    results.ensure_index([('t_id', pymongo.ASCENDING),
                          ('ss_id', pymongo.ASCENDING)])
    results.ensure_index([('ss_id', pymongo.ASCENDING),
                          ('nes', pymongo.ASCENDING)])
    # create indexes for the merged collection: a compound index on
    # ss_id + each other field in the reports
    client = pymongo.MongoClient(host)
    db = client[name]
    merged = db.merged
    for field in fields_results:
        if field != 'ss_id':
            merged.ensure_index([('ss_id', pymongo.ASCENDING),
                                 (field, pymongo.ASCENDING)])
            # three-item compound index used when a category filter is applied
            merged.ensure_index([('ss_id', pymongo.ASCENDING),
                                 ('category', pymongo.ASCENDING),
                                 (field, pymongo.ASCENDING)])
            # index supporting regex queries on gene names
            merged.ensure_index([('ss_id', pymongo.ASCENDING),
                                 (field, pymongo.ASCENDING),
                                 ('nearest_gene_names', pymongo.ASCENDING)])
    # build a local copy so repeated calls do not keep appending
    # 'loc_strand' to the module-level fields_trans list
    trans_fields = fields_trans + ['loc_strand']
    for field in trans_fields:
        if field != 'category':
            merged.ensure_index([('ss_id', pymongo.ASCENDING),
                                 (field, pymongo.ASCENDING)])
            # three-item compound index used when a category filter is applied
            merged.ensure_index([('ss_id', pymongo.ASCENDING),
                                 ('category', pymongo.ASCENDING),
                                 (field, pymongo.ASCENDING)])
            # index supporting regex queries on gene names
            merged.ensure_index([('ss_id', pymongo.ASCENDING),
                                 (field, pymongo.ASCENDING),
                                 ('nearest_gene_names', pymongo.ASCENDING)])
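# A minimal sketch of the query shapes the compound indexes above are
# built to serve, based on the comments in db_index. The field 'nes'
# appears in the results indexes above; the category value and the
# gene-name prefix are hypothetical placeholders for illustration.
def _example_merged_queries(name, host, ss_id):
    client = pymongo.MongoClient(host)
    merged = client[name].merged
    # equality on ss_id plus a sort on a report field matches the
    # two-item (ss_id, field) index
    top = merged.find({'ss_id': ss_id}).sort('nes', pymongo.ASCENDING)
    # adding a category filter matches the three-item
    # (ss_id, category, field) index
    by_cat = merged.find({'ss_id': ss_id,
                          'category': 'protein_coding'}).sort(
                              'nes', pymongo.ASCENDING)
    # an anchored regex on nearest_gene_names targets the
    # (ss_id, field, nearest_gene_names) index described above
    by_gene = merged.find({'ss_id': ss_id,
                           'nearest_gene_names': {'$regex': '^TP53'}})
    return top, by_cat, by_gene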
def db_metadata_import(input_dir, name, host):
    logging.info('importing ssea data to %s database on mongo server: %s' %
                 (name, host))
    logging.info('Importing samples file')
    _ssea_path = ssea.__path__[0]
    _merge_path = os.path.join(_ssea_path, 'utils/mongo_metadata_printJSON.py')
    # stream JSON from the converter script straight into mongoimport
    p1 = subprocess.Popen(['python', _merge_path, input_dir, '-c'],
                          stdout=subprocess.PIPE)
    p2 = subprocess.Popen(['mongoimport', '-c', 'samples',
                           '--host', host, '-d', name],
                          stdin=p1.stdout)
    p1.wait()
    p2.wait()
    logging.info('Importing transcripts file')
    p1 = subprocess.Popen(['python', _merge_path, input_dir, '-r'],
                          stdout=subprocess.PIPE)
    p2 = subprocess.Popen(['mongoimport', '-c', 'transcripts',
                           '--host', host, '-d', name],
                          stdin=p1.stdout)
    p1.wait()
    p2.wait()
    sample_index_fields = ['library_id', 'tcga_disease_type',
                           'assembly_cohort', 'fragment_length_mean',
                           'cohort', 'cancer_progression', 'tcga_legacy_id']
    transcript_index_fields = ['category', 'nearest_gene_names', 'gene_id',
                               'locus', 'name']
    colls = db_connect(name, host)
    samples = colls['samples']
    logging.info('indexing samples collection')
    for field in sample_index_fields:
        samples.ensure_index(field)
    transcripts = colls['transcripts']
    logging.info('indexing transcripts collection')
    for field in transcript_index_fields:
        transcripts.ensure_index(field)
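# db_connect is called above but not defined in this excerpt; this is a
# minimal sketch consistent with how it is used (returning a dict mapping
# collection names to pymongo Collection objects). The exact set of
# collections it exposes is an assumption.
def _db_connect_sketch(name, host):
    client = pymongo.MongoClient(host)
    db = client[name]
    return {'sample_sets': db.sample_sets,
            'results': db.results,
            'samples': db.samples,
            'transcripts': db.transcripts}

# Typical usage, assuming a reachable mongod and an SSEA output directory
# (paths and names below are placeholders):
#   db_metadata_import('/path/to/ssea_output', 'ssea', 'localhost:27017')
#   db_index('ssea', 'localhost:27017')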