def parse(columns, metadata, lines):
    status = Status('networks', logger=log).n(len(lines)).start()
    for idx, line in enumerate(lines):
        status.log(idx)
        tokens = line.split('\t')
        if len(tokens) != len(columns) + 3:
            continue
        source = tokens[0]
        target = tokens[1]

        # humannet composite score
        #score = float(tokens[-1])

        for column, token in itertools.izip(columns, tokens[2:-1]):
            try:
                # individual edge score
                score = float(token)
                metadata[column]['count'] += 1
                yield {
                    'source': source,
                    'target': target,
                    'score': score,
                    'meta': metadata[column]['_id']
                }
            except ValueError:
                pass
    status.stop()
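# Illustrative sketch only: parse() above yields one edge document per individual
# score, so a consumer (not shown in this section) would typically drain it in
# fixed-size batches and bulk-insert into the edges collection, mirroring the
# islice/insert_many batching pattern used by the identifier loaders below. The
# insert_edges() name, the 'edges' collection handle, and the batch size of 1000
# are assumptions, not part of the original code.
def insert_edges(edges, columns, metadata, lines, batch=1000):
    count = 0
    iterator = parse(columns, metadata, lines)
    while True:
        records = list(islice(iterator, batch))
        if not records:
            break
        count += len(edges.insert_many(records).inserted_ids)
        log.debug('inserted %d edges (%d total)', len(records), count)
    return count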
def load_entrez_identifiers():
    db = pymongo.MongoClient().identifiers
    db.genemania_entrez.drop()
    collection = db.genemania_entrez

    url = 'http://genemania.org/data/current/Homo_sapiens/identifier_mappings.txt'
    status = Status('loading genemania identifiers from ' + url, logger=log).start()
    r = requests.get(url)
    lines = r.iter_lines()
    lines.next()  # ignore header row

    def parse(lines):
        for line in lines:
            try:
                preferred, name, source = line.split('\t')
                if source == 'Entrez Gene ID':
                    yield {
                        'preferred': preferred,
                        'name': name,
                        'NAME': name.upper(),  # indexed to support case-insensitive queries
                        'source': source
                    }
            except Exception as e:
                log.warn(e.message)

    count = 0
    iterator = parse(lines)
    while True:
        records = [record for record in islice(iterator, 1000)]
        if len(records) > 0:
            count += len(collection.insert_many(records).inserted_ids)
            log.debug('inserted %d identifiers (%d total)', len(records), count)
        else:
            break

    log.info('creating NAME and preferred indexes')
    collection.create_index([("NAME", pymongo.ASCENDING)])
    collection.create_index([("preferred", pymongo.ASCENDING)])

    status.stop()
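# Illustrative usage sketch: the upper-cased NAME field indexed above lets callers
# resolve a gene symbol case-insensitively with an exact match instead of a regex
# scan. The lookup_entrez() helper and the 'symbol' argument are hypothetical.
def lookup_entrez(collection, symbol):
    doc = collection.find_one({'NAME': symbol.upper()})
    return doc['preferred'] if doc else None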
def load_identifiers():
    db = pymongo.MongoClient().identifiers
    db.genemania.drop()
    collection = db.genemania

    url = 'http://genemania.org/data/current/Danio_rerio/identifier_mappings.txt'
    status = Status('loading genemania identifiers from ' + url, logger=log).start()
    r = requests.get(url)
    lines = r.iter_lines()
    lines.next()  # ignore header row

    def parse(lines):
        for line in lines:
            try:
                preferred, name, source = line.split('\t')
                yield {
                    'preferred': preferred,
                    'name': name,
                    'NAME': name.upper(),  # indexed to support case-insensitive queries
                    'source': source
                }
            except Exception as e:
                log.warn(e.message)

    count = 0
    iterator = parse(lines)
    while True:
        records = [record for record in islice(iterator, 1000)]
        if len(records) > 0:
            count += len(collection.insert_many(records).inserted_ids)
            log.debug('inserted %d identifiers (%d total)', len(records), count)
        else:
            break

    log.info('creating NAME and preferred indexes')
    collection.create_indexes([
        pymongo.IndexModel([('NAME', pymongo.ASCENDING)]),
        pymongo.IndexModel([('preferred', pymongo.ASCENDING)])
    ])

    status.stop()
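# create_edges_index() is called by main() below but is not defined in this section.
# A minimal sketch, assuming the edge documents produced by parse() ('source',
# 'target', 'score', 'meta') are queried by gene and cleaned up by their 'meta'
# reference; the exact fields indexed here are an assumption.
def create_edges_index():
    edges = pymongo.MongoClient().networks.edges
    log.info('creating source, target, and meta indexes on edges collection')
    edges.create_indexes([
        pymongo.IndexModel([('source', pymongo.ASCENDING)]),
        pymongo.IndexModel([('target', pymongo.ASCENDING)]),
        pymongo.IndexModel([('meta', pymongo.ASCENDING)])
    ])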
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--warmstart', action='store_true', help='warmstart')
    args = parser.parse_args()

    if not args.warmstart:
        load_identifiers()

    client = pymongo.MongoClient()
    db = client.networks

    # collection stores metadata about source networks
    meta = db.meta

    # collection stores edge data
    edges = db.edges

    create_edges_index()

    url = 'http://genemania.org/data/current/Homo_sapiens/networks.txt'
    log.info('reading network list from %s', url)
    r = requests.get(url)
    lines = list(r.iter_lines())[1:]  # ignore header line

    status = Status('networks', logger=log).n(len(lines)).start()
    for idx, line in enumerate(lines):
        status.log(idx)
        file_name, network_group_name, network_name, source, pubmed_id = line.split('\t')
        metadata = {
            'collection': 'identifiers',
            'type': network_group_name.lower(),
            'source': source,
            'name': network_name,
            'pubmed': int(pubmed_id) if not pubmed_id == '' else 0
        }
        if not args.warmstart or meta.find_one(dict(metadata.items() + [('status', 'success')])) is None:
            # old metadata records and their associated edges will be dropped
            # after the new network is finished processing
            _ids = [result['_id'] for result in meta.find(metadata)]
            log.info('found %d matching network(s) that will be replaced: %s',
                     len(_ids), ', '.join([str(_id) for _id in _ids]))

            set_status(metadata, 'parsing')
            _id = meta.insert_one(metadata).inserted_id

            metadata['count'] = load_network('http://genemania.org/data/current/Homo_sapiens/' + file_name, _id)
            log.info('%s %s %s network has %d edges',
                     metadata['source'], metadata['name'], metadata['type'], metadata['count'])

            set_status(metadata, 'success')
            meta.save(metadata)

            if len(_ids) > 0:
                log.info('dropping old network metadata')
                meta.delete_many({'_id': {'$in': _ids}})

                log.info('dropping old edge data')
                edges.delete_many({'meta': {'$nin': [it['_id'] for it in meta.find()]}})

    status.stop()

    return 0
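# set_status() is used by main() above but not defined in this section. A minimal
# sketch, assuming it only stamps the in-memory metadata dict so that the caller's
# meta.insert_one()/meta.save() persists the field checked by the warmstart lookup
# ({'status': 'success'}); any timestamp or other bookkeeping is not shown here.
def set_status(metadata, status):
    log.debug('setting network status to %s', status)
    metadata['status'] = status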
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--id', action='store_true', help='load identifiers only')
    parser.add_argument('--batch', type=int, default=10000, help='insert records batch size')
    parser.add_argument('--warmstart', action='store_true', help='warmstart')
    args = parser.parse_args()

    if not args.warmstart:
        load_identifiers()

    if not args.id:
        client = pymongo.MongoClient()
        db = client.networks

        # collection stores metadata about source networks
        meta = db.meta

        # collection stores edge data
        edges = db.edges

        create_edges_index()

        url = 'http://genemania.org/data/current/Homo_sapiens/networks.txt'
        log.info('reading network list from %s', url)
        r = requests.get(url)
        lines = list(r.iter_lines())[1:]  # ignore header line

        status = Status('networks', logger=log).n(len(lines)).start()
        for idx, line in enumerate(lines):
            status.log(idx)
            file_name, network_group_name, network_name, source, pubmed_id = line.split('\t')
            metadata = {
                'collection': 'genemania',
                'type': network_group_name.lower(),
                'source': source,
                'name': network_name,
                'pubmed': int(pubmed_id) if not pubmed_id == '' else 0
            }
            if not args.warmstart or meta.find_one(dict(metadata.items() + [('status', 'success')])) is None:
                # old metadata records and their associated edges will be dropped
                # after the new network is finished processing
                _ids = [result['_id'] for result in meta.find(metadata)]
                log.info('found %d matching network(s) that will be replaced: %s',
                         len(_ids), ', '.join([str(_id) for _id in _ids]))

                set_status(metadata, 'parsing')
                _id = meta.insert_one(metadata).inserted_id

                metadata['count'] = load_network('http://genemania.org/data/current/Homo_sapiens/' + file_name, _id, args.batch)
                log.info('%s %s %s network has %d edges',
                         metadata['source'], metadata['name'], metadata['type'], metadata['count'])

                set_status(metadata, 'success')
                meta.save(metadata)

                if len(_ids) > 0:
                    log.info('dropping old network metadata')
                    meta.delete_many({'_id': {'$in': _ids}})

                    cleanup_edges()

        status.stop()

    return 0
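# cleanup_edges() replaces the inline edge cleanup used by the earlier main() above
# but is not defined in this section. A minimal sketch, assuming it performs the same
# orphan removal: deleting any edge whose 'meta' reference no longer points at a
# document in the meta collection.
def cleanup_edges():
    db = pymongo.MongoClient().networks
    log.info('dropping old edge data')
    db.edges.delete_many({'meta': {'$nin': [it['_id'] for it in db.meta.find()]}})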