import argparse
import itertools
import logging

import pymongo
import requests

log = logging.getLogger(__name__)

# Status, load_identifiers, create_edges_index, set_status, load_network and
# cleanup_edges are helpers defined elsewhere in this module.


def parse(columns, metadata, lines):
    """Yield one edge document per parseable (column, score) pair."""
    status = Status('networks', logger=log).n(len(lines)).start()
    for idx, line in enumerate(lines):
        status.log(idx)
        tokens = line.split('\t')
        # expect source, target, one score per column, and a trailing composite score
        if len(tokens) != len(columns) + 3:
            continue
        source = tokens[0]
        target = tokens[1]
        # humannet composite score
        #score = float(tokens[-1])
        for column, token in itertools.izip(columns, tokens[2:-1]):
            try:
                # individual edge score
                score = float(token)
                metadata[column]['count'] += 1
                yield {
                    'source': source,
                    'target': target,
                    'score': score,
                    'meta': metadata[column]['_id'],
                }
            except ValueError:
                pass
    status.stop()
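`parse` is a generator, so its caller has to buffer the yielded edge documents and write them to MongoDB in batches (`main` passes `args.batch` through to `load_network` for exactly this purpose). `load_network` itself is not shown in this section; the following is a minimal, hypothetical sketch of such a consumer, assuming the file's header line names the per-column score sources and that every column shares the single meta `_id` passed in (the real code may well insert one metadata record per column):

def load_network(url, _id, batch=10000):
    # hypothetical sketch -- the real load_network is defined elsewhere
    edges = pymongo.MongoClient().networks.edges
    r = requests.get(url)
    lines = list(r.iter_lines())
    columns = lines[0].split('\t')[2:-1]  # assumed header layout
    # per-column bookkeeping in the shape parse() expects
    metadata = dict((column, {'_id': _id, 'count': 0}) for column in columns)
    count, batch_buffer = 0, []
    for edge in parse(columns, metadata, lines[1:]):
        batch_buffer.append(edge)
        if len(batch_buffer) >= batch:
            edges.insert_many(batch_buffer)
            count += len(batch_buffer)
            batch_buffer = []
    if batch_buffer:
        edges.insert_many(batch_buffer)
        count += len(batch_buffer)
    return count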
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--id', action='store_true', help='load identifiers only')
    parser.add_argument('--batch', type=int, default=10000, help='insert records batch size')
    parser.add_argument('--warmstart', action='store_true', help='warmstart')
    args = parser.parse_args()
    if not args.warmstart:
        load_identifiers()
    if not args.id:
        client = pymongo.MongoClient()
        db = client.networks
        # collection stores metadata about source networks
        meta = db.meta
        # collection stores edge data
        edges = db.edges
        create_edges_index()
        url = 'http://genemania.org/data/current/Homo_sapiens/networks.txt'
        log.info('reading network list from %s', url)
        r = requests.get(url)
        lines = list(r.iter_lines())[1:]  # ignore header line
        status = Status('networks', logger=log).n(len(lines)).start()
        for idx, line in enumerate(lines):
            status.log(idx)
            file_name, network_group_name, network_name, source, pubmed_id = line.split('\t')
            metadata = {
                'collection': 'genemania',
                'type': network_group_name.lower(),
                'source': source,
                'name': network_name,
                'pubmed': int(pubmed_id) if pubmed_id else 0,
            }
            if not args.warmstart or meta.find_one(dict(metadata.items() + [('status', 'success')])) is None:
                # old metadata records and their associated edges will be dropped
                # after the new network is finished processing
                _ids = [result['_id'] for result in meta.find(metadata)]
                log.info('found %d matching network(s) that will be replaced: %s',
                         len(_ids), ', '.join(str(_id) for _id in _ids))
                set_status(metadata, 'parsing')
                _id = meta.insert_one(metadata).inserted_id
                metadata['count'] = load_network(
                    'http://genemania.org/data/current/Homo_sapiens/' + file_name, _id, args.batch)
                log.info('%s %s %s network has %d edges',
                         metadata['source'], metadata['name'], metadata['type'], metadata['count'])
                set_status(metadata, 'success')
                # persist the final status and edge count
                meta.save(metadata)
                if len(_ids) > 0:
                    log.info('dropping old network metadata')
                    meta.delete_many({'_id': {'$in': _ids}})
                    cleanup_edges()
        status.stop()
    return 0
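The helpers `create_edges_index`, `set_status`, and `cleanup_edges` (like `load_identifiers`) are defined elsewhere in the module. The sketches below show one plausible minimal implementation of each, assuming the edges collection is queried by its `meta` field (which is what the cleanup filter below uses) and that `set_status` only stamps the in-memory dict before the caller persists it:

def create_edges_index():
    # minimal sketch: index edges on the meta id they reference, which the
    # $nin cleanup query and per-network lookups filter on (assumed)
    pymongo.MongoClient().networks.edges.create_index('meta')

def set_status(metadata, status):
    # minimal sketch: stamp the in-memory metadata dict; the caller persists it
    metadata['status'] = status

def cleanup_edges():
    # minimal sketch: drop edges whose metadata record no longer exists
    db = pymongo.MongoClient().networks
    db.edges.delete_many({'meta': {'$nin': [it['_id'] for it in db.meta.find()]}})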