Example #1
0
def parse(columns, metadata, lines):
    """Parse tab-separated network edge lines and yield edge documents.

    Each line is expected to contain: source, target, one score token per
    entry in ``columns``, and a trailing composite score (hence
    ``len(columns) + 3`` tokens).  Malformed lines are skipped.

    :param columns: ordered column names, one per individual score token
    :param metadata: dict mapping column name -> metadata record; each
        record's ``'count'`` is incremented per edge emitted and its
        ``'_id'`` is attached to the yielded document
    :param lines: iterable of raw tab-separated lines
    :yields: dicts with keys ``source``, ``target``, ``score``, ``meta``
    """
    status = Status('networks', logger=log).n(len(lines)).start()
    for idx, line in enumerate(lines):
        status.log(idx)
        tokens = line.split('\t')

        # skip lines that do not have the expected token count:
        # source + target + per-column scores + composite score
        if len(tokens) != len(columns) + 3:
            continue

        source = tokens[0]
        target = tokens[1]

        # humannet composite score (tokens[-1]) is intentionally unused;
        # only the individual per-column scores are loaded
        #score = float(tokens[-1])

        for column, token in itertools.izip(columns, tokens[2:-1]):
            # keep the try body minimal: only the float conversion can
            # legitimately raise ValueError (empty/non-numeric token);
            # the original wrapped the counter update and yield too,
            # which could silently swallow unrelated ValueErrors
            try:
                # individual edge score
                score = float(token)
            except ValueError:
                continue
            metadata[column]['count'] += 1
            yield {
                'source': source,
                'target': target,
                'score': score,
                'meta': metadata[column]['_id']
            }

    status.stop()
Example #2
0
def parse(columns, metadata, lines):
    """Parse tab-separated network edge lines and yield edge documents.

    Each line is expected to contain: source, target, one score token per
    entry in ``columns``, and a trailing composite score (hence
    ``len(columns) + 3`` tokens).  Malformed lines are skipped.

    :param columns: ordered column names, one per individual score token
    :param metadata: dict mapping column name -> metadata record; each
        record's ``'count'`` is incremented per edge emitted and its
        ``'_id'`` is attached to the yielded document
    :param lines: iterable of raw tab-separated lines
    :yields: dicts with keys ``source``, ``target``, ``score``, ``meta``
    """
    status = Status('networks', logger=log).n(len(lines)).start()
    for idx, line in enumerate(lines):
        status.log(idx)
        tokens = line.split('\t')

        # skip lines that do not have the expected token count:
        # source + target + per-column scores + composite score
        if len(tokens) != len(columns) + 3:
            continue

        source = tokens[0]
        target = tokens[1]

        # humannet composite score (tokens[-1]) is intentionally unused;
        # only the individual per-column scores are loaded
        #score = float(tokens[-1])

        for column, token in itertools.izip(columns, tokens[2:-1]):
            # keep the try body minimal: only the float conversion can
            # legitimately raise ValueError (empty/non-numeric token);
            # the original wrapped the counter update and yield too,
            # which could silently swallow unrelated ValueErrors
            try:
                # individual edge score
                score = float(token)
            except ValueError:
                continue
            metadata[column]['count'] += 1
            yield {
                'source': source,
                'target': target,
                'score': score,
                'meta': metadata[column]['_id']
            }

    status.stop()
Example #3
0
def main():
    """Download the GeneMania Homo sapiens network index and load each
    network's edges into MongoDB (db ``networks``, collections ``meta``
    and ``edges``).

    With ``--warmstart``, identifier loading is skipped and networks whose
    metadata already has ``status: 'success'`` are not reloaded.  Old
    metadata/edge records for a replaced network are dropped only after
    the new network finishes processing, and orphaned edges are purged at
    the end.  Returns 0 on success.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--warmstart', action='store_true', help='warmstart')
    args = parser.parse_args()

    # warmstart reuses previously loaded identifiers
    if not args.warmstart:
        load_identifiers()

    client = pymongo.MongoClient()
    db = client.networks

    # collection stores metadata about source networks
    meta = db.meta

    # collection stores edge data
    edges = db.edges

    create_edges_index()

    url = 'http://genemania.org/data/current/Homo_sapiens/networks.txt'
    log.info('reading network list from %s', url)
    r = requests.get(url)
    lines = list(r.iter_lines())[1:] # ignore header line

    status = Status('networks', logger=log).n(len(lines)).start()
    for idx, line in enumerate(lines):
        status.log(idx)
        # index file columns: file name, network group, network name, source, pubmed id
        file_name, network_group_name, network_name, source, pubmed_id = line.split('\t')

        metadata = {
            'collection': 'identifiers',
            'type': network_group_name.lower(),
            'source': source,
            'name': network_name,
            'pubmed': int(pubmed_id) if not pubmed_id == '' else 0
        }

        # skip networks already loaded successfully when warmstarting
        # NOTE(review): dict(d.items() + [...]) concatenates lists — Python 2 only
        if not args.warmstart or meta.find_one(dict(metadata.items() + [('status', 'success')])) is None:

            # old metadata records and their associated edges will be dropped after the new network is finished processing
            _ids = [result['_id'] for result in meta.find(metadata)]
            log.info('found %d matching network(s) that will be replaced: %s', len(_ids), ', '.join([str(_id) for _id in _ids]))

            # insert a new metadata record first so edges can reference its _id
            set_status(metadata, 'parsing')
            _id = meta.insert_one(metadata).inserted_id

            metadata['count'] = load_network('http://genemania.org/data/current/Homo_sapiens/' + file_name, _id)
            log.info('%s %s %s network has %d edges', metadata['source'], metadata['name'], metadata['type'], metadata['count'])

            # mark the record complete (save() is deprecated in modern pymongo)
            set_status(metadata, 'success')
            meta.save(metadata)

            # only now is it safe to drop the superseded metadata records
            if len(_ids) > 0:
                log.info('dropping old network metadata')
                meta.delete_many({'_id': {'$in': _ids}})

    # purge edges whose metadata record no longer exists
    log.info('dropping old edge data')
    edges.delete_many({'meta': {'$nin': [it['_id'] for it in meta.find()]}})

    status.stop()
    return 0
Example #4
0
def main():
    """Download the GeneMania Homo sapiens network index and load each
    network's edges into MongoDB (db ``networks``, collections ``meta``
    and ``edges``).

    Flags: ``--id`` loads identifiers only; ``--batch`` sets the insert
    batch size; ``--warmstart`` skips identifier loading and networks
    whose metadata already has ``status: 'success'``.  Old metadata/edge
    records for a replaced network are dropped only after the new network
    finishes processing.  Returns 0 on success.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--id', action='store_true', help='load identifiers only')
    parser.add_argument('--batch', type=int, default=10000, help='insert records batch size')
    parser.add_argument('--warmstart', action='store_true', help='warmstart')
    args = parser.parse_args()

    # warmstart reuses previously loaded identifiers
    if not args.warmstart:
        load_identifiers()

    # with --id, stop after identifiers; skip all network loading
    if not args.id:

        client = pymongo.MongoClient()
        db = client.networks

        # collection stores metadata about source networks
        meta = db.meta

        # collection stores edge data
        edges = db.edges

        create_edges_index()

        url = 'http://genemania.org/data/current/Homo_sapiens/networks.txt'
        log.info('reading network list from %s', url)
        r = requests.get(url)
        lines = list(r.iter_lines())[1:] # ignore header line

        status = Status('networks', logger=log).n(len(lines)).start()
        for idx, line in enumerate(lines):
            status.log(idx)
            # index file columns: file name, network group, network name, source, pubmed id
            file_name, network_group_name, network_name, source, pubmed_id = line.split('\t')

            metadata = {
                'collection': 'genemania',
                'type': network_group_name.lower(),
                'source': source,
                'name': network_name,
                'pubmed': int(pubmed_id) if not pubmed_id == '' else 0
            }

            # skip networks already loaded successfully when warmstarting
            # NOTE(review): dict(d.items() + [...]) concatenates lists — Python 2 only
            if not args.warmstart or meta.find_one(dict(metadata.items() + [('status', 'success')])) is None:

                # old metadata records and their associated edges will be dropped after the new network is finished processing
                _ids = [result['_id'] for result in meta.find(metadata)]
                log.info('found %d matching network(s) that will be replaced: %s', len(_ids), ', '.join([str(_id) for _id in _ids]))

                # insert a new metadata record first so edges can reference its _id
                set_status(metadata, 'parsing')
                _id = meta.insert_one(metadata).inserted_id

                metadata['count'] = load_network('http://genemania.org/data/current/Homo_sapiens/' + file_name, _id, args.batch)
                log.info('%s %s %s network has %d edges', metadata['source'], metadata['name'], metadata['type'], metadata['count'])

                # mark the record complete (save() is deprecated in modern pymongo)
                set_status(metadata, 'success')
                meta.save(metadata)

                # only now is it safe to drop the superseded metadata records
                if len(_ids) > 0:
                    log.info('dropping old network metadata')
                    meta.delete_many({'_id': {'$in': _ids}})

        # purge edges whose metadata record no longer exists
        cleanup_edges()

        status.stop()

    return 0