Beispiel #1
0
def parse(columns, metadata, lines):
    """Yield one edge record per numeric score found in tab-separated lines.

    A usable line consists of a source token, a target token, one score
    token per entry in ``columns`` and one trailing token, i.e.
    ``len(columns) + 3`` tokens in total. Lines with any other token count
    are skipped, as are individual score tokens that ``float()`` rejects.

    Args:
        columns: Names of the per-column scores, in the order they appear on
            each line after the source and target tokens.
        metadata: Mapping from column name to a dict providing ``'count'``
            (incremented once per yielded edge) and ``'_id'`` (copied into
            the yielded record under ``'meta'``).
        lines: Sequence of tab-separated strings; must support ``len()``.

    Yields:
        dict: A record with the keys ``'source'``, ``'target'``, ``'score'``
        and ``'meta'``.
    """
    status = Status('networks', logger=log).n(len(lines)).start()
    expected_tokens = len(columns) + 3

    for idx, line in enumerate(lines):
        status.log(idx)
        tokens = line.split('\t')

        if len(tokens) != expected_tokens:
            continue

        source, target = tokens[0], tokens[1]

        # tokens[-1] is the humannet composite score; it is intentionally not
        # loaded -- only the individual per-column edge scores are.
        # The builtin zip is used because itertools.izip is Python-2-only.
        for column, token in zip(columns, tokens[2:-1]):
            # Guard only the conversion, so that a ValueError raised anywhere
            # else is not silently swallowed.
            try:
                score = float(token)
            except ValueError:
                continue

            metadata[column]['count'] += 1
            yield {
                'source': source,
                'target': target,
                'score': score,
                'meta': metadata[column]['_id']
            }

    status.stop()
Beispiel #2
0
def parse(columns, metadata, lines):
    """Generate edge records from tab-separated network lines.

    Every accepted line has exactly ``len(columns) + 3`` tokens: source,
    target, one score per column and a final trailing token. Other lines are
    ignored. For each column whose token converts to ``float`` a record is
    yielded and that column's edge counter is incremented.

    Args:
        columns: Score column names, ordered as the score tokens on a line.
        metadata: Dict keyed by column name; each value must offer the keys
            ``'count'`` (mutated here) and ``'_id'`` (stored as ``'meta'``).
        lines: Sequence of tab-separated strings; ``len(lines)`` is used to
            size the progress reporter.

    Yields:
        dict: ``{'source': ..., 'target': ..., 'score': ..., 'meta': ...}``.
    """
    status = Status('networks', logger=log).n(len(lines)).start()
    wanted = len(columns) + 3

    for idx, line in enumerate(lines):
        status.log(idx)
        tokens = line.split('\t')

        if len(tokens) != wanted:
            continue

        source, target = tokens[0], tokens[1]

        # The last token (humannet composite score) is skipped on purpose;
        # only the individual edge scores in tokens[2:-1] are emitted.
        # zip() works on Python 2 and 3, unlike itertools.izip.
        for column, token in zip(columns, tokens[2:-1]):
            try:
                score = float(token)
            except ValueError:
                # this column carries no numeric score on this line
                continue

            metadata[column]['count'] += 1
            yield {
                'source': source,
                'target': target,
                'score': score,
                'meta': metadata[column]['_id']
            }

    status.stop()
Beispiel #3
0
def load_entrez_identifiers():
    """Rebuild the ``identifiers.genemania_entrez`` MongoDB collection.

    Downloads the GeneMANIA Homo sapiens identifier mapping file, keeps the
    rows whose source column equals ``'Entrez Gene ID'``, inserts them in
    batches of 1000 and then indexes the ``NAME`` and ``preferred`` fields.

    Raises:
        requests.HTTPError: If the download answers with an error status.
            The existing collection is left untouched in that case.
    """
    db = pymongo.MongoClient().identifiers
    collection = db.genemania_entrez
    url = 'http://genemania.org/data/current/Homo_sapiens/identifier_mappings.txt'

    status = Status('loading genemania identifiers from ' + url,
                    logger=log).start()

    r = requests.get(url)
    r.raise_for_status()

    # Drop the old data only once the replacement has been downloaded, so a
    # failed request cannot leave the collection empty.
    collection.drop()

    lines = r.iter_lines()
    next(lines, None)  # ignore header row (tolerates an empty body)

    def parse(lines):
        """Yield one document per well-formed 'Entrez Gene ID' row."""
        for line in lines:
            try:
                preferred, name, source = line.split('\t')
            except ValueError as e:
                # row does not have exactly three tab-separated fields
                log.warning('%s', e)
                continue

            if source == 'Entrez Gene ID':
                yield {
                    'preferred': preferred,
                    'name': name,
                    # indexed to support case-insensitive queries
                    'NAME': name.upper(),
                    'source': source
                }

    count = 0
    iterator = parse(lines)
    while True:
        records = list(islice(iterator, 1000))
        if not records:
            break
        count += len(collection.insert_many(records).inserted_ids)
        log.debug('inserted %d identifiers (%d total)', len(records),
                  count)

    log.info('creating NAME and preferred indexes')

    collection.create_index([("NAME", pymongo.ASCENDING)])

    collection.create_index([("preferred", pymongo.ASCENDING)])

    status.stop()
Beispiel #4
0
def load_identifiers():
    """Rebuild the ``identifiers.genemania`` MongoDB collection.

    Downloads the GeneMANIA Danio rerio identifier mapping file, turns every
    row with exactly three tab-separated fields into a document, inserts the
    documents in batches of 1000 and then indexes ``NAME`` and ``preferred``.

    Raises:
        requests.HTTPError: If the download answers with an error status.
            The existing collection is left untouched in that case.
    """
    db = pymongo.MongoClient().identifiers
    collection = db.genemania
    url = 'http://genemania.org/data/current/Danio_rerio/identifier_mappings.txt'

    status = Status('loading genemania identifiers from ' + url, logger=log).start()

    r = requests.get(url)
    r.raise_for_status()

    # Drop the old data only once the replacement has been downloaded, so a
    # failed request cannot leave the collection empty.
    collection.drop()

    lines = r.iter_lines()
    next(lines, None)  # ignore header row (tolerates an empty body)

    def parse(lines):
        """Yield one document per well-formed mapping row."""
        for line in lines:
            try:
                preferred, name, source = line.split('\t')
            except ValueError as e:
                # row does not have exactly three tab-separated fields
                log.warning('%s', e)
                continue

            yield {
                'preferred': preferred,
                'name': name,
                'NAME': name.upper(),  # indexed to support case-insensitive queries
                'source': source
            }

    count = 0
    iterator = parse(lines)
    while True:
        records = list(islice(iterator, 1000))
        if not records:
            break
        count += len(collection.insert_many(records).inserted_ids)
        log.debug('inserted %d identifiers (%d total)', len(records), count)

    log.info('creating NAME and preferred indexes')
    collection.create_indexes([
        pymongo.IndexModel([('NAME', pymongo.ASCENDING)]),
        pymongo.IndexModel([('preferred', pymongo.ASCENDING)])
    ])

    status.stop()
Beispiel #5
0
def main():
    """Load GeneMANIA identifiers and Homo sapiens networks into MongoDB.

    Command-line flags:
        --warmstart: do not reload identifiers, and skip every network for
            which a metadata record with status ``'success'`` already exists.

    For each network that is (re)loaded, a new metadata record is inserted
    first, the edges are loaded through ``load_network``, and only afterwards
    are the previously matching metadata records deleted. Edges whose
    ``meta`` no longer refers to an existing metadata record are removed at
    the end.

    Returns:
        int: Always 0.

    Raises:
        requests.HTTPError: If the network list cannot be downloaded.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--warmstart', action='store_true', help='warmstart')
    args = parser.parse_args()

    if not args.warmstart:
        load_identifiers()

    client = pymongo.MongoClient()
    db = client.networks

    # collection stores metadata about source networks
    meta = db.meta

    # collection stores edge data
    edges = db.edges

    create_edges_index()

    base_url = 'http://genemania.org/data/current/Homo_sapiens/'
    url = base_url + 'networks.txt'
    log.info('reading network list from %s', url)
    r = requests.get(url)
    r.raise_for_status()  # fail clearly instead of parsing an error page
    lines = list(r.iter_lines())[1:]  # ignore header line

    status = Status('networks', logger=log).n(len(lines)).start()
    for idx, line in enumerate(lines):
        status.log(idx)
        file_name, network_group_name, network_name, source, pubmed_id = line.split('\t')

        metadata = {
            'collection': 'identifiers',
            'type': network_group_name.lower(),
            'source': source,
            'name': network_name,
            'pubmed': int(pubmed_id) if pubmed_id else 0
        }

        # dict(mapping, key=value) builds the query on Python 2 and 3 alike
        # (list concatenation with dict.items() only works on Python 2).
        already_loaded = (
            args.warmstart
            and meta.find_one(dict(metadata, status='success')) is not None
        )
        if already_loaded:
            continue

        # old metadata records and their associated edges will be dropped
        # after the new network is finished processing
        old_ids = [result['_id'] for result in meta.find(metadata)]
        log.info('found %d matching network(s) that will be replaced: %s',
                 len(old_ids), ', '.join(str(old_id) for old_id in old_ids))

        set_status(metadata, 'parsing')
        _id = meta.insert_one(metadata).inserted_id

        metadata['count'] = load_network(base_url + file_name, _id)
        log.info('%s %s %s network has %d edges', metadata['source'],
                 metadata['name'], metadata['type'], metadata['count'])

        set_status(metadata, 'success')
        # Collection.save() is deprecated in PyMongo 3 and removed in 4;
        # for a document that has an _id it is an upserting replace.
        meta.replace_one({'_id': _id}, metadata, upsert=True)

        if old_ids:
            log.info('dropping old network metadata')
            meta.delete_many({'_id': {'$in': old_ids}})

    log.info('dropping old edge data')
    edges.delete_many({'meta': {'$nin': [it['_id'] for it in meta.find()]}})

    status.stop()
    return 0
Beispiel #6
0
def main():
    """Load GeneMANIA identifiers and Homo sapiens networks into MongoDB.

    Command-line flags:
        --id: load identifiers only; skip all network processing.
        --batch: batch size passed on to ``load_network`` (default 10000).
        --warmstart: do not reload identifiers, and skip every network for
            which a metadata record with status ``'success'`` already exists.

    For each network that is (re)loaded, a new metadata record is inserted
    first, the edges are loaded through ``load_network``, and only afterwards
    are the previously matching metadata records deleted. ``cleanup_edges``
    is called once all networks have been processed.

    Returns:
        int: Always 0.

    Raises:
        requests.HTTPError: If the network list cannot be downloaded.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--id', action='store_true', help='load identifiers only')
    parser.add_argument('--batch', type=int, default=10000, help='insert records batch size')
    parser.add_argument('--warmstart', action='store_true', help='warmstart')
    args = parser.parse_args()

    if not args.warmstart:
        load_identifiers()

    if args.id:
        return 0

    client = pymongo.MongoClient()
    db = client.networks

    # collection stores metadata about source networks
    meta = db.meta

    # collection stores edge data
    edges = db.edges

    create_edges_index()

    base_url = 'http://genemania.org/data/current/Homo_sapiens/'
    url = base_url + 'networks.txt'
    log.info('reading network list from %s', url)
    r = requests.get(url)
    r.raise_for_status()  # fail clearly instead of parsing an error page
    lines = list(r.iter_lines())[1:]  # ignore header line

    status = Status('networks', logger=log).n(len(lines)).start()
    for idx, line in enumerate(lines):
        status.log(idx)
        file_name, network_group_name, network_name, source, pubmed_id = line.split('\t')

        metadata = {
            'collection': 'genemania',
            'type': network_group_name.lower(),
            'source': source,
            'name': network_name,
            'pubmed': int(pubmed_id) if pubmed_id else 0
        }

        # dict(mapping, key=value) builds the query on Python 2 and 3 alike
        # (list concatenation with dict.items() only works on Python 2).
        already_loaded = (
            args.warmstart
            and meta.find_one(dict(metadata, status='success')) is not None
        )
        if already_loaded:
            continue

        # old metadata records and their associated edges will be dropped
        # after the new network is finished processing
        old_ids = [result['_id'] for result in meta.find(metadata)]
        log.info('found %d matching network(s) that will be replaced: %s',
                 len(old_ids), ', '.join(str(old_id) for old_id in old_ids))

        set_status(metadata, 'parsing')
        _id = meta.insert_one(metadata).inserted_id

        metadata['count'] = load_network(base_url + file_name, _id, args.batch)
        log.info('%s %s %s network has %d edges', metadata['source'],
                 metadata['name'], metadata['type'], metadata['count'])

        set_status(metadata, 'success')
        # Collection.save() is deprecated in PyMongo 3 and removed in 4;
        # for a document that has an _id it is an upserting replace.
        meta.replace_one({'_id': _id}, metadata, upsert=True)

        if old_ids:
            log.info('dropping old network metadata')
            meta.delete_many({'_id': {'$in': old_ids}})

    cleanup_edges()

    status.stop()

    return 0