# Example 1
              webentity_creation_rules=webentity_creation_rules)

print '\n:: Store network...'

use_index_batch_crawl = True

if use_index_batch_crawl:
    data = {}
    for source_lru, target_lru in LINKS:
        if source_lru in data:
            links = data[source_lru]
        else:
            links = []
        links.append(target_lru)
        data[source_lru] = links
    report = traph.index_batch_crawl(data)
    webentity_store.data['webentities'].update(report.created_webentities)
else:
    for lru in PAGES:
        # add page
        report = traph.add_page(lru)
        webentity_store.data['webentities'].update(report.created_webentities)

    # add links
    links_report = traph.add_links(LINKS)
    webentity_store.data['webentities'].update(
        links_report.created_webentities)

print '...data stored.'

# Log result
# Example 2

links_multimap = defaultdict(list)

i = 0
links = []
for page in collection.find({}, {'lru': 1, 'lrulinks': 1}, sort=[('_job', 1)]):
    i += 1

    # links.extend(links_generator(page))

    links_multimap[page['lru']].extend(page['lrulinks'])

    # traph.add_links(links_generator(page))

    if i % 100 == 0:
        print '(%i) [%i] - %s' % (i, len(page['lrulinks']), page['lru'])

    # for link in page['lrulinks']:
    #     traph.add_page(link)

print 'Gathered links'
# traph.add_links(links)
# print links_multimap
# print links
# print len(links)
# print sum([len(i) for i in links_multimap.values()])
traph.index_batch_crawl(links_multimap)

traph.close()