webentity_creation_rules=webentity_creation_rules)

print '\n:: Store network...'

use_index_batch_crawl = True

if use_index_batch_crawl:
    # Build a multimap of source LRU -> list of target LRUs,
    # then index the whole crawl in a single batch call
    data = {}
    for source_lru, target_lru in LINKS:
        if source_lru in data:
            links = data[source_lru]
        else:
            links = []
        links.append(target_lru)
        data[source_lru] = links
    report = traph.index_batch_crawl(data)
    webentity_store.data['webentities'].update(report.created_webentities)
else:
    # Add pages one by one
    for lru in PAGES:
        report = traph.add_page(lru)
        webentity_store.data['webentities'].update(report.created_webentities)
    # Add all links in one call
    links_report = traph.add_links(LINKS)
    webentity_store.data['webentities'].update(
        links_report.created_webentities)

print '...data stored.'

# Log result
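# A more concise way to build the same source -> targets multimap, as a
# minimal sketch using collections.defaultdict (assuming the same LINKS
# list of (source_lru, target_lru) pairs as above):
#
#   from collections import defaultdict
#
#   data = defaultdict(list)
#   for source_lru, target_lru in LINKS:
#       data[source_lru].append(target_lru)
#
#   report = traph.index_batch_crawl(data)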
from collections import defaultdict

# Gather every page's outgoing links from MongoDB into a multimap of
# source LRU -> list of target LRUs
links_multimap = defaultdict(list)
i = 0
for page in collection.find({}, {'lru': 1, 'lrulinks': 1}, sort=[('_job', 1)]):
    i += 1
    links_multimap[page['lru']].extend(page['lrulinks'])
    # Log progress every 100 pages
    if i % 100 == 0:
        print '(%i) [%i] - %s' % (i, len(page['lrulinks']), page['lru'])

print 'Gathered links'

# Index the whole crawl in a single batch call, then close the traph
traph.index_batch_crawl(links_multimap)
traph.close()