# Store the crawl data in the traph, either as one batch or page by page.
# Whichever path runs, the webentities reported as created are merged into
# the store's 'webentities' mapping.
# Single-argument print(...) is valid with both the Python 2 print statement
# and the Python 3 print function.
if use_index_batch_crawl:
    # Group target LRUs by source LRU: {source_lru: [target_lru, ...]}.
    data = {}
    for source_lru, target_lru in LINKS:
        # setdefault creates the list the first time a source is seen and
        # returns the existing list afterwards.
        data.setdefault(source_lru, []).append(target_lru)

    report = traph.index_batch_crawl(data)
    webentity_store.data['webentities'].update(report.created_webentities)
else:
    # Add the pages one at a time...
    for lru in PAGES:
        report = traph.add_page(lru)
        webentity_store.data['webentities'].update(report.created_webentities)

    # ...then add all the links in a single call.
    links_report = traph.add_links(LINKS)
    webentity_store.data['webentities'].update(
        links_report.created_webentities)

print('...data stored.')

# Log result
print('\nPages:')
for node, lru in traph.pages_iter():
    print(' - ' + lru)

print('\nPage Links:')
} webentity_store = WebEntityStore('./scripts/data/webentities.json') traph = Traph(overwrite=True, folder='./scripts/data/', default_webentity_creation_rule=default_webentity_creation_rule, webentity_creation_rules=webentity_creation_rules) trie = traph.lru_trie links = traph.link_store print trie.header print links.header for page in PAGES: traph.add_page(page) traph.add_links(LINKS) for source_lru, target_lru in traph.links_iter(): print 'Source: %s, Target: %s' % (source_lru, target_lru) for node in links.nodes_iter(): print node print '\nDetailed DFS...' g = nx.Graph() for state in trie.detailed_dfs_iter(): print state g.add_node(state.node.block, label=state.node.char_as_str())
# LRU of the page added in step 4.
AIRBUS_BLOG_LRU = 's:http|h:com|h:airbus|p:blog|'


def _print_webentities_state():
    """Print the store's webentities, then the prefixes held by the traph.

    Single-argument print(...) is valid with both the Python 2 print
    statement and the Python 3 print function.
    """
    print('\nResult - Existing webentities from Store:')
    stored = webentity_store.data['webentities']
    for entity_id, entity_prefixes in stored.items():
        print(' - Webentity %s:' % entity_id)
        for entity_prefix in entity_prefixes:
            print('\t\t' + entity_prefix)

    print('\nResult - Prefixes from Traph:')
    for prefix_node, prefix_lru in traph.webentity_prefix_iter():
        print(' - (%s) \t%s' % (prefix_node.webentity(), prefix_lru))


# Report the state reached by the previous step.
_print_webentities_state()

# Step 4
print('\n:: Step 4 - Add the "Airbus/blog" page')
print('Expected: Create the NON-HTTPS Airbus webentity')

report = traph.add_page(AIRBUS_BLOG_LRU)
webentity_store.data['webentities'].update(report.created_webentities)

_print_webentities_state()

# Ask the traph which webentity and which prefix the new page resolves to.
blog_webentity = traph.retrieve_webentity(AIRBUS_BLOG_LRU)
blog_prefix = traph.retrieve_prefix(AIRBUS_BLOG_LRU)
print('\nResult - Airbus blog page belongs to webentity %s via prefix %s'
      % (blog_webentity, blog_prefix))
# Though the traph could retrieve them, it would not be efficient.
# In a real situation, these would be tracked elsewhere.
# That's what we are simulating with this store.
webentity_store = WebEntityStore('./scripts/data/webentities.json')
webentity_store.data['webentities'] = {}


def _record_created_webentities(operation_report):
    """Merge the webentities created by a traph operation into the store."""
    webentity_store.data['webentities'].update(
        operation_report.created_webentities)


# Instantiate the traph
traph = Traph(
    overwrite=True,
    folder='./scripts/data/',
    default_webentity_creation_rule=default_webentity_creation_rule,
    webentity_creation_rules=webentity_creation_rules
)

# Store data
# Single-argument print(...) is valid with both the Python 2 print statement
# and the Python 3 print function.
print('Store pages...')
for page in PAGES:
    report = traph.add_page(page)
    _record_created_webentities(report)
    # Debug: print(report)

print('Store links...')
links_report = traph.add_links(LINKS)
_record_created_webentities(links_report)
# Debug: print(links_report)

print('...data stored.')

# Log result
print('\nPages:')
for node, lru in traph.pages_iter():
    print(' - ' + lru)
# In a real situation, these would be tracked elsewhere.
# That's what we are simulating with this store.
webentity_store = WebEntityStore('./scripts/data/webentities.json')
webentity_store.data['webentities'] = {}


def _record_created_webentities(operation_report):
    """Merge the webentities created by a traph operation into the store."""
    webentity_store.data['webentities'].update(
        operation_report.created_webentities)


# Instantiate the traph
traph = Traph(
    overwrite=True,
    folder='./scripts/data/',
    default_webentity_creation_rule=default_webentity_creation_rule,
    webentity_creation_rules=webentity_creation_rules
)

# Step 1
# Single-argument print(...) is valid with both the Python 2 print statement
# and the Python 3 print function.
print('\n:: Step 1: Add the "Madrid" page')
print('Expected: "Europe" webentity created '
      '(matching the rule given at init), "World" not created')

report = traph.add_page('s:http|h:com|h:world|p:europe|p:spain|p:madrid|')
_record_created_webentities(report)

print('\nResult - Existing webentities:')
for weid, prefixes in webentity_store.data['webentities'].items():
    # Show the first prefix and only count the remaining ones.
    first_prefix = prefixes[0]
    other_prefix_count = len(prefixes) - 1
    print(' - Webentity %s\t%s + %s other prefixes'
          % (weid, first_prefix, other_prefix_count))

# Step 2
print('\n:: Step 2: Remove the "Continents" rule and add the "Tokyo" page')
print('Expected: "World" webentity created, "Asia" not created')

# The rule must be removed before the page is added.
traph.remove_webentity_creation_rule('s:http|h:com|h:world|')
report = traph.add_page('s:http|h:com|h:world|p:asia|p:japan|p:tokyo|')
_record_created_webentities(report)