webentity_store = WebEntityStore('./scripts/data/webentities.json') traph = Traph(overwrite=True, folder='./scripts/data/', default_webentity_creation_rule=default_webentity_creation_rule, webentity_creation_rules=webentity_creation_rules) trie = traph.lru_trie links = traph.link_store print trie.header print links.header for page in PAGES: traph.add_page(page) traph.add_links(LINKS) for source_lru, target_lru in traph.links_iter(): print 'Source: %s, Target: %s' % (source_lru, target_lru) for node in links.nodes_iter(): print node print '\nDetailed DFS...' g = nx.Graph() for state in trie.detailed_dfs_iter(): print state g.add_node(state.node.block, label=state.node.char_as_str()) if state.node.is_root():
if source_lru in data: links = data[source_lru] else: links = [] links.append(target_lru) data[source_lru] = links report = traph.index_batch_crawl(data) webentity_store.data['webentities'].update(report.created_webentities) else: for lru in PAGES: # add page report = traph.add_page(lru) webentity_store.data['webentities'].update(report.created_webentities) # add links links_report = traph.add_links(LINKS) webentity_store.data['webentities'].update( links_report.created_webentities) print '...data stored.' # Log result print '\nPages:' for node, lru in traph.pages_iter(): print ' - ' + lru print '\nPage Links:' i = 0 for source_lru, target_lru in traph.links_iter(): i += 1 print ' - %s\t-> %s' % (source_lru, target_lru)
else: for i in range(len(SOURCE_PAGES)): lru = SOURCE_PAGES[i] # add page report = traph.add_page(lru) webentity_store.data['webentities'].update(report.created_webentities) # build links links = [] for j in range(len(TARGET_PAGES)): if j % 4 == i: links.append([lru, TARGET_PAGES[j]]) # add links links_report = traph.add_links(links) webentity_store.data['webentities'].update( links_report.created_webentities) print '\n:: Stats' print '- %s webentities in the Store' % (len( webentity_store.data['webentities'])) webentities = set() for node, lru in traph.webentity_prefix_iter(): webentities.add(node.webentity()) print '- %s webentities in the Traph' % (len(webentities)) pages = [] for node, lru in traph.lru_trie.dfs_iter(): if node.is_page(): pages.append(lru) print '- %s pages in the Traph' % (len(pages))