Esempio n. 1
0
if use_index_batch_crawl:
    # Batch mode: group the link targets by their source page, giving
    # {source_lru: [target_lru, ...]} with targets kept in LINKS order.
    # NOTE(review): PAGES is not read on this path -- presumably
    # index_batch_crawl derives the pages from the links; confirm.
    data = {}
    for source_lru, target_lru in LINKS:
        data.setdefault(source_lru, []).append(target_lru)

    # Hand the grouped links to the traph in one call and record the
    # webentities its report lists as created.
    report = traph.index_batch_crawl(data)
    webentity_store.data['webentities'].update(report.created_webentities)
else:
    # Incremental mode: add the pages one by one, recording the webentities
    # each report lists as created...
    for lru in PAGES:
        report = traph.add_page(lru)
        webentity_store.data['webentities'].update(report.created_webentities)

    # ...then add every link in a single call and record its report too.
    links_report = traph.add_links(LINKS)
    webentity_store.data['webentities'].update(
        links_report.created_webentities)

print('...data stored.')

# Log result: list the LRU of every (node, lru) pair yielded by the traph.
print('\nPages:')
for node, lru in traph.pages_iter():
    print(' - ' + lru)

# Header for the link listing that follows.
print('\nPage Links:')
Esempio n. 2
0
}

# JSON-backed store used alongside the traph in this script.
webentity_store = WebEntityStore('./scripts/data/webentities.json')

# Build the traph with the creation rules defined earlier in the script.
# NOTE(review): overwrite=True presumably discards any data already present
# in the folder -- confirm against the Traph constructor.
traph = Traph(
    overwrite=True,
    folder='./scripts/data/',
    default_webentity_creation_rule=default_webentity_creation_rule,
    webentity_creation_rules=webentity_creation_rules,
)

# Shortcuts to the two underlying structures exposed by the traph.
trie = traph.lru_trie
links = traph.link_store

# Show the header of each structure.
print(trie.header)
print(links.header)

# Feed the traph: every page first, then all links in one call.
for page in PAGES:
    traph.add_page(page)

traph.add_links(LINKS)

# Print each (source, target) pair yielded by the traph.
for source_lru, target_lru in traph.links_iter():
    print('Source: %s, Target: %s' % (source_lru, target_lru))

# Print every node of the link store.
for node in links.nodes_iter():
    print(node)

print('\nDetailed DFS...')

# Walk the trie, printing each traversal state and mirroring its node into
# a networkx graph: the node's block is the graph node key and its
# character (as a string) is the label.
g = nx.Graph()
for state in trie.detailed_dfs_iter():
    print(state)

    g.add_node(state.node.block, label=state.node.char_as_str())
Esempio n. 3
0
# Store side: each webentity id followed by its prefixes, one per line.
print('\nResult - Existing webentities from Store:')
for weid, prefixes in webentity_store.data['webentities'].items():
    print(' - Webentity %s:' % weid)
    for prefix in prefixes:
        print('\t\t' + prefix)

# Traph side: each prefix LRU with the webentity reported by its node.
print('\nResult - Prefixes from Traph:')
for node, lru in traph.webentity_prefix_iter():
    print(' - (%s) \t%s' % (node.webentity(), lru))

# Step 4
print('\n:: Step 4 - Add the "Airbus/blog" page')
print('Expected: Create the NON-HTTPS Airbus webentity')

# The same LRU is added here and looked up twice below, so name it once.
blog_page_lru = 's:http|h:com|h:airbus|p:blog|'
report = traph.add_page(blog_page_lru)
webentity_store.data['webentities'].update(report.created_webentities)

# Store side: each webentity id followed by its prefixes, one per line.
print('\nResult - Existing webentities from Store:')
for weid, prefixes in webentity_store.data['webentities'].items():
    print(' - Webentity %s:' % weid)
    for prefix in prefixes:
        print('\t\t' + prefix)

# Traph side: each prefix LRU with the webentity reported by its node.
print('\nResult - Prefixes from Traph:')
for node, lru in traph.webentity_prefix_iter():
    print(' - (%s) \t%s' % (node.webentity(), lru))

# Ask the traph which webentity and which prefix the new page resolves to.
print('\nResult - Airbus blog page belongs to webentity %s via prefix %s' % (
    traph.retrieve_webentity(blog_page_lru),
    traph.retrieve_prefix(blog_page_lru)))
# The records kept in this store (presumably the webentities and their
# prefixes) could be retrieved from the traph, but not efficiently.
# In a real situation they would be tracked elsewhere; this store
# simulates that. Start from an empty set of webentities.
webentity_store = WebEntityStore('./scripts/data/webentities.json')
webentity_store.data['webentities'] = {}

# Instantiate the traph with the creation rules defined earlier.
traph = Traph(
    overwrite=True,
    folder='./scripts/data/',
    default_webentity_creation_rule=default_webentity_creation_rule,
    webentity_creation_rules=webentity_creation_rules,
)

# Store data: pages one by one, then all links in a single call. After each
# call, record the webentities the returned report lists as created.
print('Store pages...')
for page in PAGES:
    report = traph.add_page(page)
    webentity_store.data['webentities'].update(report.created_webentities)
    # Debug aid: print(report)

print('Store links...')
links_report = traph.add_links(LINKS)
webentity_store.data['webentities'].update(links_report.created_webentities)
# Debug aid: print(links_report)

print('...data stored.')

# Log result: list the LRU of every (node, lru) pair yielded by the traph.
print('\nPages:')
for node, lru in traph.pages_iter():
    print(' - ' + lru)
Esempio n. 5
0
# In a real situation the records kept here would be tracked elsewhere;
# this store simulates that. Start from an empty set of webentities.
webentity_store = WebEntityStore('./scripts/data/webentities.json')
webentity_store.data['webentities'] = {}

# Instantiate the traph with the creation rules defined earlier.
traph = Traph(
    overwrite=True,
    folder='./scripts/data/',
    default_webentity_creation_rule=default_webentity_creation_rule,
    webentity_creation_rules=webentity_creation_rules,
)

# Step 1
print('\n:: Step 1: Add the "Madrid" page')
print('Expected: "Europe" webentity created (matching the rule given at init), "World" not created')

# Add the page and record the webentities the report lists as created.
report = traph.add_page('s:http|h:com|h:world|p:europe|p:spain|p:madrid|')
webentity_store.data['webentities'].update(report.created_webentities)

# One line per webentity: its id, its first prefix, and how many other
# prefixes it has.
print('\nResult - Existing webentities:')
for weid, prefixes in webentity_store.data['webentities'].items():
    print(' - Webentity %s\t%s + %s other prefixes' % (
        weid, prefixes[0], len(prefixes) - 1))

# Step 2
print('\n:: Step 2: Remove the "Continents" rule and add the "Tokyo" page')
print('Expected: "World" webentity created, "Asia" not created')

# Drop the creation rule registered on the "world" prefix before adding the
# page, then record the webentities the report lists as created.
traph.remove_webentity_creation_rule('s:http|h:com|h:world|')
report = traph.add_page('s:http|h:com|h:world|p:asia|p:japan|p:tokyo|')
webentity_store.data['webentities'].update(report.created_webentities)