# Instantiate the traph on the ./scripts/data/ folder with the creation rules
# defined earlier in the script.
# NOTE(review): overwrite=True presumably discards any existing traph data in
# that folder -- confirm against the Traph constructor.
traph = Traph(
    overwrite=True,
    folder='./scripts/data/',
    default_webentity_creation_rule=default_webentity_creation_rule,
    webentity_creation_rules=webentity_creation_rules
)

print('\n:: Simulate a crawl:')
print(' - Create webentity for "s:http|h:com|h:professor|p:augustine|p:sycamore|"')

# Same path under both schemes (http / https), each with and without the
# "h:www" host stem: four prefixes in total.
professor_prefixes = [
    's:http|h:com|h:professor|p:augustine|p:sycamore|',
    's:http|h:com|h:professor|h:www|p:augustine|p:sycamore|',
    's:https|h:com|h:professor|p:augustine|p:sycamore|',
    's:https|h:com|h:professor|h:www|p:augustine|p:sycamore|',
]

# Create the webentity in the traph, then copy what the report lists as
# created into the webentity store.
report = traph.create_webentity(professor_prefixes)
webentity_store.data['webentities'].update(
    report.created_webentities
)

# Flag tested by the `if` that follows to pick the batch crawl code path.
use_index_batch_crawl = True

print(' - Simulate page crawls with links to the list of target pages')

if use_index_batch_crawl:
    data = {}
    for i in range(len(SOURCE_PAGES)):
        lru = SOURCE_PAGES[i]

        # build links
        links = []
        for j in range(len(TARGET_PAGES)):
            if j % 4 == i:
Esempio n. 2
0
# Reset the store's webentity index to an empty dict.
webentity_store.data['webentities'] = {}

# Instantiate the traph on the ./scripts/data/ folder with the creation rules
# defined earlier in the script.
# NOTE(review): overwrite=True presumably discards any existing traph data in
# that folder -- confirm against the Traph constructor.
traph = Traph(
    overwrite=True,
    folder='./scripts/data/',
    default_webentity_creation_rule=default_webentity_creation_rule,
    webentity_creation_rules=webentity_creation_rules
)

print('\n:: Setup')

print('- Create a "Twitter" webentity with the 4 prefix variations (WWW and HTTPS cases)')

# twitter.com under http / https, each with and without the "h:www" stem.
twitter_prefixes = [
    's:http|h:com|h:twitter|',
    's:http|h:com|h:twitter|h:www|',
    's:https|h:com|h:twitter|',
    's:https|h:com|h:twitter|h:www|',
]

# Create the webentity in the traph, then copy what the report lists as
# created into the webentity store.
report = traph.create_webentity(twitter_prefixes)
webentity_store.data['webentities'].update(
    report.created_webentities
)

# Remember the id (first key of the created webentities); used below.
twitter_weid = report.created_webentities.keys()[0]

print('- Create a "Ego" webentity with ego.com (4 prefixes) as well as a Twitter account (additional 4 prefixes)')

# First four entries: ego.com under http / https, with and without "h:www".
# Last four entries: the "ego" path on twitter.com, same four variations.
# NOTE(review): unlike every other prefix here, the Twitter-path entries do
# not end with a trailing '|' -- confirm this is intended.
ego_prefixes = [
    's:http|h:com|h:ego|',
    's:http|h:com|h:ego|h:www|',
    's:https|h:com|h:ego|',
    's:https|h:com|h:ego|h:www|',
    's:http|h:com|h:twitter|p:ego',
    's:http|h:com|h:twitter|h:www|p:ego',
    's:https|h:com|h:twitter|p:ego',
    's:https|h:com|h:twitter|h:www|p:ego',
]

# Create the webentity in the traph, then copy what the report lists as
# created into the webentity store.
report = traph.create_webentity(ego_prefixes)
webentity_store.data['webentities'].update(
    report.created_webentities
)

# Remember the id (first key of the created webentities); used below.
ego_weid = report.created_webentities.keys()[0]

# Announce the next setup step.
print('- Create a "Cheese" webentity with cheese.ego.com, Tweets about cheese  and cheese.fr (12 prefixes)')
Esempio n. 3
0
# Instantiate the traph on the ./scripts/data/ folder with the creation rules
# defined earlier in the script.
# NOTE(review): overwrite=True presumably discards any existing traph data in
# that folder -- confirm against the Traph constructor.
traph = Traph(
    overwrite=True,
    folder='./scripts/data/',
    default_webentity_creation_rule=default_webentity_creation_rule,
    webentity_creation_rules=webentity_creation_rules
)

# Step 1: announce the scenario and its expected outcome.
print('\n:: Step 1 - Create a "Boeing" webentity with the 4 prefix variations (WWW and HTTPS cases).')
print('Expected: Creates the entity with the 4 prefixes. This is the typical use case.')

# boeing.com under http / https, each with and without the "h:www" stem.
boeing_prefixes = [
    's:http|h:com|h:boeing|',
    's:http|h:com|h:boeing|h:www|',
    's:https|h:com|h:boeing|',
    's:https|h:com|h:boeing|h:www|',
]

# Create the webentity in the traph, then copy what the report lists as
# created into the webentity store.
report = traph.create_webentity(boeing_prefixes)
webentity_store.data['webentities'].update(
    report.created_webentities
)

# Remember the id (first key of the created webentities); used for a step below.
boeing_weid = report.created_webentities.keys()[0]

# Show the store's view: every webentity id followed by its prefixes,
# one prefix per line.
print('\nResult - Existing webentities from Store:')
for weid, prefixes in webentity_store.data['webentities'].items():
    print(' - Webentity %s:' % weid)
    for prefix in prefixes:
        print('\t\t' + prefix)

# Show the traph's view: each (node, lru) pair yielded by
# webentity_prefix_iter(), printed as the node's webentity and its LRU.
print('\nResult - Prefixes from Traph:')
for node, lru in traph.webentity_prefix_iter():
    print(' - (%s) \t%s' % (node.webentity(), lru))

# Step 2: announce the next scenario.
print('\n:: Step 2 - Create a "Airbus HTTPS" webentity with only 2 prefix variations (WWW case).')