'(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|))',
    # 'path1' and 'path2' reuse the pattern above and additionally require
    # exactly one, respectively two, trailing `p:...|` segments.
    'path1': '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|)(p:[^\\|]+\\|){1})',
    'path2': '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|)(p:[^\\|]+\\|){2})'
}

# The 'domain' pattern is handed to the Traph below as its default creation
# rule; no prefix-specific rules are registered in this script (empty dict).
default_webentity_creation_rule = webentity_creation_rules_regexp['domain']
webentity_creation_rules = {}

# Webentity store is necessary to keep track of web entities' prefixes.
# Though the traph could retrieve them, it would not be efficient.
# In a real situation, these would be tracked elsewhere.
# That's what we are simulating with this store.
webentity_store = WebEntityStore('./scripts/data/webentities.json')
# Start from an empty 'webentities' mapping in the store's data.
webentity_store.data['webentities'] = {}

# Instantiate the traph
# NOTE(review): overwrite=True presumably discards any traph data already
# present in the folder -- confirm against the Traph constructor.
traph = Traph(overwrite=True, folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)

print '\n:: Store network...'

# Hard-coded switch selecting the branch below.
use_index_batch_crawl = True

if use_index_batch_crawl:
    data = {}
    for source_lru, target_lru in LINKS:
'(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|))',
    # 'path1' and 'path2' reuse the pattern above and additionally require
    # exactly one, respectively two, trailing `p:...|` segments.
    'path1': '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|)(p:[^\\|]+\\|){1})',
    'path2': '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|)(p:[^\\|]+\\|){2})'
}

# The 'domain' pattern is handed to the Traph below as its default creation rule.
default_webentity_creation_rule = webentity_creation_rules_regexp['domain']

# Prefix-specific rules, keyed by LRU prefix: the twitter and facebook
# prefixes use the 'path1' pattern, the linkedin prefix uses 'path2'.
webentity_creation_rules = {
    's:http|h:com|h:twitter|': webentity_creation_rules_regexp['path1'],
    's:http|h:com|h:facebook|': webentity_creation_rules_regexp['path1'],
    's:http|h:com|h:linkedin|': webentity_creation_rules_regexp['path2']
}

# NOTE(review): the store is created here but not used in the visible part of
# the script -- presumably it is needed further down; confirm.
webentity_store = WebEntityStore('./scripts/data/webentities.json')

# NOTE(review): overwrite=True presumably discards any traph data already
# present in the folder -- confirm against the Traph constructor.
traph = Traph(overwrite=True, folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)

# Shorthand handles on the traph's two underlying structures.
trie = traph.lru_trie
links = traph.link_store

# Print the header of each structure.
print trie.header
print links.header

# Add every page one at a time, then add all links in a single call.
for page in PAGES:
    traph.add_page(page)

traph.add_links(LINKS)