class TraphServerFactory(Factory):
    default_WECR = '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|))'

    WECRs = {
        's:http|h:com|h:world|': '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|)(p:[^\\|]+\\|){1})'
    }

    def __init__(self, corpus, traph_dir="traph-data", default_WECR=None, WECRs=None):
        self.traph_dir = traph_dir
        self.corpus = corpus

        if not os.path.isdir(self.traph_dir):
            os.makedirs(self.traph_dir)

        self.traph = Traph(
            folder=os.path.join(self.traph_dir, corpus),
            default_webentity_creation_rule=default_WECR or self.default_WECR,
            webentity_creation_rules=WECRs or self.WECRs
        )

    def ready(self):
        # Message printed so the parent process knows the traph is ready
        print "READY"

    def buildProtocol(self, addr):
        return TraphProtocol(self.traph)

    def close(self):
        self.traph.close()

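# Hypothetical usage sketch (assumption, not from the source): serving this
# factory over a TCP endpoint with Twisted. The corpus name and port below
# are made up for illustration; the real service may listen differently.
from twisted.internet import reactor

factory = TraphServerFactory('my-corpus')   # hypothetical corpus name
reactor.listenTCP(12345, factory)           # hypothetical port
factory.ready()                             # prints "READY" once listening
reactor.run()
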
def main():
    """Program entry point"""
    data = Data.get_csv_data()

    if len(data) < 2:
        print('Invalid CSV file')
        return

    if any([len(row) <= 13 for row in data]):
        print('Not all columns are present')
        return

    for i, row in enumerate(data[1:], start=2):
        # Replace straight quotes with guillemets and strip extra whitespace
        row = [replace_quotes(string.strip()) for string in row]

        kwargs = {
            'name': row[0],            # Full name
            'short_name': row[1],      # Designation
            'what': row[2],            # What kind of cookie it is
            'netto': row[3],           # Net weight
            'brutto': row[4],          # Gross weight
            'proteins': row[5],        # Proteins
            'fats': row[6],            # Fats
            'carbohydrates': row[7],   # Carbohydrates
            'kkal': row[8],            # Kilocalories
            'tu': row[9],              # TU
            'category': row[10],       # Category
            'composition': row[11],    # Composition
            'shelf_life': row[12],     # Shelf life
            'code': row[13],           # Barcode
        }

        if len(row) > 14:
            kwargs['bold_text'] = row[14]   # Bold text
        if len(row) > 15:
            kwargs['top_image'] = row[15]   # Image at the top

        if len(kwargs['category']) == 0:
            print('Category is empty on row {}.'.format(i))
            continue

        traph = Traph(**kwargs)
        traph.make_traph()

        # Free the memory
        del traph

    print('The program has finished!')

def get_traph(self, **kwargs):
    options = {
        'overwrite': False,
        'default_webentity_creation_rule': DEFAULT_WEBENTITY_CREATION_RULE,
        'webentity_creation_rules': WEBENTITY_CREATION_RULES,
        'folder': self.folder
    }
    options.update(kwargs)

    return Traph(**options)

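# Illustrative caller (assumption, not from the source): a sibling method can
# override a single option while keeping the defaults merged by get_traph,
# and close the traph when done.
def use_fresh_traph(self):   # hypothetical helper name
    traph = self.get_traph(overwrite=True)
    # ... exercise the traph here ...
    traph.close()
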
webentity_creation_rules_regexp = {
    'domain': '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|))',
    'path1': '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|)(p:[^\\|]+\\|){1})',
    'path2': '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|)(p:[^\\|]+\\|){2})'
}

default_webentity_creation_rule = webentity_creation_rules_regexp['domain']

webentity_creation_rules = {
    's:http|h:com|h:twitter|': webentity_creation_rules_regexp['path1'],
}

# Creating the Traph
traph = Traph(folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)

webentities_network = traph.get_webentities_links()

# g = nx.Graph()
# for source, targets in webentities_network.items():
#     g.add_node(source, label=source)
#     for target in targets:
#         g.add_node(target, label=target)
#         g.add_edge(source, target)

# nx.write_gexf(g, './scripts/data/dump.gexf')

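# Runnable version of the commented-out dump above (a sketch assuming the
# networkx package is available): webentities_network maps each source
# webentity id to the ids it links to, and the graph is exported as GEXF.
import networkx as nx

g = nx.Graph()

for source, targets in webentities_network.items():
    g.add_node(source, label=source)
    for target in targets:
        g.add_node(target, label=target)
        g.add_edge(source, target)

nx.write_gexf(g, './scripts/data/dump.gexf')
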
    'path2': '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|)(p:[^\\|]+\\|){2})'
}

default_webentity_creation_rule = webentity_creation_rules_regexp['domain']

webentity_creation_rules = {
    's:http|h:com|h:twitter|': webentity_creation_rules_regexp['path1'],
    's:http|h:com|h:facebook|': webentity_creation_rules_regexp['path1'],
    's:http|h:com|h:linkedin|': webentity_creation_rules_regexp['path2']
}

webentity_store = WebEntityStore('./scripts/data/webentities.json')

traph = Traph(overwrite=True, folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)

trie = traph.lru_trie
links = traph.link_store

print trie.header
print links.header

for page in PAGES:
    traph.add_page(page)

traph.add_links(LINKS)

for source_lru, target_lru in traph.links_iter():
    print 'Source: %s, Target: %s' % (source_lru, target_lru)

}

default_webentity_creation_rule = webentity_creation_rules_regexp['domain']

webentity_creation_rules = {}

# Webentity store is necessary to keep track of web entities' prefixes.
# Though the traph could retrieve them, it would not be efficient.
# In a real situation, these would be tracked elsewhere.
# That's what we are simulating with this store.
webentity_store = WebEntityStore('./scripts/data/webentities.json')
webentity_store.data['webentities'] = {}

# Instantiate the traph
traph = Traph(overwrite=True, folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)

# Generate random pages
pages_count = 100

print '\n:: Generate %s lorem-ipsum-based pages' % (pages_count)

voc = ['lorem', 'ipsum', 'dolor', 'sit', 'amet', 'hodor', 'consectetur']
path_sizes = [1, 2, 3]

for i in range(pages_count):
    path_size = random.choice(path_sizes)
    protocol = 's:http|'
    tld = 'h:com|'
    host = 'h:%s|' % (random.choice(voc))
    path = ''
    for p in range(path_size):
        path += 'p:%s|' % (random.choice(voc))

}

default_webentity_creation_rule = webentity_creation_rules_regexp['domain']

webentity_creation_rules = {}

# Webentity store is necessary to keep track of web entities' prefixes.
# Though the traph could retrieve them, it would not be efficient.
# In a real situation, these would be tracked elsewhere.
# That's what we are simulating with this store.
webentity_store = WebEntityStore('./scripts/data/webentities.json')
webentity_store.data['webentities'] = {}

# Instantiate the traph
traph = Traph(overwrite=True, folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)

print '\n:: Setup'
print '- Create a "Twitter" webentity with the 4 prefix variations (WWW and HTTPS cases)'

twitter_prefixes = [
    's:http|h:com|h:twitter|',
    's:http|h:com|h:twitter|h:www|',
    's:https|h:com|h:twitter|',
    's:https|h:com|h:twitter|h:www|'
]

report = traph.create_webentity(twitter_prefixes)
webentity_store.data['webentities'].update(report.created_webentities)
twitter_weid = report.created_webentities.keys()[0]  # Used below

print '- Create an "Ego" webentity with ego.com (4 prefixes) as well as a Twitter account (additional 4 prefixes)'
ego_prefixes = [

from traph import Traph

traph = Traph(folder='./', debug=True)
trie = traph.lru_trie
link_store = traph.link_store

euronews_id = 342
euronews_prefixes = [
    's:https|h:com|h:euronews|h:fr|',
    's:http|h:com|h:euronews|h:fr|',
    's:http|h:com|h:euronews|h:fr|h:www|',
    's:https|h:com|h:euronews|h:fr|h:www|'
]

linked_ids = set([96, 98, 299, 315])


def links_iter(weid, prefixes):
    for prefix in prefixes:
        starting_node = trie.lru_node(prefix)
        target_node = trie.node()

        for node, lru in trie.webentity_dfs_iter(starting_node, prefix):
            if not node.is_page():
                continue

            if node.has_outlinks():
                links_block = node.outlinks()

                for link_node in link_store.link_nodes_iter(links_block):

}

default_webentity_creation_rule = webentity_creation_rules_regexp['domain']

webentity_creation_rules = {}

# Webentity store is necessary to keep track of web entities' prefixes.
# Though the traph could retrieve them, it would not be efficient.
# In a real situation, these would be tracked elsewhere.
# That's what we are simulating with this store.
webentity_store = WebEntityStore('./scripts/data/webentities.json')
webentity_store.data['webentities'] = {}

# Instantiate the traph
traph = Traph(overwrite=True, folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)

# Step 1
print '\n:: Step 1 - Create a "Boeing" webentity with the 4 prefix variations (WWW and HTTPS cases).'
print 'Expected: Creates the entity with the 4 prefixes. This is the typical use case.'

boeing_prefixes = [
    's:http|h:com|h:boeing|',
    's:http|h:com|h:boeing|h:www|',
    's:https|h:com|h:boeing|',
    's:https|h:com|h:boeing|h:www|'
]

report = traph.create_webentity(boeing_prefixes)
webentity_store.data['webentities'].update(report.created_webentities)
boeing_weid = report.created_webentities.keys()[0]  # Used for a step below

print '\nResult - Existing webentities from Store:'

# Instantiate traph with a custom rule: split after 'world' (continents)
print '\n"Continents" rule given at traph init (continents should be entities)'

webentity_creation_rules = {
    's:http|h:com|h:world|': webentity_creation_rules_regexp['path1'],
}

# Webentity store is necessary to keep track of web entities' prefixes.
# Though the traph could retrieve them, it would not be efficient.
# In a real situation, these would be tracked elsewhere.
# That's what we are simulating with this store.
webentity_store = WebEntityStore('./scripts/data/webentities.json')
webentity_store.data['webentities'] = {}

# Instantiate the traph
traph = Traph(overwrite=True, folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)

# Step 1
print '\n:: Step 1: Add the "Madrid" page'
print 'Expected: "Europe" webentity created (matching the rule given at init), "World" not created'

report = traph.add_page('s:http|h:com|h:world|p:europe|p:spain|p:madrid|')
webentity_store.data['webentities'].update(report.created_webentities)

print '\nResult - Existing webentities:'
for weid, prefixes in webentity_store.data['webentities'].items():
    print ' - Webentity %s\t%s + %s other prefixes' % (weid, prefixes[0], len(prefixes) - 1)

# Step 2

TRAPH_FOLDER = './sample-traph'
OUPUT = './youtube-inlinks.csv'

YOUTUBE_LRUS = [
    's:http|h:com|h:youtube|',
    's:https|h:com|h:youtube|',
    's:http|h:com|h:youtube|h:www|',
    's:https|h:com|h:youtube|h:www|',
    's:http|h:com|h:googleapis|h:youtube|',
    's:https|h:com|h:googleapis|h:youtube|',
    's:http|h:com|h:googleapis|h:youtube|h:www|',
    's:https|h:com|h:googleapis|h:youtube|h:www|',
    's:http|h:be|h:youtu|',
    's:https|h:be|h:youtu|',
    's:http|h:be|h:youtu|h:www|',
    's:https|h:be|h:youtu|h:www|'
]

traph = Traph(folder=TRAPH_FOLDER, debug=True)


def windup_lru(block):
    node = traph.lru_trie.node(block=block)

    lru = node.stem()
    webentity = node.webentity() if node.has_webentity() else None

    for parent in traph.lru_trie.node_parents_iter(node):
        lru = parent.stem() + lru

        if webentity is None and parent.has_webentity():
            webentity = parent.webentity()

    return lru, webentity

print 'Custom creation rules: "Hodor" is path-2 platform and "Lorem Ipsum" is path-1 platform'

webentity_creation_rules = {
    's:http|h:com|h:hodor|': webentity_creation_rules_regexp['path2'],
    's:http|h:com|h:lorem|h:ipsum|': webentity_creation_rules_regexp['path1'],
}

# Webentity store is necessary to keep track of web entities' prefixes.
# Though the traph could retrieve them, it would not be efficient.
# In a real situation, these would be tracked elsewhere.
# That's what we are simulating with this store.
webentity_store = WebEntityStore('./scripts/data/webentities.json')
webentity_store.data['webentities'] = {}

# Instantiate the traph
traph = Traph(overwrite=True, folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)


# LRU generation process
def random_lru(voc, domain_sizes, path_sizes):
    host_size = random.choice(domain_sizes)
    path_size = random.choice(path_sizes)
    protocol = 's:http|'
    tld = 'h:com|'
    host = ''
    for h in range(host_size):
        host += 'h:%s|' % (random.choice(voc))
    path = ''
    for p in range(path_size):
        path += 'p:%s|' % (random.choice(voc))

webentity_creation_rules_regexp = {
    'domain': '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|))',
    'path1': '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|)(p:[^\\|]+\\|){1})',
    'path2': '(s:[a-zA-Z]+\\|(t:[0-9]+\\|)?(h:[^\\|]+\\|(h:[^\\|]+\\|)+|h:(localhost|(\\d{1,3}\\.){3}\\d{1,3}|\\[[\\da-f]*:[\\da-f:]*\\])\\|)(p:[^\\|]+\\|){2})'
}

default_webentity_creation_rule = webentity_creation_rules_regexp['domain']

webentity_creation_rules = {
    's:http|h:com|h:twitter|': webentity_creation_rules_regexp['path1'],
}

# Creating the Traph
traph = Traph(overwrite=True, folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)

# Reading from mongo
client = MongoClient(MONGO['host'], MONGO['port'])
collection = client[MONGO['db']][MONGO['collection']]


def links_generator(data):
    source = data['lru']

    for target in data['lrulinks']:
        yield source, target


links_multimap = defaultdict(list)

}

default_webentity_creation_rule = webentity_creation_rules_regexp['domain']

webentity_creation_rules = {}

# Webentity store is necessary to keep track of web entities' prefixes.
# Though the traph could retrieve them, it would not be efficient.
# In a real situation, these would be tracked elsewhere.
# That's what we are simulating with this store.
webentity_store = WebEntityStore('./scripts/data/webentities.json')
webentity_store.data['webentities'] = {}

# Instantiate the traph
traph = Traph(overwrite=True, folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)

print '\n:: Store network...'

use_index_batch_crawl = True

if use_index_batch_crawl:
    data = {}
    for source_lru, target_lru in LINKS:
        if source_lru in data:
            links = data[source_lru]
        else:
            links = []
        links.append(target_lru)
        data[source_lru] = links

}

default_webentity_creation_rule = webentity_creation_rules_regexp['domain']

webentity_creation_rules = {}

# Webentity store is necessary to keep track of web entities' prefixes.
# Though the traph could retrieve them, it would not be efficient.
# In a real situation, these would be tracked elsewhere.
# That's what we are simulating with this store.
webentity_store = WebEntityStore('./scripts/data/webentities.json')
webentity_store.data['webentities'] = {}

# Instantiate the traph
traph = Traph(overwrite=True, folder='./scripts/data/',
              default_webentity_creation_rule=default_webentity_creation_rule,
              webentity_creation_rules=webentity_creation_rules)

print '\n:: Simulate a crawl:'
print ' - Create webentity for "s:http|h:com|h:professor|p:augustine|p:sycamore|"'

professor_prefixes = [
    's:http|h:com|h:professor|p:augustine|p:sycamore|',
    's:http|h:com|h:professor|h:www|p:augustine|p:sycamore|',
    's:https|h:com|h:professor|p:augustine|p:sycamore|',
    's:https|h:com|h:professor|h:www|p:augustine|p:sycamore|'
]

report = traph.create_webentity(professor_prefixes)
webentity_store.data['webentities'].update(report.created_webentities)

print ' - Simulate page crawls with links to the list of target pages'