def execute_page_rank(url_set, graph_index, graph_type, out_file, reverse_map=False): es_util = ElasticSearchUtility() web_graph = es_util.get_web_graph(graph_index, graph_type) page_rank_dict = page_rank(url_set, web_graph) # clear memory web_graph = None sorted_tuples = sorted(page_rank_dict.items(), key=lambda x: x[1], reverse=True)[:OUTPUT_SIZE] # clear memory page_rank_dict = None if reverse_map: print 'getting reverse url map...' url_reverse_map = Mapper.fromFile(MAPPING_FILE_NAME, reverse=True).mappings decoded_tuples = [] for t in sorted_tuples: decoded_url = url_reverse_map[t[0]] # decode url score = t[1] # score as it is decoded_tuple = (decoded_url, score) decoded_tuples.append(decoded_tuple) else: decoded_tuples = sorted_tuples print 'writing pagerank results...' write(out_file, decoded_tuples)
def execute_hits(crawl_index_name, crawl_index_type, graph_index, graph_type): es_util = ElasticSearchUtility() web_graph = es_util.get_web_graph(graph_index, graph_type) link_map = Mapper.fromFile(MAPPING_FILE_NAME).mappings hubs, authorities = hits(crawl_index_name, crawl_index_type, web_graph, QUERY_STRING, link_map) # clear memory web_graph = None link_map = None print 'sorting hubs...' sorted_hubs = sorted(hubs.items(), key=lambda x: x[1], reverse=True)[:OUTPUT_SIZE] # clear memory hubs = None print 'sorting authorities...' sorted_auth = sorted(authorities.items(), key=lambda x: x[1], reverse=True)[:OUTPUT_SIZE] # clear memory authorities = None print 'getting reverse url map...' url_reverse_map = Mapper.fromFile(MAPPING_FILE_NAME, reverse=True).mappings sorted_hubs_decoded = [] for t in sorted_hubs: decoded_url = url_reverse_map[t[0]] # decode url score = t[1] # score as it is decoded_tuple = (decoded_url, score) sorted_hubs_decoded.append(decoded_tuple) sorted_auth_decoded = [] for t in sorted_auth: decoded_url = url_reverse_map[t[0]] # decode url score = t[1] # score as it is decoded_tuple = (decoded_url, score) sorted_auth_decoded.append(decoded_tuple) print 'writing hubs...' write(HUBS_PATH, sorted_hubs_decoded) print 'writing authorities...' write(AUTH_PATH, sorted_auth_decoded)
def create_encoded_graph():
    """Build the encoded web-graph index from the crawled links index.

    NOTE(review): earlier one-off pipeline steps (building the links map and
    an intermediate encoded-links index) were present here as commented-out
    code and have been removed; only the web-graph construction still runs.
    """
    es_util = ElasticSearchUtility()
    es_util.create_index(WEB_GRAPH_INDEX, CREATE_WEB_GRAPH)
    # url -> encoded id mapping produced by an earlier pipeline run
    link_map = Mapper.fromFile(MAPPING_FILE_NAME).mappings
    es_util.encoded_links_to_web_graph(LINKS_INDEX, LINKS_TYPE, WEB_GRAPH_INDEX, WEB_GRAPH_TYPE, link_map)
def get_all_ids(self, index_name, index_type): """ Returns all ids of given index :param index_name: Name of the index :param index_type: Type of the index :return: List of ids of entire index """ # query scroll id_list = [] link_map = Mapper.fromFile(MAPPING_FILE_NAME).mappings scroll = self.es.search(index=index_name, doc_type=index_type, scroll='10m', size=10000, fields=['_id'], body={"query": { "match_all": {} }}) scroll_size = scroll['hits']['total'] size = 0 # retrieve results while scroll_size > 0: # scrolled data is in scroll['hits']['hits'] hits_list = scroll['hits']['hits'] for hit in hits_list: url = hit['_id'] encoded_id = link_map[iri_to_uri(url)] id_list.append(encoded_id) # update scroll size scroll_size = len(scroll['hits']['hits']) size += scroll_size print "scrolled %s \n" % size # prepare next scroll scroll_id = scroll['_scroll_id'] # perform next scroll scroll = self.es.scroll(scroll_id=scroll_id, scroll='10m') return id_list
def __bootstrap(self, index_dir='/Users/admin/Documents/CS6200/HW2/Index/Indices/'):
    """Load the mapper, catalog and inverted-index reader for this index.

    :param index_dir: directory containing the on-disk index files; defaults
                      to the previously hard-coded location so existing
                      callers are unaffected.  TODO(review): this
                      machine-specific path should come from configuration.
    """
    self.mapper = Mapper.fromFile(self.index)
    self.catalog = Catalog.fromFile(self.index)
    self.reader = InvertedIndexReader(self.catalog, index_dir)
{ 'id': (outlinks, inlinks) }
        '''
        # re-key each ES hit by its document id, pairing its out- and in-links
        transformed_hits = {}
        for hit in hits:
            ID = hit['_id']
            outlinks = hit['_source']['outlinks']
            inlinks = hit['_source']['inlinks']
            transformed_hits[ID] = (outlinks, inlinks)
        return transformed_hits

    def __merge_dicts(self, dict1, dict2):
        # Return a new dict combining both inputs; on duplicate keys the
        # value from dict2 wins.  Neither input dict is modified.
        merged = dict1.copy()
        merged.update(dict2)
        return merged


if __name__ == '__main__':
    # Dump every crawled page (resolved through the link map) to a flat file.
    client = ESClient()
    mapper = Mapper.fromFile('link_map')
    pages = client.getCrawledPages(mapper)
    print "Writing to file"
    # NOTE(review): output path is machine-specific — consider making it configurable
    with open('/Users/admin/Documents/CS6200/HW4/pages', 'w') as f:
        for page in pages:
            f.write(page + '\n')
    print "DONE!"