def run(self):
    """Index the corpus batch by batch, then reduce the catalogs to one.

    Steps, in order:

    1. Fetch the document paths with ``get_docs(CORPUS_PATH)`` and split
       them into chunks with ``self.__chunk``. For every chunk, read each
       file, concatenate the contents, parse the result and pass it to
       ``self.__run_batch`` together with a 1-based batch number and the
       shared ``Mapper``.
    2. Write the mapper (``mapper.write(self.name)``) and the document
       lengths (``self.__writeDocLengths()``).
    3. While more than one catalog remains, ask ``self.chunk_catalog()``
       for groups of catalog keys and hand every two-element group to
       ``self.__merge``. A group whose length is not 2 ends the current
       pass.
    4. Persist the catalogs with ``Catalog.write(self.catalogs, self.name)``.

    Progress is reported on stdout. The single-argument ``print(...)`` form
    is used so the output is the same under the Python 2 print statement.
    """
    docs = get_docs(CORPUS_PATH)
    mapper = Mapper()

    for batch_no, files in enumerate(self.__chunk(docs), 1):
        contents = []
        for path in files:
            with open(path, 'r') as handle:
                contents.append(handle.read())
        self.__run_batch(parse(''.join(contents)), batch_no, mapper)

    print("Writing the mapper to file -------------------------------------")
    mapper.write(self.name)

    print("Writing DocLengths to file --------------------------------------")
    self.__writeDocLengths()

    # `> 1` instead of `!= 1`: when no catalog was produced at all there is
    # nothing to merge, and `!= 1` would spin here forever.
    while len(self.catalogs) > 1:
        # Compute the grouping once per pass and reuse it for both the
        # progress output and the merge loop.
        pairs = self.chunk_catalog()
        print(pairs)
        for pair in pairs:
            print(pair)
            if len(pair) != 2:
                # Leftover group without a partner: stop this pass and let
                # the outer loop regroup.
                break
            first = self.catalogs[pair[0]]
            second = self.catalogs[pair[1]]
            self.__merge(first, second, pair)

    print("Writing the catalog to file for later use -----------------------")
    Catalog.write(self.catalogs, self.name)
def create_links_map(self, links_index, links_type):
    """Map every link endpoint stored in an index and write the mapping.

    Runs a ``match_all`` scroll search (pages of 10000 hits, 10-minute
    keep-alive) over ``links_type`` in ``links_index``. For each hit, the
    ``SRC_LINK`` and ``DST_LINK`` fields of ``_source`` are passed to
    ``Mapper.map``. Once all pages are consumed, the mapper is written to
    ``MAPPINGS_PATH`` and the scroll context is released.

    Args:
        links_index: name of the index to scan.
        links_type: document type to scan within that index.
    """
    mapper = Mapper()

    page = self.es.search(
        index=links_index,
        doc_type=links_type,
        scroll='10m',
        size=10000,
        body={"query": {"match_all": {}}},
    )

    scrolled = 0
    hits = page['hits']['hits']
    # Loop on the hits actually returned rather than on `hits.total`: an
    # empty page is the end-of-scroll signal, so no extra request is issued
    # after the last page and the shape of `total` does not matter.
    while hits:
        for hit in hits:
            src_link = hit['_source']['SRC_LINK']
            dst_link = hit['_source']['DST_LINK']
            mapper.map(src_link)
            mapper.map(dst_link)

        scrolled += len(hits)
        print("scrolled %s \n" % scrolled)

        # The scroll id may change between pages; always use the latest one.
        page = self.es.scroll(scroll_id=page['_scroll_id'], scroll='10m')
        hits = page['hits']['hits']

    # Save the result before any cleanup so a cleanup failure cannot lose it.
    mapper.write(MAPPINGS_PATH)

    # Free the server-side scroll context now instead of letting it linger
    # until the 10-minute keep-alive expires.
    self.es.clear_scroll(scroll_id=page['_scroll_id'])