def prepare_matches(chunk_fname, keys_out_fname, values_out_fname, pid=""):
    keys_out = open(keys_out_fname, "wb", buffering=1000000)
    values_out = open(values_out_fname, "wb", buffering=1000000)
    add_values_out = open(values_out_fname + ".add", "w", buffering=1000000)
    # Read the chunk file: each line is "<token>\t<code> <code> ...".
    all_codes = []
    tokens = {}
    tokens_freqs = {}
    for line in open(chunk_fname):
        token, token_codes = line.strip().split("\t")
        token_codes = [int(code) for code in token_codes.split()]
        tokens.setdefault(token, []).append(len(all_codes))
        all_codes.append(token_codes)
        tokens_freqs.setdefault(token, 0)
        tokens_freqs[token] += len(token_codes)
    progress_counter = TCustomCounter("Reducer%s" % str(pid), sys.stdout, verbosity=1, interval=10000)
    for token, chunks in tokens.items():
        token_freq = tokens_freqs[token]
        # Skip tokens that are too rare or too frequent to be worth indexing.
        if token_freq < MIN_WORD_FREQ_FOR_INDEX or token_freq > MAX_WORD_FREQ_FOR_INDEX:
            continue
        # Merge all code lists collected for this token and free them as we go.
        word_codes = []
        for chunk_index in chunks:
            word_codes += all_codes[chunk_index]
            all_codes[chunk_index] = []
        word_codes.sort()
        # Append the sorted codes as packed uint64 values and record where they start,
        # so the key entry (token, freq, offset) is enough to locate them later.
        start_position = values_out.tell()
        for code in word_codes:
            values_out.write(pack("Q", code))
        pickle.dump((token, token_freq, start_position), keys_out)
        progress_counter.add()
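# A minimal reader sketch for the index files produced by prepare_matches, assuming
# the layout written above: the keys file holds pickled (token, freq, start_position)
# tuples, and the values file holds freq packed "Q" (8-byte) codes per token.
# load_matches is an illustrative helper name, not part of the original code.
import pickle
from struct import unpack

def load_matches(keys_fname, values_fname):
    matches = {}
    keys_in = open(keys_fname, "rb")
    values_in = open(values_fname, "rb")
    while True:
        try:
            token, token_freq, start_position = pickle.load(keys_in)
        except EOFError:
            break
        values_in.seek(start_position)
        raw = values_in.read(8 * token_freq)
        # One unpack call decodes all freq codes for this token.
        matches[token] = list(unpack("%dQ" % token_freq, raw))
    return matches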
def crawl_folder(self, folder):
    # Note: the unqualified crawl_folder() call below resolves to the module-level
    # helper of the same name, not to this method.
    object_folders = crawl_folder(folder)
    import sys
    processed_counter = TCustomCounter("Crawler, found objects", sys.stderr, self.verbosity, 100)
    for object_folder, object_id in object_folders:
        fields2update = self.crawl_object_fields(object_folder, object_id)
        object2update = TIndexingObjectData(object_id=object_id, object_fields=fields2update)
        yield object2update
        processed_counter.add()
def crawl_csv(self, csv_file_path):
    # Mapping of CSV column index -> field name; columns 3-5 are handled separately
    # below as the library-section hierarchy.
    field_index2name = {1: "year",
                        2: "udc",
                        #3: "class_level1",
                        #4: "class_level2",
                        #5: "class_level3",
                        6: "pages_count",
                        7: "author",
                        8: "title"}
    hierarchy_indices = [3, 4, 5]
    import sys
    import hashlib
    processed_counter = TCustomCounter("Crawler, found objects", sys.stderr, self.verbosity, 1000)
    # Guess the file encoding from its raw bytes before decoding each line.
    encoding = chardet.detect(open(csv_file_path).read())['encoding']
    all_hierarchy_codes = {}
    for line in open(csv_file_path):
        line = line.decode(encoding)
        field_values = line.strip().split(";")
        object_id = field_values[0]
        fields = []
        for field_index, field_id in field_index2name.items():
            if len(field_values) > field_index:
                field_value_encoded = field_values[field_index].encode(DEFAULT_ENCODING)
                fields.append(TIndexingObjectField(field_id, field_value=field_value_encoded, field_file_path=""))
        # Library section feature: hash the section path level by level, so codes of
        # nested sections share their leading elements.
        hierarchy_codes = []
        section_hash = hashlib.md5()
        path = ""
        for hierarchy_feat_index in hierarchy_indices:
            # Guard against short rows, consistent with the length check above.
            if len(field_values) <= hierarchy_feat_index:
                break
            node_name = field_values[hierarchy_feat_index].strip()
            if not node_name:
                break
            section_hash.update(node_name.encode("utf8"))
            code = int(section_hash.hexdigest(), 16) % 1000000007
            path += node_name + ";"
            hierarchy_codes.append(code)
            if code not in all_hierarchy_codes:
                all_hierarchy_codes[code] = path
            elif all_hierarchy_codes[code] != path:
                # Two different section paths mapped to the same code: abort.
                print "Hash collision:", path.encode("utf8"), "vs.", all_hierarchy_codes[code].encode("utf8")
                print "FULL STOP"
                sys.exit(1)
        fields.append(TIndexingObjectField(field_id=LIB_SECTION_FIELD, field_value=hierarchy_codes, field_file_path=""))
        object2update = TIndexingObjectData(object_id=object_id, object_fields=fields)
        yield object2update
        processed_counter.add()
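# Standalone sketch of the prefix hashing used for library-section codes, assuming
# the same scheme as crawl_csv above: each node's code is the MD5 of the section
# path accumulated so far, reduced modulo the prime 1000000007, so nested sections
# share their leading codes. hierarchy_to_codes is an illustrative name only.
import hashlib

def hierarchy_to_codes(node_names):
    codes = []
    digest = hashlib.md5()
    for node_name in node_names:
        digest.update(node_name.encode("utf8"))
        codes.append(int(digest.hexdigest(), 16) % 1000000007)
    return codes

# Example: two sections under the same parent share the first code, which lets a
# query on the parent section match objects from either child, e.g.
# hierarchy_to_codes([u"Science", u"Physics"])[0] == hierarchy_to_codes([u"Science", u"Chemistry"])[0]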