import csv
import io
import json
import multiprocessing
import os
import pickle
import sys
from collections import Counter

import numpy
from dawg import RecordDAWG

# Project-local names used below (Report, Huffman, BitArray,
# posting_list_separator, symbol_to_encoding_list, process_comments_file,
# create_list_from_csv, binary_read_line_generator_path) are expected to be
# provided by sibling modules of this package.


class IndexCreator:
    def __init__(self, directory):
        self.directory = directory
        assert os.path.isfile(f'{self.directory}/comments.csv')
        sys.setrecursionlimit(10000)
        self.report = Report(quiet_mode=False)

    def create_index(self):
        # read csv to create comment_list
        with self.report.measure('processing comments.csv'):
            number_of_processes = min(os.cpu_count(), 2)
            print(f'starting {number_of_processes} processes')
            csv_size = os.stat(f'{self.directory}/comments.csv').st_size
            with multiprocessing.Pool(processes=number_of_processes) as pool:
                # split comments.csv into roughly equal chunks that start at
                # line boundaries
                offsets = [0]
                with open(f'{self.directory}/comments.csv', mode='rb') as f:
                    for i in range(1, number_of_processes + 1):
                        f.seek(int(i * csv_size / number_of_processes))
                        f.readline()
                        next_offset = f.tell()
                        if next_offset != offsets[-1]:
                            offsets.append(next_offset)

                def on_error(exception):
                    raise exception

                for start_offset, end_offset in zip(offsets, offsets[1:]):
                    pool.apply_async(
                        process_comments_file,
                        args=(self.directory, start_offset, end_offset),
                        error_callback=on_error)
                pool.close()
                pool.join()

            # collect the partial results written by the worker processes
            self.partial_index_names = []
            reply_to_index = {}
            cid_to_offset = {}
            for end_offset in offsets[1:]:
                file_number_path = \
                    f'{self.directory}/{end_offset}_file_number.pickle'
                with open(file_number_path, mode='rb') as f:
                    file_number = pickle.load(f)
                for i in range(file_number):
                    self.partial_index_names.append(
                        f'{self.directory}/{end_offset}_{i}')
                os.remove(file_number_path)

                reply_to_index_part_path = f'{self.directory}/' \
                    f'{end_offset}_reply_to_index.pickle'
                with open(reply_to_index_part_path, mode='rb') as f:
                    reply_to_index_part = pickle.load(f)
                for key, value in reply_to_index_part.items():
                    if key not in reply_to_index.keys():
                        reply_to_index[key] = value
                    else:
                        reply_to_index[key].extend(value)
                os.remove(reply_to_index_part_path)

                cid_to_offset_part_path = f'{self.directory}/' \
                    f'{end_offset}_cid_to_offset.pickle'
                with open(cid_to_offset_part_path, mode='rb') as f:
                    cid_to_offset_part = pickle.load(f)
                cid_to_offset.update(cid_to_offset_part)
                os.remove(cid_to_offset_part_path)

            with open(f'{self.directory}/reply_to_index.pickle',
                      mode='wb') as f:
                pickle.dump(reply_to_index, f, pickle.HIGHEST_PROTOCOL)

            cids = []
            comment_offsets_cid = []
            for key in sorted(cid_to_offset.keys()):
                cids.append(numpy.int64(key))
                comment_offsets_cid.append(numpy.int64(cid_to_offset[key]))
            numpy.save(f'{self.directory}/cids.npy', numpy.array(cids))
            numpy.save(f'{self.directory}/comment_offsets_cid.npy',
                       numpy.array(comment_offsets_cid))

        # merge indices
        with self.report.measure('merging index'):
            # comment term counts
            self.comment_term_count_dict = {}
            for file_prefix in self.partial_index_names:
                file_path = file_prefix + '_comment_term_count_dict.pickle'
                with open(file_path, mode='rb') as f:
                    self.comment_term_count_dict.update(pickle.load(f))
                os.remove(file_path)

            comment_offsets = []
            comment_term_counts = []
            for key in sorted(self.comment_term_count_dict.keys()):
                comment_offsets.append(numpy.int64(key))
                comment_term_counts.append(
                    numpy.int32(self.comment_term_count_dict[key]))
            numpy.save(f'{self.directory}/comment_offsets.npy',
                       numpy.array(comment_offsets))
            numpy.save(f'{self.directory}/comment_term_counts.npy',
                       numpy.array(comment_term_counts))

            # collection term count
            self.collection_term_count = 0
            for file_prefix in self.partial_index_names:
                file_path = file_prefix + '_collection_term_count.pickle'
                with open(file_path, mode='rb') as f:
                    self.collection_term_count += pickle.load(f)
                os.remove(file_path)
            with open(f'{self.directory}/collection_term_count.pickle',
                      mode='wb') as f:
                pickle.dump(self.collection_term_count, f,
                            pickle.HIGHEST_PROTOCOL)
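
            # The partial indexes written by process_comments_file are merged
            # below with a k-way merge: one line is read from every partial
            # _index.csv, the lexicographically smallest term is written to
            # the final index.csv (posting lists concatenated, meta counts
            # summed), and only the files that contributed that term advance
            # to their next line.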
            # index
            index_files = []
            for file_prefix in self.partial_index_names:
                file_path = file_prefix + '_index.csv'
                index_files.append(open(file_path, mode='rb'))

            current_terms = []
            current_meta = []
            current_posting_lists = []
            global_active_indices = []
            global_active_file_count = 0
            for file in index_files:
                line = file.readline().decode('utf-8').rstrip('\n').split(
                    posting_list_separator, 2)
                current_terms.append(line[0])
                current_meta.append(int(line[1]))
                current_posting_lists.append(line[2])
                global_active_indices.append(True)
                global_active_file_count += 1

            current_active_indices = []
            current_min_term = None
            self.seek_list = []
            current_offset = 0
            terms_done = 0
            with open(f'{self.directory}/index.csv', mode='wb') as f:
                while global_active_file_count > 0:
                    # find next term to write
                    for key, term in enumerate(current_terms):
                        if not global_active_indices[key]:
                            continue
                        if current_min_term is None \
                                or term < current_min_term:
                            current_active_indices = [key]
                            current_min_term = term
                        elif term == current_min_term:
                            current_active_indices.append(key)

                    # merge all lines containing term
                    if len(current_min_term) <= 128:
                        meta = 0
                        for key in current_active_indices:
                            meta += current_meta[key]
                        line_string = \
                            f'{current_min_term}{posting_list_separator}{meta}'
                        for key in current_active_indices:
                            line_string += f'{posting_list_separator}' \
                                f'{current_posting_lists[key]}'
                        line_string += '\n'
                        line_raw = line_string.encode()
                        f.write(line_raw)

                        # strip CSV quoting from the term for the seek list
                        term = current_min_term[1:-1].replace('""', '"')
                        self.seek_list.append((term, [current_offset]))
                        current_offset += len(line_raw)

                    # reload lines where necessary
                    for key in current_active_indices:
                        linetest = index_files[key].readline().decode('utf-8')
                        if linetest == '':
                            # end of file
                            global_active_indices[key] = False
                            global_active_file_count -= 1
                            print('one file out, '
                                  f'{global_active_file_count} remaining')
                        else:
                            line = linetest.rstrip('\n').split(
                                posting_list_separator, 2)
                            current_terms[key] = line[0]
                            current_meta[key] = int(line[1])
                            current_posting_lists[key] = line[2]
                    current_min_term = None
                    current_active_indices = []
                    terms_done += 1
                    if terms_done % 100000 == 0:
                        print(f'Merged {terms_done} terms.')

            self.seek_list = RecordDAWG('>Q', self.seek_list)
            self.seek_list.save(f'{self.directory}/seek_list.dawg')

            for f in index_files:
                f.close()
            for file_prefix in self.partial_index_names:
                file_path = file_prefix + '_index.csv'
                os.remove(file_path)

        self.huffman_compression(generate_encoding=False)

        with self.report.measure('processing authors & articles'):
            with open(f'{self.directory}/authors_list.pickle',
                      mode='wb') as f:
                pickle.dump(
                    create_list_from_csv(f'{self.directory}/authors.csv'),
                    f, pickle.HIGHEST_PROTOCOL)
            with open(f'{self.directory}/articles_list.pickle',
                      mode='wb') as f:
                pickle.dump(
                    create_list_from_csv(f'{self.directory}/articles.csv'),
                    f, pickle.HIGHEST_PROTOCOL)

    def huffman_compression(self, generate_encoding=False):
        # compress using Huffman encoding
        symbol_to_encoding_dict = {}
        # count all occurring UTF-8 characters
        if generate_encoding:
            symbol_to_frequency_dict = Counter()
            with self.report.measure('counting utf8 characters'):
                with open(f'{self.directory}/index.csv') as index_file:
                    chunk_size = 100000

                    def next_chunk_generator():
                        chunk = index_file.read(chunk_size)
                        while chunk:
                            yield chunk
                            chunk = index_file.read(chunk_size)

                    for i, chunk in enumerate(next_chunk_generator(), 1):
                        symbol_to_frequency_dict.update(Counter(chunk))
                        self.report.progress(
                            i, f' chunks counted ({chunk_size} characters '
                            'each)', 100)
            if '\n' in symbol_to_frequency_dict.keys():
                del symbol_to_frequency_dict['\n']
            # derive huffman encoding from character counts
            with self.report.measure('deriving huffman encoding'):
                symbol_to_encoding_dict = Huffman.derive_encoding(
                    symbol_to_frequency_dict)
                for key, value in symbol_to_encoding_dict.items():
                    assert len(key) == 1
                    symbol_to_encoding_list[ord(key[0])] = value
            with open(f'{self.directory}/symbol_to_encoding_dict.pickle',
                      mode='wb') as f:
                pickle.dump(symbol_to_encoding_dict, f,
                            pickle.HIGHEST_PROTOCOL)
        else:
            # optimal encoding for guardian
            # character distribution should be similar for all datasets
            symbol_to_encoding_dict = {
                '\a': BitArray('1111'),
                ',': BitArray('001'),
                '0': BitArray('1000'),
                '1': BitArray('011'),
                '2': BitArray('010'),
                '3': BitArray('000'),
                '4': BitArray('1110'),
                '5': BitArray('1101'),
                '6': BitArray('1100'),
                '7': BitArray('1011'),
                '8': BitArray('1010'),
                '9': BitArray('1001')
            }
            with open(f'{self.directory}/symbol_to_encoding_dict.pickle',
                      mode='wb') as f:
                pickle.dump(symbol_to_encoding_dict, f,
                            pickle.HIGHEST_PROTOCOL)

        # save compressed index and corresponding seek_list
        with self.report.measure('saving compressed files'):
            self.compressed_seek_list = []
            with open(f'{self.directory}/compressed_index', mode='wb') \
                    as compressed_index_file:
                offset = 0
                for i, orig_line in enumerate(
                        binary_read_line_generator_path(
                            f'{self.directory}/index.csv'), 1):
                    term = next(
                        csv.reader(io.StringIO(orig_line),
                                   delimiter=posting_list_separator))[0]
                    line_without_term = orig_line[len(term) + 3:]
                    encoded_line = Huffman.encode(line_without_term,
                                                  symbol_to_encoding_dict)
                    compressed_index_file.write(encoded_line)
                    self.compressed_seek_list.append(
                        (term, (offset, len(encoded_line))))
                    self.report.progress(i, ' index lines compressed', 100000)
                    offset += len(encoded_line)

            self.compressed_seek_list = \
                RecordDAWG('>QQ', self.compressed_seek_list)
            self.compressed_seek_list.save(
                f'{self.directory}/compressed_seek_list.dawg')
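

# Usage sketch (illustrative, not part of the original module): assuming the
# directory below (a hypothetical path) contains comments.csv, authors.csv
# and articles.csv, the whole index is built with a single call.
if __name__ == '__main__':
    index_creator = IndexCreator('data/guardian')  # hypothetical data path
    index_creator.create_index()
    # afterwards the directory holds index.csv, seek_list.dawg,
    # compressed_index, compressed_seek_list.dawg and the *.npy / *.pickle
    # side files written above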


# The snippet below builds the morphological lookup DAWG. The names norm,
# stem, ps, forms, prefixes, suffixes, tags, split, get_index and the ALL_*
# containers come from the enclosing scope (not shown here); the per-form
# block presumably runs once per dictionary entry, while the DAWG/JSON dump
# at the end runs once after all entries have been processed.
all_forms = [norm] + forms
for form in all_forms:
    pr, sf = split(form, stem)
    prefixes.append(get_index(pr, ALL_PREFIXES))
    suffixes.append(get_index(sf, ALL_SUFFIXES))
    tags.append(get_index(ps, ALL_TAGS))

# scheme = array.array('H', prefixes + suffixes + tags)
scheme = prefixes + suffixes + tags
if scheme not in ALL_SCHEMES:
    ALL_SCHEMES.append(scheme)
scheme_id = ALL_SCHEMES.index(scheme)

for i, form in enumerate(all_forms):
    ALL_MAP.append((form, (scheme_id, i)))

record_dawg = RecordDAWG('>II', ALL_MAP)
record_dawg.save('words.dawg')

with open('ALL_PREFIXES.json', 'w', encoding='utf-8') as fp:
    json.dump(ALL_PREFIXES, fp, ensure_ascii=False)
with open('ALL_SUFFIXES.json', 'w', encoding='utf-8') as fp:
    json.dump(ALL_SUFFIXES, fp, ensure_ascii=False)
with open('ALL_TAGS.json', 'w', encoding='utf-8') as fp:
    json.dump(ALL_TAGS, fp, ensure_ascii=False)
with open('ALL_SCHEMES.json', 'w', encoding='utf-8') as fp:
    json.dump(ALL_SCHEMES, fp, ensure_ascii=False)
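
# Loading sketch (illustrative, not part of the original source): the saved
# DAWG and JSON files can be read back to map a surface form to its scheme.
# 'word' below is a hypothetical lookup key.
lookup = RecordDAWG('>II')
lookup.load('words.dawg')
with open('ALL_SCHEMES.json', encoding='utf-8') as fp:
    schemes = json.load(fp)

if 'word' in lookup:
    for scheme_id, position in lookup['word']:
        # each stored value is a (scheme_id, form_position) pair,
        # matching what was appended to ALL_MAP above
        print(schemes[scheme_id], position)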