class IndexCreator:
    def __init__(self, directory):
        self.directory = directory
        assert os.path.isfile(f'{self.directory}/comments.csv')
        sys.setrecursionlimit(10000)
        self.report = Report(quiet_mode=False)

    def create_index(self):
        # read csv to create comment_list

        with self.report.measure('processing comments.csv'):
            number_of_processes = min(os.cpu_count(), 2)
            print(f'starting {number_of_processes} processes')
            csv_size = os.stat(f'{self.directory}/comments.csv').st_size
            with multiprocessing.Pool(processes=number_of_processes) as pool:
                offsets = [0]
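                # split comments.csv into roughly equal byte ranges, one per
                # worker, advancing each split point to the next line break
                # so no worker starts in the middle of a record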
                with open(f'{self.directory}/comments.csv', mode='rb') as f:
                    for i in range(1, number_of_processes + 1):
                        f.seek(int(i * csv_size / number_of_processes))
                        f.readline()
                        next_offset = f.tell()
                        if next_offset != offsets[-1]:
                            offsets.append(next_offset)

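                # surface exceptions from worker processes instead of
                # silently dropping them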
                def on_error(exception):
                    raise exception

                for start_offset, end_offset in zip(offsets, offsets[1:]):
                    pool.apply_async(process_comments_file,
                                     args=(self.directory, start_offset,
                                           end_offset),
                                     error_callback=on_error)
                pool.close()
                pool.join()

            self.partial_index_names = []
            reply_to_index = {}
            cid_to_offset = {}
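            # collect the per-chunk results written by the worker processes:
            # the number of partial index files, the reply_to index parts and
            # the cid -> offset mapping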
            for end_offset in offsets[1:]:
                file_number_path = \
                    f'{self.directory}/{end_offset}_file_number.pickle'
                with open(file_number_path, mode='rb') as f:
                    file_number = pickle.load(f)
                    for i in range(file_number):
                        self.partial_index_names.append(
                            f'{self.directory}/{end_offset}_{i}')
                os.remove(file_number_path)

                reply_to_index_part_path = f'{self.directory}/' \
                    f'{end_offset}_reply_to_index.pickle'
                with open(reply_to_index_part_path, mode='rb') as f:
                    reply_to_index_part = pickle.load(f)
                    for key, value in reply_to_index_part.items():
                        if key not in reply_to_index:
                            reply_to_index[key] = value
                        else:
                            reply_to_index[key].extend(value)
                os.remove(reply_to_index_part_path)

                cid_to_offset_part_path = f'{self.directory}/' \
                    f'{end_offset}_cid_to_offset.pickle'
                with open(cid_to_offset_part_path, mode='rb') as f:
                    cid_to_offset_part = pickle.load(f)
                    cid_to_offset.update(cid_to_offset_part)
                os.remove(cid_to_offset_part_path)

            with open(f'{self.directory}/reply_to_index.pickle',
                      mode='wb') as f:
                pickle.dump(reply_to_index, f, pickle.HIGHEST_PROTOCOL)

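            # store the cid -> byte offset mapping as two parallel numpy
            # arrays sorted by cid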
            cids = []
            cid_offsets = []
            for key in sorted(cid_to_offset.keys()):
                cids.append(numpy.int64(key))
                cid_offsets.append(numpy.int64(cid_to_offset[key]))
            numpy.save(f'{self.directory}/cids.npy', numpy.array(cids))
            numpy.save(f'{self.directory}/comment_offsets_cid.npy',
                       numpy.array(cid_offsets))

        # merge indices
        with self.report.measure('merging index'):
            # comment term counts
            self.comment_term_count_dict = {}
            for file_prefix in self.partial_index_names:
                file_path = file_prefix + '_comment_term_count_dict.pickle'
                with open(file_path, mode='rb') as f:
                    self.comment_term_count_dict.update(pickle.load(f))
                os.remove(file_path)
            comment_offsets = []
            term_counts = []
            for key in sorted(self.comment_term_count_dict.keys()):
                comment_offsets.append(numpy.int64(key))
                term_counts.append(
                    numpy.int32(self.comment_term_count_dict[key]))
            numpy.save(f'{self.directory}/comment_offsets.npy',
                       numpy.array(comment_offsets))
            numpy.save(f'{self.directory}/comment_term_counts.npy',
                       numpy.array(term_counts))

            # collection term count
            self.collection_term_count = 0
            for file_prefix in self.partial_index_names:
                file_path = file_prefix + '_collection_term_count.pickle'
                with open(file_path, mode='rb') as f:
                    self.collection_term_count += pickle.load(f)
                os.remove(file_path)

            with open(f'{self.directory}/collection_term_count.pickle',
                      mode='wb') as f:
                pickle.dump(self.collection_term_count, f,
                            pickle.HIGHEST_PROTOCOL)

            # index
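            # k-way merge of the partial index files; the merge assumes every
            # partial file is sorted by term: keep one current line per file
            # and repeatedly write out the smallest term, concatenating its
            # posting lists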
            index_files = []
            for file_prefix in self.partial_index_names:
                file_path = file_prefix + '_index.csv'
                index_files.append(open(file_path, mode='rb'))

            current_terms = []
            current_meta = []
            current_posting_lists = []
            global_active_indices = []
            global_active_file_count = 0

            for file in index_files:
                line = file.readline().decode('utf-8').rstrip('\n').split(
                    posting_list_separator, 2)
                current_terms.append(line[0])
                current_meta.append(int(line[1]))
                current_posting_lists.append(line[2])
                global_active_indices.append(True)
                global_active_file_count += 1

            current_active_indices = []
            current_min_term = None
            self.seek_list = []
            current_offset = 0
            terms_done = 0

            with open(f'{self.directory}/index.csv', mode='wb') as f:
                while global_active_file_count > 0:
                    # find next term to write
                    for key, term in enumerate(current_terms):
                        if not global_active_indices[key]:
                            continue
                        if current_min_term is None or term < current_min_term:
                            current_active_indices = [key]
                            current_min_term = term
                        elif term == current_min_term:
                            current_active_indices.append(key)

                    # merge all lines containing term

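                    # terms longer than 128 characters (including the CSV
                    # quotes) are dropped from the merged index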
                    if len(current_min_term) <= 128:
                        meta = 0
                        for key in current_active_indices:
                            meta += current_meta[key]

                        line_string = \
                            f'{current_min_term}{posting_list_separator}{meta}'
                        for key in current_active_indices:
                            line_string += f'{posting_list_separator}' \
                                f'{current_posting_lists[key]}'

                        line_string += '\n'
                        line_raw = line_string.encode()
                        f.write(line_raw)
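                        # strip the surrounding CSV quotes and unescape
                        # doubled quotes before storing the term with its
                        # byte offset in index.csv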
                        term = current_min_term[1:-1].replace('""', '"')
                        self.seek_list.append((term, [current_offset]))
                        current_offset += len(line_raw)

                    # reload lines where necessary
                    for key in current_active_indices:
                        next_line = index_files[key].readline().decode('utf-8')
                        if next_line == '':
                            # end of file
                            global_active_indices[key] = False
                            global_active_file_count -= 1
                            print('one file out, '
                                  f'{global_active_file_count} remaining')
                        else:
                            line = next_line.rstrip('\n').split(
                                posting_list_separator, 2)
                            current_terms[key] = line[0]
                            current_meta[key] = int(line[1])
                            current_posting_lists[key] = line[2]

                    current_min_term = None
                    current_active_indices = []
                    terms_done += 1
                    if terms_done % 100000 == 0:
                        print(f'Merged {terms_done} terms.')

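            # store the term -> offset seek list as a DAWG for a compact,
            # memory-efficient lookup structure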
            self.seek_list = RecordDAWG('>Q', self.seek_list)
            self.seek_list.save(f'{self.directory}/seek_list.dawg')

            for f in index_files:
                f.close()

            for file_prefix in self.partial_index_names:
                file_path = file_prefix + '_index.csv'
                os.remove(file_path)

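        # compress the merged index with the precomputed Huffman table
        # (generate_encoding=False skips re-deriving it from character
        # counts)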
        self.huffman_compression(generate_encoding=False)

        with self.report.measure('processing authors & articles'):
            with open(f'{self.directory}/authors_list.pickle', mode='wb') as f:
                pickle.dump(
                    create_list_from_csv(f'{self.directory}/authors.csv'), f,
                    pickle.HIGHEST_PROTOCOL)

            with open(f'{self.directory}/articles_list.pickle', mode='wb') \
                    as f:
                pickle.dump(
                    create_list_from_csv(f'{self.directory}/articles.csv'), f,
                    pickle.HIGHEST_PROTOCOL)

    def huffman_compression(self, generate_encoding=False):
        # compress using Huffman encoding
        symbol_to_encoding_dict = {}

        # count all occurring UTF-8 characters
        if generate_encoding:
            symbol_to_frequency_dict = Counter()
            with self.report.measure('counting utf8 characters'):
                with open(f'{self.directory}/index.csv') as index_file:
                    chunk_size = 100000

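                    # read the index in fixed-size chunks so the character
                    # frequencies can be counted without loading the whole
                    # file into memory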
                    def next_chunk_generator():
                        chunk = index_file.read(chunk_size)
                        while chunk:
                            yield chunk
                            chunk = index_file.read(chunk_size)

                    for i, chunk in enumerate(next_chunk_generator(), 1):
                        symbol_to_frequency_dict.update(Counter(chunk))
                        self.report.progress(
                            i, f' chunks counted ({chunk_size} characters '
                            'each)', 100)
                if '\n' in symbol_to_frequency_dict:
                    del symbol_to_frequency_dict['\n']

            # derive huffman encoding from character counts
            with self.report.measure('deriving huffman encoding'):
                symbol_to_encoding_dict = Huffman.derive_encoding(
                    symbol_to_frequency_dict)
            # every Huffman symbol must be a single character
            for key in symbol_to_encoding_dict:
                assert len(key) == 1
        else:
            # optimal encoding for guardian
            # character distribution should be similar for all datasets
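            # the hard-coded table only covers the symbols expected in the
            # encoded part of an index line: digits, commas and the '\a'
            # field separator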
            symbol_to_encoding_dict = {
                '\a': BitArray('1111'),
                ',': BitArray('001'),
                '0': BitArray('1000'),
                '1': BitArray('011'),
                '2': BitArray('010'),
                '3': BitArray('000'),
                '4': BitArray('1110'),
                '5': BitArray('1101'),
                '6': BitArray('1100'),
                '7': BitArray('1011'),
                '8': BitArray('1010'),
                '9': BitArray('1001')
            }

        with open(f'{self.directory}/symbol_to_encoding_dict.pickle',
                  mode='wb') as f:
            pickle.dump(symbol_to_encoding_dict, f, pickle.HIGHEST_PROTOCOL)

        # save compressed index and corresponding seek_list
        with self.report.measure('saving compressed files'):
            self.compressed_seek_list = []
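            # each seek list entry maps a term to (byte offset, encoded
            # length); the length is needed because the compressed lines are
            # written back-to-back without delimiters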
            with open(f'{self.directory}/compressed_index', mode='wb') \
                    as compressed_index_file:
                offset = 0
                for i, orig_line in enumerate(
                        binary_read_line_generator_path(
                            f'{self.directory}/index.csv'), 1):
                    term = next(
                        csv.reader(io.StringIO(orig_line),
                                   delimiter=posting_list_separator))[0]
                    # skip the quoted term and the delimiter
                    # (two quotes + one separator character)
                    line_without_term = orig_line[len(term) + 3:]
                    encoded_line = Huffman.encode(line_without_term,
                                                  symbol_to_encoding_dict)
                    compressed_index_file.write(encoded_line)

                    self.compressed_seek_list.append(
                        (term, (offset, len(encoded_line))))

                    self.report.progress(i, ' index lines compressed', 100000)

                    offset += len(encoded_line)
            self.compressed_seek_list = \
                RecordDAWG('>QQ', self.compressed_seek_list)
            self.compressed_seek_list.save(
                f'{self.directory}/compressed_seek_list.dawg')
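
# Example usage of IndexCreator (hypothetical directory path; it must already
# contain comments.csv, authors.csv and articles.csv):
#
#     index_creator = IndexCreator('data/guardian')
#     index_creator.create_index()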
    all_forms = [norm] + forms
    for form in all_forms:
        pr, sf = split(form, stem)

        prefixes.append(get_index(pr, ALL_PREFIXES))
        suffixes.append(get_index(sf, ALL_SUFFIXES))
        tags.append(get_index(ps, ALL_TAGS))
    # scheme = array.array('H', prefixes + suffixes + tags)
    scheme = prefixes + suffixes + tags
    if scheme not in ALL_SCHEMES:
        ALL_SCHEMES.append(scheme)
    scheme_id = ALL_SCHEMES.index(scheme)

    for i, form in enumerate(all_forms):
        ALL_MAP.append((form, (scheme_id, i)))

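# '>II' packs two unsigned integers per surface form: the scheme id and the
# position of the form within its scheme, matching the (scheme_id, i) pairs
# collected in ALL_MAP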
record_dawg = RecordDAWG('>II', ALL_MAP)
record_dawg.save('words.dawg')

with open('ALL_PREFIXES.json', 'w', encoding='utf-8') as fp:
    json.dump(ALL_PREFIXES, fp, ensure_ascii=False)

with open('ALL_SUFFIXES.json', 'w', encoding='utf-8') as fp:
    json.dump(ALL_SUFFIXES, fp, ensure_ascii=False)

with open('ALL_TAGS.json', 'w', encoding='utf-8') as fp:
    json.dump(ALL_TAGS, fp, ensure_ascii=False)

with open('ALL_SCHEMES.json', 'w', encoding='utf-8') as fp:
    json.dump(ALL_SCHEMES, fp, ensure_ascii=False)