def load_index(self, directory):
    # open all artifacts written by create_index() and huffman_compression()
    self.seek_list = RecordDAWG('>QQ')
    self.seek_list.load(f'{directory}/compressed_seek_list.dawg')
    self.index_file = open(f'{directory}/compressed_index', mode='rb')
    with open(f'{directory}/symbol_to_encoding_dict.pickle',
              mode='rb') as f:
        self.symbol_to_encoding_dict = pickle.load(f)
    self.comment_offsets = numpy.load(
        f'{directory}/comment_offsets.npy', mmap_mode=None)
    self.comment_term_counts = numpy.load(
        f'{directory}/comment_term_counts.npy', mmap_mode=None)
    with open(f'{directory}/collection_term_count.pickle', mode='rb') as f:
        self.collection_term_count = pickle.load(f)
    self.comment_file = open(f'{directory}/comments.csv', mode='rb')
    self.comment_csv_reader = csv.reader(
        binary_read_line_generator(self.comment_file))
    with open(f'{directory}/authors_list.pickle', mode='rb') as f:
        self.authors_list = pickle.load(f)
    with open(f'{directory}/articles_list.pickle', mode='rb') as f:
        self.articles_list = pickle.load(f)
    with open(f'{directory}/reply_to_index.pickle', mode='rb') as f:
        self.reply_to_index = pickle.load(f)
    self.cids = numpy.load(f'{directory}/cids.npy', mmap_mode='r')
    self.comment_offsets_cid = numpy.load(
        f'{directory}/comment_offsets_cid.npy', mmap_mode='r')
Example #2
def load_dict(path):
    format = ">2I"
    try:
        d = RecordDAWG(format)
        d.load(path)
        return d
    except Exception as e:
        print("load dict error:", e)
        return None
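
A quick usage sketch for the dictionary returned by load_dict (the file name
'my_dict.dawg' and the lookup key are placeholders, not taken from the
original code): a RecordDAWG maps each unicode key to a list of value tuples
unpacked with the struct format, here two big-endian unsigned integers.

d = load_dict('my_dict.dawg')               # hypothetical path
if d is not None and 'example' in d:        # membership test over DAWG keys
    for first_value, second_value in d['example']:
        print(first_value, second_value)
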
Example #3
def __init__(self, dict_path):
    # the Python 2 reload(sys)/sys.setdefaultencoding('utf8') hack is neither
    # needed nor available in Python 3
    self.dict_path = dict_path
    self.format = ">2I"
    try:
        self.dict = RecordDAWG(self.format)
        self.dict.load(dict_path)
    except Exception as e:
        print("load dict error:", dict_path, e)
Example #4
def build_dict(path, path_build):
    format = ">2I"
    keys = []
    values = []
    with open(path, encoding='utf-8') as file_handler:
        for line in file_handler:
            arr = line.strip('\r\n').split('\t')
            if len(arr) != 3:
                continue
            try:
                keys.append(arr[0])
                values.append([int(arr[1]), int(arr[2])])
            except ValueError:
                continue
    data = zip(keys, values)
    record = RecordDAWG(format, data)
    with open(path_build, 'wb') as f:
        record.write(f)
Example #5
                nameIdMap[short_name] = []
            #print short_name, region_id, region_id_online
            nameIdMap[short_name].append([region_id, region_id_online])
    return nameIdMap


if __name__ == '__main__':
    #"654226", "和布克赛尔蒙古自治县", "和布克赛尔县", "0", "312216", "65", "amqp://*****:*****@120.27.247.47:5672/%2F"
    build_file = 'region_all.dawg'

    #generate dict
    format = ">2I"
    keys = []
    values = []

    nameIdMap = get_region_list()
    for k, t in nameIdMap.items():
        for v in t:
            if v[0] and v[1]:
                keys.append(k)
                values.append([int(v[0]), int(v[1])])

    print(len(values), len(keys))
    for x in range(0, len(values)):
        print(keys[x], values[x])
    data = zip(keys, values)
    record = RecordDAWG(format, data)
    with open(build_file, 'wb') as f:
        record.write(f)
    def huffman_compression(self, generate_encoding=False):
        # compress using Huffman encoding
        symbol_to_encoding_dict = {}

        # count all occurring UTF-8 characters
        if generate_encoding:
            symbol_to_frequency_dict = Counter()
            with self.report.measure('counting utf8 characters'):
                with open(f'{self.directory}/index.csv') as index_file:
                    chunk_size = 100000

                    def next_chunk_generator():
                        chunk = index_file.read(chunk_size)
                        while chunk:
                            yield chunk
                            chunk = index_file.read(chunk_size)

                    for i, chunk in enumerate(next_chunk_generator(), 1):
                        symbol_to_frequency_dict.update(Counter(chunk))
                        self.report.progress(
                            i, f' chunks counted ({chunk_size} characters '
                            'each)', 100)
                symbol_to_frequency_dict.pop('\n', None)

            # derive huffman encoding from character counts
            with self.report.measure('deriving huffman encoding'):
                symbol_to_encoding_dict = Huffman.derive_encoding(
                    symbol_to_frequency_dict)
            # sanity check: every encoded symbol must be a single character
            for key in symbol_to_encoding_dict:
                assert len(key) == 1
        else:
            # optimal encoding for the Guardian dataset; the character
            # distribution should be similar for other datasets
            symbol_to_encoding_dict = {
                '\a': BitArray('1111'),
                ',': BitArray('001'),
                '0': BitArray('1000'),
                '1': BitArray('011'),
                '2': BitArray('010'),
                '3': BitArray('000'),
                '4': BitArray('1110'),
                '5': BitArray('1101'),
                '6': BitArray('1100'),
                '7': BitArray('1011'),
                '8': BitArray('1010'),
                '9': BitArray('1001')
            }

        with open(f'{self.directory}/symbol_to_encoding_dict.pickle',
                  mode='wb') as f:
            pickle.dump(symbol_to_encoding_dict, f, pickle.HIGHEST_PROTOCOL)

        # save compressed index and corresponding seek_list
        with self.report.measure('saving compressed files'):
            self.compressed_seek_list = []
            with open(f'{self.directory}/compressed_index', mode='wb') \
                    as compressed_index_file:
                offset = 0
                for i, orig_line in enumerate(
                        binary_read_line_generator_path(
                            f'{self.directory}/index.csv'), 1):
                    term = next(
                        csv.reader(io.StringIO(orig_line),
                                   delimiter=posting_list_separator))[0]
                    line_without_term = orig_line[len(term) + 3:]
                    encoded_line = Huffman.encode(line_without_term,
                                                  symbol_to_encoding_dict)
                    compressed_index_file.write(encoded_line)

                    self.compressed_seek_list.append(
                        (term, (offset, len(encoded_line))))

                    self.report.progress(i, ' index lines compressed', 100000)

                    offset += len(encoded_line)
            self.compressed_seek_list = \
                RecordDAWG('>QQ', self.compressed_seek_list)
            self.compressed_seek_list.save(
                f'{self.directory}/compressed_seek_list.dawg')
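
    # A minimal read-side sketch, not part of the original class: it shows how
    # the compressed seek list written above can be used at query time, once
    # the attributes set up in the load_index example at the top
    # (self.seek_list, self.index_file, self.symbol_to_encoding_dict) are in
    # place. Each DAWG record stores the (offset, length) of a term's
    # Huffman-encoded posting list inside the 'compressed_index' file. The
    # method name and the Huffman.decode counterpart to Huffman.encode are
    # assumptions, not confirmed parts of the original project.
    def read_compressed_posting_list(self, term):
        if term not in self.seek_list:
            return None
        offset, length = self.seek_list[term][0]
        self.index_file.seek(offset)
        encoded_line = self.index_file.read(length)
        return Huffman.decode(encoded_line, self.symbol_to_encoding_dict)
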
    def create_index(self):
        # read csv to create comment_list

        with self.report.measure('processing comments.csv'):
            number_of_processes = min(os.cpu_count(), 2)
            print(f'starting {number_of_processes} processes')
            csv_size = os.stat(f'{self.directory}/comments.csv').st_size
            with multiprocessing.Pool(processes=number_of_processes) as pool:
                offsets = [0]
                with open(f'{self.directory}/comments.csv', mode='rb') as f:
                    for i in range(1, number_of_processes + 1):
                        f.seek(int(i * csv_size / number_of_processes))
                        f.readline()
                        next_offset = f.tell()
                        if next_offset != offsets[-1]:
                            offsets.append(next_offset)

                def on_error(exception):
                    raise exception

                for start_offset, end_offset in zip(offsets, offsets[1:]):
                    pool.apply_async(process_comments_file,
                                     args=(self.directory, start_offset,
                                           end_offset),
                                     error_callback=on_error)
                pool.close()
                pool.join()

            self.partial_index_names = []
            reply_to_index = {}
            cid_to_offset = {}
            for end_offset in offsets[1:]:
                file_number_path = \
                    f'{self.directory}/{end_offset}_file_number.pickle'
                with open(file_number_path, mode='rb') as f:
                    file_number = pickle.load(f)
                    for i in range(file_number):
                        self.partial_index_names.append(
                            f'{self.directory}/{end_offset}_{i}')
                os.remove(file_number_path)

                reply_to_index_part_path = f'{self.directory}/' \
                    f'{end_offset}_reply_to_index.pickle'
                with open(reply_to_index_part_path, mode='rb') as f:
                    reply_to_index_part = pickle.load(f)
                    for key, value in reply_to_index_part.items():
                        if key not in reply_to_index.keys():
                            reply_to_index[key] = value
                        else:
                            reply_to_index[key].extend(value)
                os.remove(reply_to_index_part_path)

                cid_to_offset_part_path = f'{self.directory}/' \
                    f'{end_offset}_cid_to_offset.pickle'
                with open(cid_to_offset_part_path, mode='rb') as f:
                    cid_to_offset_part = pickle.load(f)
                    cid_to_offset.update(cid_to_offset_part)
                os.remove(cid_to_offset_part_path)

            with open(f'{self.directory}/reply_to_index.pickle',
                      mode='wb') as f:
                pickle.dump(reply_to_index, f, pickle.HIGHEST_PROTOCOL)

            cid_keys = []
            cid_offset_values = []
            for key in sorted(cid_to_offset.keys()):
                cid_keys.append(numpy.int64(key))
                cid_offset_values.append(numpy.int64(cid_to_offset[key]))
            numpy.save(f'{self.directory}/cids.npy', numpy.array(cid_keys))
            numpy.save(f'{self.directory}/comment_offsets_cid.npy',
                       numpy.array(cid_offset_values))

        # merge indices
        with self.report.measure('merging index'):
            # comment term counts
            self.comment_term_count_dict = {}
            for file_prefix in self.partial_index_names:
                file_path = file_prefix + '_comment_term_count_dict.pickle'
                with open(file_path, mode='rb') as f:
                    self.comment_term_count_dict.update(pickle.load(f))
                os.remove(file_path)
            comment_offsets = []
            comment_term_counts = []
            for key in sorted(self.comment_term_count_dict.keys()):
                comment_offsets.append(numpy.int64(key))
                comment_term_counts.append(
                    numpy.int32(self.comment_term_count_dict[key]))
            numpy.save(f'{self.directory}/comment_offsets.npy',
                       numpy.array(comment_offsets))
            numpy.save(f'{self.directory}/comment_term_counts.npy',
                       numpy.array(comment_term_counts))

            # collection term count
            self.collection_term_count = 0
            for file_prefix in self.partial_index_names:
                file_path = file_prefix + '_collection_term_count.pickle'
                with open(file_path, mode='rb') as f:
                    self.collection_term_count += pickle.load(f)
                os.remove(file_path)

            with open(f'{self.directory}/collection_term_count.pickle',
                      mode='wb') as f:
                pickle.dump(self.collection_term_count, f,
                            pickle.HIGHEST_PROTOCOL)

            # index
            index_files = []
            for file_prefix in self.partial_index_names:
                file_path = file_prefix + '_index.csv'
                index_files.append(open(file_path, mode='rb'))

            current_terms = []
            current_meta = []
            current_posting_lists = []
            global_active_indices = []
            global_active_file_count = 0

            for file in index_files:
                line = file.readline().decode('utf-8').rstrip('\n').split(
                    posting_list_separator, 2)
                current_terms.append(line[0])
                current_meta.append(int(line[1]))
                current_posting_lists.append(line[2])
                global_active_indices.append(True)
                global_active_file_count += 1

            current_active_indices = []
            current_min_term = None
            self.seek_list = []
            current_offset = 0
            terms_done = 0

            with open(f'{self.directory}/index.csv', mode='wb') as f:
                while global_active_file_count > 0:
                    # find next term to write
                    for key, term in enumerate(current_terms):
                        if not global_active_indices[key]:
                            continue
                        if current_min_term is None or term < current_min_term:
                            current_active_indices = [key]
                            current_min_term = term
                        elif term == current_min_term:
                            current_active_indices.append(key)

                    # merge all lines containing term

                    if len(current_min_term) <= 128:
                        meta = 0
                        for key in current_active_indices:
                            meta += current_meta[key]

                        line_string = \
                            f'{current_min_term}{posting_list_separator}{meta}'
                        for key in current_active_indices:
                            line_string += f'{posting_list_separator}' \
                                f'{current_posting_lists[key]}'

                        line_string += '\n'
                        line_raw = line_string.encode()
                        f.write(line_raw)
                        term = current_min_term[1:-1].replace('""', '"')
                        self.seek_list.append((term, [current_offset]))
                        current_offset += len(line_raw)

                    # reload lines where necessary
                    for key in current_active_indices:
                        linetest = index_files[key].readline().decode('utf-8')
                        if linetest == '':
                            # end of file
                            global_active_indices[key] = False
                            global_active_file_count -= 1
                            print('one file out, '
                                  f'{global_active_file_count} remaining')
                        else:
                            line = linetest.rstrip('\n').split(
                                posting_list_separator, 2)
                            current_terms[key] = line[0]
                            current_meta[key] = int(line[1])
                            current_posting_lists[key] = line[2]

                    current_min_term = None
                    current_active_indices = []
                    terms_done += 1
                    if terms_done % 100000 == 0:
                        print(f'Merged {terms_done} terms.')

            self.seek_list = RecordDAWG('>Q', self.seek_list)
            self.seek_list.save(f'{self.directory}/seek_list.dawg')

            for f in index_files:
                f.close()

            for file_prefix in self.partial_index_names:
                file_path = file_prefix + '_index.csv'
                os.remove(file_path)

        self.huffman_compression(generate_encoding=False)

        with self.report.measure('processing authors & articles'):
            with open(f'{self.directory}/authors_list.pickle', mode='wb') as f:
                pickle.dump(
                    create_list_from_csv(f'{self.directory}/authors.csv'), f,
                    pickle.HIGHEST_PROTOCOL)

            with open(f'{self.directory}/articles_list.pickle', mode='wb') \
                    as f:
                pickle.dump(
                    create_list_from_csv(f'{self.directory}/articles.csv'), f,
                    pickle.HIGHEST_PROTOCOL)
    all_forms = [norm] + forms
    for form in all_forms:
        pr, sf = split(form, stem)

        prefixes.append(get_index(pr, ALL_PREFIXES))
        suffixes.append(get_index(sf, ALL_SUFFIXES))
        tags.append(get_index(ps, ALL_TAGS))
    # scheme = array.array('H', prefixes + suffixes + tags)
    scheme = prefixes + suffixes + tags
    if scheme not in ALL_SCHEMES:
        ALL_SCHEMES.append(scheme)
    scheme_id = ALL_SCHEMES.index(scheme)

    for i, form in enumerate(all_forms):
        ALL_MAP.append((form, (scheme_id, i)))

record_dawg = RecordDAWG(u">II", ALL_MAP)
record_dawg.save('words.dawg')

with open('ALL_PREFIXES.json', 'w', encoding='utf-8') as fp:
    json.dump(ALL_PREFIXES, fp, ensure_ascii=False)

with open('ALL_SUFFIXES.json', 'w', encoding='utf-8') as fp:
    json.dump(ALL_SUFFIXES, fp, ensure_ascii=False)

with open('ALL_TAGS.json', 'w', encoding='utf-8') as fp:
    json.dump(ALL_TAGS, fp, ensure_ascii=False)

with open('ALL_SCHEMES.json', 'w', encoding='utf-8') as fp:
    json.dump(ALL_SCHEMES, fp, ensure_ascii=False)
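
For completeness, a minimal read-side sketch of the dictionary built above
(the lookup word is a placeholder, and the import assumes RecordDAWG comes
from the DAWG package's dawg module; the scheme slicing follows from the
build loop, where a scheme is the concatenation prefixes + suffixes + tags
with one entry of each kind per form):

import json
from dawg import RecordDAWG

lookup = RecordDAWG(u">II")
lookup.load('words.dawg')

with open('ALL_SCHEMES.json', encoding='utf-8') as fp:
    all_schemes = json.load(fp)
with open('ALL_TAGS.json', encoding='utf-8') as fp:
    all_tags = json.load(fp)

word = 'word'                           # placeholder lookup key
if word in lookup:
    for scheme_id, form_index in lookup[word]:
        scheme = all_schemes[scheme_id]
        n = len(scheme) // 3            # equal-length thirds of the scheme
        tag_index = scheme[2 * n + form_index]
        print(word, all_tags[tag_index])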