def get_data():
    """Build an inverted index over the documents named on the command line.

    Returns:
        (terms, urls): ``terms`` maps each word to the list of ids of the
        documents containing it (ids in stream order, so already sorted);
        ``urls`` maps document id (list position) to the document URL.
    """
    reader = DocumentStreamReader(docreader.parse_command_line().files)
    terms = defaultdict(list)
    urls = []
    # enumerate() replaces the hand-rolled `ind` counter; the doc id is
    # simply the document's position in the stream.
    for doc_id, doc in enumerate(reader):
        urls.append(doc.url)
        # set() dedupes words, so each doc id lands at most once per postings list
        for word in set(doc2words.extract_words(doc.text)):
            terms[word].append(doc_id)
    return terms, urls
def make_dictionary_urlid():
    """Scan the command-line document stream and build the index structures.

    Returns:
        (term_doc, id_url): ``term_doc`` maps each word to the ascending list
        of ids of documents containing it; ``id_url`` maps str(doc id) to URL.
    """
    id_url = {}
    term_doc = {}
    reader = DocumentStreamReader(parse_command_line().files)
    # enumerate() replaces the hand-rolled `i` counter
    for doc_id, doc in enumerate(reader):
        id_url[str(doc_id)] = doc.url
        for word in extract_words(doc.text):
            # setdefault() collapses the original check/create/append into
            # a single lookup; [-1] replaces lst[len(lst) - 1]
            postings = term_doc.setdefault(word, [])
            # a word may repeat inside one document: append the id only once
            if not postings or postings[-1] != doc_id:
                postings.append(doc_id)
    return term_doc, id_url
def main(variant):
    """Build a sharded inverted index (2 shards, split by word-length parity).

    The document stream is re-read once per shard; a word belongs to shard
    ``len(word) % 2``.  Each shard's postings are encoded with
    ``Coder(variant)`` and pickled to ``index<shard>.pkl``; the document URLs
    (collected on the first pass only) are pickled to ``paths.pkl``.

    Args:
        variant: encoder variant name; also persisted to the file 'variant'
            so the query side can mirror the choice.
    """
    # remember which encoder variant was used
    with open('variant', 'w') as f:
        f.write(variant)
    encoder = Coder(variant)
    paths = []
    max_chunk_num = 2
    # was: `while True` with a manual break and counter; a range() loop also
    # drops the dead `first = False` assignment the original carried
    for chunk_num in range(max_chunk_num):
        tokens = {}
        documents = docreader.DocumentStreamReader(
            docreader.parse_command_line().files)
        # doc ids are 1-based, in stream order
        for i, doc in enumerate(documents, start=1):
            if chunk_num == 0:
                # collect URLs only on the first pass over the stream
                paths.append(doc.url)
            for word in set(doc2words.extract_words(doc.text)):
                if word in tokens:
                    tokens[word].append(i)
                elif len(word) % max_chunk_num == chunk_num:
                    # word belongs to this shard: start its postings list;
                    # array('l') stores unboxed ints, far smaller than a list
                    tokens[word] = array('l', [i])
        for token in tokens:
            tokens[token] = encoder.encode(tokens[token])
        with open('index{}.pkl'.format(chunk_num), 'wb') as f:
            pickle.dump(tokens, f)
    with open('paths.pkl', 'wb') as f:
        pickle.dump(paths, f)
import sys
import codecs
import docreader
import pickle
from doc2words import extract_words
from collections import defaultdict


def save_obj(obj, name):
    """Pickle *obj* to the file '<name>.pkl' using the highest protocol."""
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


# force UTF-8 output on the byte-oriented std streams (Python 2)
sys.stdout = codecs.getwriter('utf8')(sys.stdout)
sys.stderr = codecs.getwriter('utf8')(sys.stderr)

# parse argv once instead of twice as the original did
args = docreader.parse_command_line()
reader = docreader.DocumentStreamReader(args.files)

# record the chosen encoder so the query side can mirror it;
# `with` closes the file even on error (original used open/close)
with open("encoder.txt", "w") as fd:
    fd.write(args.encoder)

URLs = {}
InvIndex = defaultdict(list)
for idx, doc in enumerate(reader):
    URLs[idx] = doc.url
    # set() dedupes terms within a doc; sorted() keeps insertion deterministic
    # (the redundant list() wrapper around sorted() is dropped)
    for term in sorted(set(extract_words(doc.text))):
        InvIndex[term].append(idx)

save_obj(InvIndex, "index")
save_obj(URLs, "urls")
# -*- coding: utf-8 -*-
import docreader
import doc2words
import varbyte
import simple9
import pickle
import mmh3
import json

# argv layout: files[0] = encoder name ('varbyte' | 'simple9'),
# files[1:] = input document archives
arg = docreader.parse_command_line().files
reader = docreader.DocumentStreamReader(arg[1:])
encoder_str = arg[0]
# pick the postings-compression module by name
if encoder_str == 'varbyte':
    encoder = varbyte
elif encoder_str == 'simple9':
    encoder = simple9

term_dictionary = {}
url_list = []
doc_id = 0
# each stream item is a document record exposing .url and .text
for url in reader:
    doc_id += 1
    url_list.append(url.url)
import docreader
from docreader import DocumentStreamReader
import index_creation
import bitstream
import cPickle
import mmhash
import dict_hash

if __name__ == '__main__':
    reader = DocumentStreamReader(docreader.parse_command_line().files)
    index = index_creation.Url_Index()
    for doc in reader:
        index.scan_text(doc)

    # Concatenate every term's varbyte-compressed postings into one byte
    # blob; `term` maps hash(term) -> [offset, length] into that blob.
    blob = []
    term = dict()
    for k, v in index.terms.iteritems():
        prev_len = len(blob)
        compr = bitstream.compress_varbyte(v)
        blob.extend(compr)
        term[mmhash.get_unsigned_hash(
            k.encode('utf8'))] = [prev_len, len(compr)]

    # context managers fix the original's leak: both output files were
    # opened but never closed, so buffered data could be lost on exit
    with open("index.txt", "wb") as index_file:
        index_file.write(bytearray(blob))
    with open("url_file.txt", "wb") as url_file:
        cPickle.dump(index.url, url_file)
    dict_hash.store_dict(term)
    # NOTE(review): fragment — the enclosing def (it references a parameter
    # `url`; presumably an add_doc(url) helper) starts above this chunk;
    # these two statements are its visible tail.
    url_index.add_int(urls.add_string(url))
    pass


def get_wordid(term):
    """Return the integer id for *term*, allocating a fresh id on first use.

    Relies on the module-level `words` dictionary object and the
    `word_count` counter defined elsewhere in this file.
    """
    res = words.get_from_dict(term)
    if res is None:
        # unseen term: bump the global counter and register the new id
        global word_count
        word_count += 1
        words.add(term, word_count)
        return word_count
    return res


if __name__ == '__main__':
    cmd = parse_command_line()
    reader = DocumentStreamReader(cmd.files)
    # choose the postings encoder from the command line (Simple9 otherwise)
    if cmd.code[0] == "varbyte":
        index = VarByte("docindex")
    else:
        index = Simple9("docindex")
    doc_count = -1
    for doc in reader:
        doc_count += 1
        add_doc(doc.url)
        terms = set(extract_words(doc.text))
        # NOTE(review): chunk is truncated here — the body of this loop
        # continues beyond the visible source.
        for term in terms:
def load(): fin = open('back_index.bin', 'r') data = pickle.load(fin) fin.close() fin = open('ids.bin', 'r') ids = pickle.load(fin) fin.close() return data, ids if __name__ == '__main__': parsed_line = parse_command_line().files try: os.remove('varbyte.bin') except: pass try: os.remove('simple9.bin') except: pass if parsed_line[0] == 'varbyte': with open('varbyte.bin', 'wb') as f: f.write('a') else: