# Sharded indexer: words are assigned to a shard by len(word) % max_chunk_num,
# so the collection is streamed once per shard to bound memory usage.
import pickle
from array import array

import docreader
import doc2words
from coder import Coder  # assumption: the actual module name is not shown in this snippet


def main(variant):
    # Remember which encoder variant was used, for the search script.
    with open('variant', 'w') as f:
        f.write(variant)
    encoder = Coder(variant)
    paths = []
    max_chunk_num = 2
    chunk_num = 0
    while chunk_num < max_chunk_num:
        tokens = {}
        i = 1  # 1-based document id
        documents = docreader.DocumentStreamReader(
            docreader.parse_command_line().files)
        for doc in documents:
            if chunk_num == 0:  # collect URLs only on the first pass
                paths.append(doc.url)
            words = doc2words.extract_words(doc.text)
            for word in set(words):
                if word in tokens:
                    tokens[word].append(i)
                elif len(word) % max_chunk_num == chunk_num:
                    tokens[word] = array('l', [i])
            i += 1
        # Compress every posting list before dumping the shard.
        for token in tokens:
            tokens[token] = encoder.encode(tokens[token])
        with open('index{}.pkl'.format(chunk_num), 'wb') as f:
            pickle.dump(tokens, f)
        chunk_num += 1
    with open('paths.pkl', 'wb') as f:
        pickle.dump(paths, f)
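# The Coder class is not shown anywhere in this snippet. Below is a minimal
# sketch of what such a coder module might contain, assuming a varbyte
# variant behind the call sites above (Coder(variant), encoder.encode(list));
# the gap-based scheme and the decode() helper are assumptions, not the
# original implementation.
class Coder(object):
    """Varbyte posting-list coder (sketch)."""

    def __init__(self, variant):
        self.variant = variant  # e.g. 'varbyte'; only varbyte is sketched

    def encode(self, doc_ids):
        # Store gaps between sorted doc ids; small gaps need few bytes.
        out = bytearray()
        prev = 0
        for doc_id in doc_ids:
            gap = doc_id - prev
            prev = doc_id
            chunk = [gap & 0x7F]
            gap >>= 7
            while gap:
                chunk.append(gap & 0x7F)
                gap >>= 7
            chunk[0] |= 0x80  # high bit marks the final (low-order) byte
            out.extend(reversed(chunk))  # most-significant byte first
        return bytes(out)

    def decode(self, data):
        doc_ids, n, prev = [], 0, 0
        for b in bytearray(data):
            if b & 0x80:  # last byte of this gap
                prev += (n << 7) | (b & 0x7F)
                doc_ids.append(prev)
                n = 0
            else:
                n = (n << 7) | b
        return doc_ids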
# Near-duplicate detection: n min-hash shingles per document, then pairwise
# counting of shared shingles as an estimate of Jaccard similarity.
import sys
from collections import defaultdict

import numpy as np

import docreader
# Assumption: MinshinglesCounter and TextNormalizer come from helper modules
# that are not shown in this snippet.


def main():
    minshingles_count = 20
    files = sys.argv[1:]
    docs = docreader.DocumentStreamReader(files)
    minshingles_counter = MinshinglesCounter(window=5, n=minshingles_count)

    minshingle2urls = defaultdict(list)
    id2url = []
    url_index = 0
    for doc in docs:
        minshingles = minshingles_counter.count(
            TextNormalizer.normalize(doc.text))
        if None not in minshingles:  # skip documents too short to shingle
            id2url.append(doc.url)
            for minshingle_id, minshingle in enumerate(minshingles):
                minshingle2urls[(minshingle_id, minshingle)].append(url_index)
            url_index += 1

    # urls_matrix[i, j] counts min-shingles shared by documents i < j.
    urls_matrix = np.zeros((len(id2url), len(id2url)))
    for minshingle, url_ids in minshingle2urls.iteritems():
        for pos, id_i in enumerate(url_ids, 1):
            for id_j in url_ids[pos:]:
                urls_matrix[id_i, id_j] += 1

    # Report pairs whose estimated Jaccard similarity exceeds 0.75.
    for id_i in range(len(id2url)):
        for id_j in range(id_i + 1, len(id2url)):
            measure = float(urls_matrix[id_i, id_j]) / minshingles_count
            if measure > 0.75:
                print id2url[id_i], id2url[id_j], measure
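# Neither MinshinglesCounter nor TextNormalizer is shown. A minimal sketch of
# both, assuming normalize() returns a word list and count() yields one
# minimum per hash seed, with None entries for documents shorter than the
# window (which would explain the `None not in minshingles` guard above).
import re

import mmh3


class TextNormalizer(object):
    @staticmethod
    def normalize(text):
        # Assumed behaviour: lowercase and split into word tokens.
        return re.findall(r'\w+', text.lower(), re.UNICODE)


class MinshinglesCounter(object):
    def __init__(self, window=5, n=20):
        self.window = window  # words per shingle
        self.n = n            # number of independent hash functions

    def count(self, words):
        if len(words) < self.window:
            return [None] * self.n
        mins = [None] * self.n
        for start in range(len(words) - self.window + 1):
            shingle = u' '.join(words[start:start + self.window])
            for seed in range(self.n):
                h = mmh3.hash(shingle.encode('utf-8'), seed)
                if mins[seed] is None or h < mins[seed]:
                    mins[seed] = h
        return mins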
import sys
import codecs
import pickle
from collections import defaultdict

import docreader
from doc2words import extract_words


def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


# Re-wrap the standard streams so unicode output is encoded as utf-8.
sys.stdout = codecs.getwriter('utf8')(sys.stdout)
sys.stderr = codecs.getwriter('utf8')(sys.stderr)

args = docreader.parse_command_line()
reader = docreader.DocumentStreamReader(args.files)
encoder_type = args.encoder

# Record the encoder name so the search script knows how to read the index.
with open("encoder.txt", "w") as fd:
    fd.write(encoder_type)

URLs = {}
InvIndex = defaultdict(list)
for idx, doc in enumerate(reader):
    URLs[idx] = doc.url
    terms = sorted(set(extract_words(doc.text)))
    for term in terms:
        InvIndex[term].append(idx)

save_obj(InvIndex, "index")
save_obj(URLs, "urls")
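# The index above stores plain sorted doc-id lists, so the query side only
# needs pickle loading plus a linear-merge intersection. A minimal sketch;
# the file names follow the save_obj calls above, and u'term1'/u'term2' are
# placeholder query terms.
import pickle


def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)


def intersect(a, b):
    # Linear merge of two sorted posting lists.
    result, i, j = [], 0, 0
    while i < len(a) and j < len(b):
        if a[i] == b[j]:
            result.append(a[i])
            i += 1
            j += 1
        elif a[i] < b[j]:
            i += 1
        else:
            j += 1
    return result


index = load_obj('index')
urls = load_obj('urls')
for doc_id in intersect(index.get(u'term1', []), index.get(u'term2', [])):
    print urls[doc_id]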
# -*- coding: utf-8 -*-
import pickle
import json

import mmh3

import docreader
import doc2words
import varbyte
import simple9

# First CLI argument selects the posting-list encoder, the rest are archives.
arg = docreader.parse_command_line().files
reader = docreader.DocumentStreamReader(arg[1:])
encoder_str = arg[0]
if encoder_str == 'varbyte':
    encoder = varbyte
elif encoder_str == 'simple9':
    encoder = simple9

term_dictionary = {}
url_list = []
doc_id = 0
for doc in reader:
    doc_id += 1
    url_list.append(doc.url)
import pickle
import sys
import time

import docreader
import simple9
import varbyte

if __name__ == '__main__':
    encoding = sys.argv[1]
    files = sys.argv[2:]
    data = {'encoding': encoding, 'index': {}, 'urls': []}
    reader = docreader.DocumentStreamReader(files)
    buf = dict()  # raw posting lists, buffered for simple9
    start_time = time.time()
    for doc in reader:
        words = set(docreader.extract_words(doc.text))
        data['urls'].append(doc.url.encode('utf-8'))
        words = [w.encode('utf-8') for w in words]
        url_pos = len(data['urls']) - 1
        for w in words:
            if encoding == 'varbyte':
                # Varbyte can append one encoded id at a time.
                if w in data['index']:
                    data['index'][w] += varbyte.vb_encode(url_pos)
                else:
                    data['index'][w] = varbyte.vb_encode(url_pos)
            elif encoding == 'simple9':
                # Assumption: simple9 packs ids in fixed 32-bit blocks, so
                # raw ids are buffered here and encoded after the loop.
                if w in buf:
                    buf[w].append(url_pos)
                else:
                    buf[w] = [url_pos]
    if encoding == 'simple9':
        for w in buf:
            data['index'][w] = simple9.encode(buf[w])  # assumed module API
    # Assumption: the finished structure is pickled to disk.
    with open('index.pkl', 'wb') as f:
        pickle.dump(data, f)
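# The simple9 module itself is not shown. A sketch of the standard Simple-9
# scheme under the encode(ids) interface assumed above: each 32-bit word
# spends 4 bits on a selector and 28 bits on payload, packing anywhere from
# twenty-eight 1-bit values down to a single 28-bit value.
import struct

# (count, bits-per-value) pairs for the nine Simple-9 selectors.
SELECTORS = [(28, 1), (14, 2), (9, 3), (7, 4), (5, 5),
             (4, 7), (3, 9), (2, 14), (1, 28)]


def encode(numbers):
    """Pack non-negative ints (each < 2**28) into little-endian 32-bit words."""
    words = []
    pos = 0
    while pos < len(numbers):
        # Greedily pick the densest selector whose whole chunk fits.
        for selector, (count, bits) in enumerate(SELECTORS):
            chunk = numbers[pos:pos + count]
            if len(chunk) == count and all(n < (1 << bits) for n in chunk):
                word = selector << 28
                for i, n in enumerate(chunk):
                    word |= n << (i * bits)
                words.append(word)
                pos += count
                break
        else:
            raise ValueError('value too large for simple9')
    return struct.pack('<%dI' % len(words), *words)


def decode(data):
    numbers = []
    for off in range(0, len(data), 4):
        word = struct.unpack('<I', data[off:off + 4])[0]
        count, bits = SELECTORS[word >> 28]
        mask = (1 << bits) - 1
        for i in range(count):
            numbers.append((word >> (i * bits)) & mask)
    return numbers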
import pickle
from collections import defaultdict

import docreader
import doc2words

args = docreader.parse_command_line().files
compressor_type = args[0]
if compressor_type == 'varbyte':
    import varbyte
    compressor = varbyte
elif compressor_type == 'simple9':
    import simple9
    compressor = simple9

docs = docreader.DocumentStreamReader(args[1:])

doc_id = 1
term2doc = defaultdict(list)
doc_id2url = []
for doc in docs:
    doc_id2url.append(doc.url)
    words = doc2words.extract_words(doc.text)
    unique_words = set(words)
    for word in unique_words:
        # Keyed by term hash rather than the term itself to save memory.
        key = abs(hash(word.encode('utf-8')))
        term2doc[key].append(doc_id)
    doc_id += 1

# Compress every posting list with the chosen encoder.
for key in term2doc:
    term2doc[key] = compressor.code(term2doc[key])
# Reconstructed preamble (assumption): these imports and the parser's
# "method" argument are inferred from how they are used below.
import argparse
import collections
import pickle

import docreader
import doc2words
import varbyte
import simple9

parser = argparse.ArgumentParser(description="Build a compressed inverted index")
parser.add_argument("method", help="Posting-list encoder: varbyte or simple9")
parser.add_argument("files", nargs="+",
                    help="Input files (.gz or plain) to process")
args = parser.parse_args()

if args.method == "varbyte":
    encoder = varbyte
elif args.method == "simple9":
    encoder = simple9
else:
    raise AssertionError(
        "Method {name} is not supported".format(name=args.method))

inverted_index = collections.defaultdict(list)
urls = []
reader = docreader.DocumentStreamReader(args.files)
for doc_id, doc in enumerate(reader):
    urls.append(doc.url)
    words = doc2words.extract_words(doc.text)
    for word in set(words):
        word = word.encode("utf-8")
        # Store the hash of each term to keep dictionary keys small.
        inverted_index[hash(word)].append(doc_id)

for key in inverted_index:
    inverted_index[key] = encoder.encode(inverted_index[key])

# Dump the method name first so the search script knows how to decode;
# pickle output is binary, hence "wb".
with open("index", "wb") as file_index:
    pickle.dump(args.method, file_index)
    pickle.dump(inverted_index, file_index)
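# A matching read-side sketch for the "index" file written above. The
# decode() call assumes each encoder module exposes an inverse of encode(),
# which this snippet does not show; load_index and postings are hypothetical
# helper names.
import pickle

import varbyte
import simple9


def load_index(path="index"):
    # Read back the two pickles in the order they were dumped.
    with open(path, "rb") as f:
        method = pickle.load(f)
        inverted_index = pickle.load(f)
    return inverted_index, varbyte if method == "varbyte" else simple9


def postings(inverted_index, decoder, word):
    # Terms were keyed by hash of the utf-8 encoded word.
    key = hash(word.encode("utf-8"))
    if key not in inverted_index:
        return []
    return decoder.decode(inverted_index[key])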
import sys

import mmh3

import docreader
import doc2words
import varbyte

args = docreader.parse_command_line().files
encoder_arg = args[0]
archive_args = args[1:]
if encoder_arg == 'varbyte':
    encoder = varbyte
else:
    print "Unsupported encoder"
    sys.exit()

dictionary = {}
urls = []

""" Reading dataset file """
counter = 0
for entry in docreader.DocumentStreamReader(archive_args):
    urls.append(entry.url)
    counter += 1
    for word in set(doc2words.extract_words(entry.text)):
        # mmh3 term hashes keep dictionary keys compact; named word_hash to
        # avoid shadowing the built-in hash().
        word_hash = abs(mmh3.hash(word.encode("utf-8").lower()))
        if word_hash not in dictionary:
            dictionary[word_hash] = []
        dictionary[word_hash].append(counter)

""" Compressing dictionary """
# Encode each posting list as a whole so gap-based encoders can compress it.
dictionary = {
    entry: encoder.encode(dictionary[entry])
    for entry in dictionary
}

""" Storing index in memory """