Example #1
import pickle
from array import array

import docreader
import doc2words
# Coder is assumed to wrap the varbyte/simple9 encoders selected by `variant`;
# its import is not shown in the original snippet.


def main(variant):
    with open('variant', 'w') as f:
        f.write(variant)

    encoder = Coder(variant)
    paths = []
    chunk_num = 0
    max_chunk_num = 2

    # Build the index in max_chunk_num shards; each pass over the collection keeps
    # only the words whose length maps to the current shard
    # (len(word) % max_chunk_num == chunk_num).
    while chunk_num < max_chunk_num:
        tokens = {}
        i = 1

        documents = docreader.DocumentStreamReader(
            docreader.parse_command_line().files)
        for doc in documents:
            if chunk_num == 0:
                paths.append(doc.url)

            words = doc2words.extract_words(doc.text)

            for word in set(words):
                if word in tokens:
                    tokens[word].append(i)
                elif len(word) % max_chunk_num == chunk_num:
                    tokens[word] = array('l', [i])

            i += 1

        for token in tokens:
            tokens[token] = encoder.encode(tokens[token])

        with open('index{}.pkl'.format(chunk_num), 'wb') as f:
            pickle.dump(tokens, f)

        chunk_num += 1

    with open('paths.pkl', 'wb') as f:
        pickle.dump(paths, f)
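
The Coder class above is only referenced, not shown. A minimal sketch of what a varbyte-style posting-list coder could look like, assuming it gap-encodes sorted doc IDs; the class and method names here are illustrative, not the project's actual API.

# Illustrative varbyte coder -- an assumption about what Coder('varbyte') might wrap.
class VarByteCoder(object):
    def encode(self, doc_ids):
        """Gap-encode a sorted sequence of doc IDs as variable-length bytes."""
        out = bytearray()
        prev = 0
        for doc_id in doc_ids:
            gap = doc_id - prev
            prev = doc_id
            chunks = []
            while True:
                chunks.append(gap & 0x7F)   # low 7 bits of the remaining value
                gap >>= 7
                if gap == 0:
                    break
            for byte in reversed(chunks[1:]):   # high-order 7-bit groups first
                out.append(byte)
            out.append(chunks[0] | 0x80)        # high bit marks the last byte of a number
        return bytes(out)

    def decode(self, data):
        """Inverse of encode: bytes back to the original doc ID list."""
        doc_ids, value, prev = [], 0, 0
        for byte in bytearray(data):
            if byte & 0x80:
                prev += (value << 7) | (byte & 0x7F)
                doc_ids.append(prev)
                value = 0
            else:
                value = (value << 7) | byte
        return doc_ids

A quick round-trip check: VarByteCoder().decode(VarByteCoder().encode([3, 7, 20])) returns [3, 7, 20].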
Example #2
import sys
from collections import defaultdict

import numpy as np

import docreader
# MinshinglesCounter and TextNormalizer are assumed to come from the project's helper
# modules; their import lines are not shown in the original snippet.


def main():
    minshingles_count = 20

    files = sys.argv[1:]
    docs = docreader.DocumentStreamReader(files)
    minshingles_counter = MinshinglesCounter(window=5, n=minshingles_count)

    minshingle2urls = defaultdict(list)
    id2url = []

    url_index = 0
    for doc in docs:
        minshingles = minshingles_counter.count(
            TextNormalizer.normalize(doc.text))
        if None not in minshingles:
            id2url.append(doc.url)
            for minshingle_id, minshingle in enumerate(minshingles):
                minshingle2urls[(minshingle_id, minshingle)].append(url_index)
            url_index += 1

    urls_matrix = np.zeros((len(id2url), len(id2url)))

    # For every pair of documents sharing a min-shingle slot value, bump the pair's counter.
    for minshingle, url_ids in minshingle2urls.iteritems():
        count = 0
        for id_i in url_ids:
            count += 1
            for id_j in url_ids[count:]:
                urls_matrix[id_i, id_j] += 1

    # Report pairs whose estimated similarity (shared minshingles / n) exceeds 0.75.
    for id_i in range(len(id2url)):
        for id_j in range(id_i + 1, len(id2url)):
            measure = float(urls_matrix[id_i, id_j]) / minshingles_count
            if measure > 0.75:
                print id2url[id_i], id2url[id_j], measure
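
MinshinglesCounter and TextNormalizer are not included in the snippet. A rough sketch of how the counter could work, assuming each of the n signature slots stores the minimum mmh3 hash of the word shingles under its own seed; the class name matches the usage above, but the implementation is an assumption.

import mmh3


class MinshinglesCounter(object):
    """Illustrative min-hash signature over word shingles (an assumed implementation)."""

    def __init__(self, window=5, n=20):
        self.window = window
        self.n = n

    def count(self, text):
        words = text.split()
        if len(words) < self.window:
            # Too short to form a single shingle; the caller above skips such
            # documents via its `None not in minshingles` check.
            return [None] * self.n
        shingles = [' '.join(words[i:i + self.window])
                    for i in range(len(words) - self.window + 1)]
        signature = []
        for seed in range(self.n):
            signature.append(min(mmh3.hash(s.encode('utf-8'), seed) for s in shingles))
        return signature

Two documents are then near-duplicates when many of the n slots coincide, which is exactly what the urls_matrix accumulation above counts.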
Example #3
import sys
import codecs
import docreader
import pickle
from doc2words import extract_words
from collections import defaultdict


def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


sys.stdout = codecs.getwriter('utf8')(sys.stdout)
sys.stderr = codecs.getwriter('utf8')(sys.stderr)
args = docreader.parse_command_line()
reader = docreader.DocumentStreamReader(args.files)
encoder_type = args.encoder
with open("encoder.txt", "w") as fd:
    fd.write(encoder_type)

URLs = {}
InvIndex = defaultdict(list)
for idx, doc in enumerate(reader):
    URLs[idx] = doc.url
    Terms = sorted(set(extract_words(doc.text)))
    for term in Terms:
        InvIndex[term].append(idx)

save_obj(InvIndex, "index")
save_obj(URLs, "urls")
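
The snippet only writes the index; reading it back and answering a one-word query is symmetric. The query term below is just a placeholder.

import pickle


def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)


InvIndex = load_obj("index")   # term -> sorted list of doc ids
URLs = load_obj("urls")        # doc id -> url

term = u"example"              # placeholder query term
for doc_id in InvIndex.get(term, []):
    print URLs[doc_id]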
Example #4
# -*- coding: utf-8 -*-

import docreader
import doc2words
import varbyte
import simple9
import pickle
import mmh3
import json

#urls = docreader.DocumentStreamReader(["../dataset/lenta.ru_4deb864d-3c46-45e6-85f4-a7ff7544a3fb_01.gz"])
arg = docreader.parse_command_line().files
reader = docreader.DocumentStreamReader(arg[1:])
encoder_str = arg[0]

if encoder_str == 'varbyte':
    encoder = varbyte
elif encoder_str == 'simple9':
    encoder = simple9
else:
    raise ValueError('Unsupported encoder: {}'.format(encoder_str))

#for i in urls:
#    print i.text.encode("utf-8")
#    break

term_dictionary = {}
url_list = []

doc_id = 0
for url in reader:
    doc_id += 1
    url_list.append(url.url)
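
The example is cut off right after collecting the URLs, so term_dictionary is never filled. Below is a plausible shape for the whole indexing loop and the compression step, modelled on Examples #6 and #8; the mmh3 hashing, the per-list encode call, and the output file name are assumptions, not the original author's code.

# Hypothetical completion of the truncated loop above.
doc_id = 0
for doc in reader:
    doc_id += 1
    url_list.append(doc.url)
    for word in set(doc2words.extract_words(doc.text)):
        key = mmh3.hash(word.encode('utf-8'))
        term_dictionary.setdefault(key, []).append(doc_id)

# Compress every posting list with the encoder chosen on the command line.
for key in term_dictionary:
    term_dictionary[key] = encoder.encode(term_dictionary[key])

with open('index.pkl', 'wb') as f:   # output file name is an assumption
    pickle.dump({'encoder': encoder_str,
                 'index': term_dictionary,
                 'urls': url_list}, f)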
Example #5
import pickle
import sys
import time

import numpy as np

import docreader
import simple9
import varbyte

if __name__ == '__main__':
    encoding = sys.argv[1]
    files = sys.argv[2:]
    data = {'encoding': encoding, 'index': {}, 'urls': []}
    reader = docreader.DocumentStreamReader(files)
    buf = dict()  # for simple9
    start_time = time.time()
    for doc in reader:
        words = set(docreader.extract_words(doc.text))
        data['urls'].append(doc.url.encode('utf-8'))
        words = [w.encode('utf-8') for w in words]
        url_pos = len(data['urls']) - 1
        for w in words:
            if encoding == 'varbyte':
                if w in data['index']:
                    data['index'][w] += varbyte.vb_encode(url_pos)
                else:
                    data['index'][w] = varbyte.vb_encode(url_pos)
            elif encoding == 'simple9':
                if w in data['index']:
Example #6
import docreader
import doc2words
from collections import defaultdict
import pickle

args = docreader.parse_command_line().files

compressor_type = args[0]

if compressor_type == 'varbyte':
	import varbyte
	compressor = varbyte
elif compressor_type == 'simple9':
	import simple9
	compressor = simple9

docs = docreader.DocumentStreamReader(args[1:])

doc_id = 1
term2doc = defaultdict(list)
doc_id2url = []
for doc in docs:
	doc_id2url.append(doc.url)
	words = doc2words.extract_words(doc.text)
	unique_words = set(words)
	for word in unique_words:
		key = abs(hash(word.encode('utf-8')))
		term2doc[key].append(doc_id)
	doc_id += 1

for key in term2doc:
	term2doc[key] = compressor.code(term2doc[key])
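
At query time the same hashing has to be applied to the query word before lookup. A small sketch follows, assuming the compressor module also exposes a decode function; that name is hypothetical, since only code() appears in the example.

# Query-time lookup; compressor.decode is a hypothetical counterpart of compressor.code.
def postings_for(word):
    key = abs(hash(word.encode('utf-8')))
    if key not in term2doc:
        return []
    return compressor.decode(term2doc[key])


for doc_id in postings_for(u'example'):   # placeholder query term
    print doc_id2url[doc_id - 1]          # doc_id starts at 1 in the indexing loop above

This relies on the built-in hash() giving the same value at indexing and query time, which holds for byte strings in Python 2 as long as hash randomization (-R) is not enabled.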
Example #7
import argparse
import collections
import pickle

import doc2words
import docreader
import simple9
import varbyte

# argparse setup (reconstructed from how `args` is used below).
parser = argparse.ArgumentParser()
parser.add_argument("method",
                    help="Posting-list compression method: varbyte or simple9")
parser.add_argument("files",
                    nargs="+",
                    help="Input files (.gz or plain) to process")
args = parser.parse_args()

if args.method == "varbyte":
    encoder = varbyte
elif args.method == "simple9":
    encoder = simple9
else:
    raise AssertionError(
        "Method {name} is not supported".format(name=args.method))

inverted_index = collections.defaultdict(list)
urls = []
reader = docreader.DocumentStreamReader(args.files)

for (doc_id, doc) in enumerate(reader):
    urls.append(doc.url)
    words = doc2words.extract_words(doc.text)
    for word in set(words):
        word = word.encode("utf-8")
        inverted_index[hash(word)].append(doc_id)

for key in inverted_index:
    inverted_index[key] = encoder.encode(inverted_index[key])

with open("index", "w") as file_index:
    pickle.dump(args.method, file_index)
    pickle.dump(inverted_index, file_index)
file_index.close()
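
Because two objects are pickled into the same file, they have to be unpickled in the same order when the index is read back:

import pickle

with open("index", "rb") as file_index:
    method = pickle.load(file_index)          # first object: the encoder name
    inverted_index = pickle.load(file_index)  # second object: hash(term) -> encoded postings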
Example #8
import doc2words
import docreader
import mmh3
import varbyte

args = docreader.parse_command_line().files
encoder_arg = args[0]
archive_args = args[1:]

if encoder_arg == 'varbyte':
    encoder = varbyte
else:
    print "Unsupported encoder"
    exit()

dictionary = {}
urls = []
""" Reading dataset file """
counter = 0
for entry in docreader.DocumentStreamReader(archive_args):
    urls.append(entry.url)
    counter += 1

    for word in set(doc2words.extract_words(entry.text)):
        # Bucket postings by the term's mmh3 hash (renamed to avoid shadowing the built-in hash()).
        token_hash = abs(mmh3.hash(word.encode("utf-8").lower()))
        dictionary.setdefault(token_hash, []).append(counter)
""" Compressing dictionary """
dictionary = {
    # Encode each full posting list, matching how encoder.encode is called in Example #7.
    entry: encoder.encode(dictionary[entry])
    for entry in dictionary
}
""" Storing index in memory """