Example #1
def create_model(f, type='bigram'):
    res = {}
    for line in f:
        parts = line.split('\t')
        if len(parts) == 1:
            continue
        elif len(parts) != 2:
            raise ValueError
        if type == 'unigram':
            add_uni_query(
                res,
                '='.join(doc2words.extract_words(parts[1].decode('utf8'))),
                '='.join(doc2words.extract_words(parts[0].decode('utf8'))))
        elif type == 'bigram':
            add_bi_query(
                res,
                '='.join(doc2words.extract_words(parts[1].decode('utf8'))),
                '='.join(doc2words.extract_words(parts[0].decode('utf8'))))
        else:
            raise ValueError
    if type == 'unigram':
        for k in res.keys():
            res[k] = np.log(res[k])
    elif type == 'bigram':
        # N is assumed to be a module-level normalisation constant
        for k in res.keys():
            for k1 in res[k].keys():
                for k2 in res[k][k1].keys():
                    res[k][k1][k2] = np.log(float(res[k][k1][k2]) / N)
    return res
Example #2
def TakeShingles(text):
    # Build 5-word shingles from the text and hash each one with MurmurHash3.
    words = doc2words.extract_words(text)
    shingles = []
    # i runs up to len(words) - 5 inclusive, so the last full window is included
    for i in xrange(len(words) - 4):
        shingle = ' '.join(words[i:i + 5])
        shingles.append(mmh3.hash(shingle.encode('utf-8')))
    return shingles
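
TakeShingles only returns the hashed shingles. A minimal sketch of how two texts might be compared with them, assuming a Jaccard similarity over the shingle sets (the helper name below is an assumption, not part of the original example):

def shingle_similarity(text_a, text_b):
    # Hypothetical helper: Jaccard similarity of the two shingle sets.
    a = set(TakeShingles(text_a))
    b = set(TakeShingles(text_b))
    if not a and not b:
        return 1.0
    return float(len(a & b)) / len(a | b)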
Example #3
def scan_text(self, doc):
    for word in set(doc2words.extract_words(doc.text)):
        # print word
        # word = unicode (word, "utf-8")
        self.terms[word].append(self.ind)
    self.ind += 1
    self.url.append(doc.url)
def create_dict_varbyte(dict_file_name, support_file_name, info_file_name, url_file_name, reader, if_continue):
    url_file = open(url_file_name, 'a')

    if not if_continue:
        # term hash -> byte string of varbyte-encoded deltas between docids
        dict_ = defaultdict(str)
        # term hash -> last docid appended to its posting list
        last_docid = defaultdict(int)

        # largest docid seen so far
        max_docid = 0
        # total size of all posting lists
        lists_size = 0

        max_docid_prev = 0
    else:
        dict_pickle_file = open(dict_file_name + 'pickle', 'rb')
        dict_ = pickle.load(dict_pickle_file)
        dict_pickle_file.close()

        last_docid_pickle_file = open(support_file_name + '_last_pickle', 'rb')
        last_docid = pickle.load(last_docid_pickle_file)
        last_docid_pickle_file.close()

        max_docid_pickle_file = open(support_file_name + '_max_pickle', 'rb')
        max_docid_prev = pickle.load(max_docid_pickle_file)
        max_docid = max_docid_prev
        max_docid_pickle_file.close()

        lists_size_pickle_file = open(support_file_name + '_size_pickle', 'rb')
        lists_size = pickle.load(lists_size_pickle_file)
        lists_size_pickle_file.close()

    for docid, doc in enumerate(reader):
        if if_continue:
            docid += max_docid_prev
        max_docid = docid + 1
        url_file.write(doc.url + '\n')
        lists_size += create_dict_part_varbyte(dict_, last_docid, extract_words(doc.text), docid + 1)
    url_file.close()

    if not if_continue:
        dict_pickle_file = open(dict_file_name + 'pickle', 'wb')
        pickle.dump(dict_, dict_pickle_file)
        dict_pickle_file.close()

        last_docid_pickle_file = open(support_file_name + '_last_pickle', 'wb')
        pickle.dump(last_docid, last_docid_pickle_file)
        last_docid_pickle_file.close()

        max_docid_pickle_file = open(support_file_name + '_max_pickle', 'wb')
        pickle.dump(max_docid, max_docid_pickle_file)
        max_docid_pickle_file.close()

        lists_size_pickle_file = open(support_file_name + '_size_pickle', 'wb')
        pickle.dump(lists_size, lists_size_pickle_file)
        lists_size_pickle_file.close()
        quit()

    write_stuff(dict_file_name, support_file_name, info_file_name,
                dict_, lists_size, max_docid, encoding_varbyte_code)
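
create_dict_part_varbyte is not shown in this listing. A minimal sketch of what such a helper could look like, assuming terms are hashed with mmh3 as in the other examples and that each posting list is a byte string of varbyte-encoded docid deltas (as the comments above describe):

import mmh3

def create_dict_part_varbyte(dict_, last_docid, words, docid):
    # Hypothetical sketch: append the varbyte-encoded delta between this docid
    # and the term's previous docid to the term's byte string, and return the
    # number of bytes written.
    written = 0
    for word in set(words):
        term = mmh3.hash(word.encode('utf-8'))
        delta = docid - last_docid[term]
        if delta == 0:
            continue  # term already recorded for this document
        encoded = ''
        while delta >= 128:           # 7 payload bits per byte
            encoded += chr(delta % 128)
            delta //= 128
        encoded += chr(delta + 128)   # high bit marks the last byte of a number
        dict_[term] += encoded
        last_docid[term] = docid
        written += len(encoded)
    return written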
Example #5
def handle_doc(self, doc, doc_id):
    self.links.append(doc.url)
    words = set(extract_words(doc.text))
    for word in words:
        word_hash = get_hash(word.encode("UTF-8"))
        last_id, arr = self.index.get(word_hash, (0, list()))
        arr.append(doc_id - last_id)
        self.index[word_hash] = (doc_id, arr)
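
Each index entry built above is a (last_id, gap_list) pair. A small sketch of how the absolute document ids could be recovered from such an entry (the helper name is an assumption):

def decode_postings(entry):
    # Hypothetical counterpart of handle_doc: accumulate the stored gaps
    # back into absolute document ids.
    _, gaps = entry
    doc_ids, current = [], 0
    for gap in gaps:
        current += gap
        doc_ids.append(current)
    return doc_ids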
Example #6
def query_model(f):
    res = Counter()
    N = 0
    for line in f:
        line = line.decode('utf8')
        words = None
        parts = line.split('\t')
        if len(parts) == 1:
            words = doc2words.extract_words(parts[0])
        elif len(parts) == 2:
            words = doc2words.extract_words(parts[1])
        else:
            raise ValueError
        query = ' '.join(words)
        res[query] += 1
        N += 1
    for k in res.keys():
        res[k] = np.log(float(res[k]) / N)
    return res
Example #7
def unigram_model(f):
    res = Counter()
    N = 0
    for line in f:
        line = line.decode('utf8')
        words = None
        parts = line.split('\t')
        if len(parts) == 1:
            words = doc2words.extract_words(parts[0])
        elif len(parts) == 2:
            words = doc2words.extract_words(parts[1])
        else:
            raise ValueError
        N += len(words)
        for word in words:
            res[word] += 1
    for k in res.keys():
        res[k] = np.log(float(res[k]) / N)
    return res
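
unigram_model returns per-word log-probabilities. A minimal sketch of scoring a query with it, assuming the same fixed penalty for unseen words that Example #13 uses for the bigram model (the scorer name is an assumption):

def estimate_query_unigram(model, query):
    # Hypothetical scorer: sum the unigram log-probabilities, penalising
    # words that are missing from the model.
    words = doc2words.extract_words(query)
    w = 0
    for word in words:
        if word in model:
            w += model[word]
        else:
            w -= 10000
    return w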
Example #8
def get_data():
    reader = DocumentStreamReader(docreader.parse_command_line().files)
    terms = defaultdict(list)
    ind = 0
    urls = []
    for doc in reader:
        for word in set(doc2words.extract_words(doc.text)):
            terms[word].append(ind)
        ind += 1
        urls.append(doc.url)
    return terms, urls
Example #9
def create_index(args):
    reader = DocumentStreamReader(args[2:])
    if args[1] == 'varbyte':
        # note that both branches construct Vocabulary(Simple9)
        vocabulary = Vocabulary(Simple9)
    elif args[1] == 'simple9':
        vocabulary = Vocabulary(Simple9)
    else:
        raise AssertionError('Expected varbyte|simple9 as a compressor')

    for doc in reader:
        for word in extract_words(doc.text):
            vocabulary.append(word, doc.url)

    dump(args[0], vocabulary)
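
The dump helper used here is not part of the listing. A minimal sketch, assuming the vocabulary is simply pickled to the given path:

import pickle

def dump(path, vocabulary):
    # Hypothetical implementation: serialise the whole Vocabulary object.
    with open(path, 'wb') as f:
        pickle.dump(vocabulary, f, pickle.HIGHEST_PROTOCOL)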
Example #10
def bigram_model(f):
    res = {'': Counter()}
    N = 0
    for line in f:
        line = line.decode('utf8')
        words = None
        parts = line.split('\t')
        if len(parts) == 1:
            words = doc2words.extract_words(parts[0])
        elif len(parts) == 2:
            words = doc2words.extract_words(parts[1])
        else:
            raise ValueError
        N += len(words)
        for i in range(len(words) - 1):
            if words[i] not in res:
                res[words[i]] = Counter()
            res[words[i]][words[i + 1]] += 1
        if len(words) > 0:
            res[''][words[0]] += 1
    for k in res.keys():
        for k1 in res[k].keys():
            res[k][k1] = np.log(float(res[k][k1]) / N)
    return res
Example #11
def make_dictionary_urlid():
    id_url = {}
    term_doc = {}
    reader = DocumentStreamReader(parse_command_line().files)
    i = 0
    for doc in reader:
        id_url[str(i)] = doc.url
        for word in extract_words(doc.text):
            if word not in term_doc:
                term_doc[word] = [i]
            elif term_doc[word][-1] != i:
                term_doc[word].append(i)
        i += 1
    return term_doc, id_url
Example #12
def expand_back_index(doc, id):
    global ids

    ids[id] = doc.url

    words = set(extract_words(doc.text))

    for word in words:
        h = mmh3.hash(word.encode('utf-8'))
        # res is assumed to be a module-level dict: term hash -> packed docid bytes
        res.setdefault(h, bytearray()).extend(struct.pack('I', id))
Example #13
def estimate_query(query):
    # l_model is assumed to be a module-level bigram model ('' maps to first-word scores)
    words = doc2words.extract_words(query)
    if len(words) == 0:
        return 0
    w = 0
    if words[0].encode('utf8') not in l_model['']:
        w -= 10000
    else:
        w += l_model[''][words[0].encode('utf8')]
    for i in range(1, len(words)):
        if words[i - 1].encode('utf8') not in l_model:
            w -= 10000
        elif words[i].encode('utf8') not in l_model[words[i - 1].encode('utf8')]:
            w -= 10000
        else:
            w += l_model[words[i - 1].encode('utf8')][words[i].encode('utf8')]
    return w
Example #14
def main(encoding, paths):
    reader = DocumentStreamReader(paths)

    if encoding == 'varbyte':
        encoder = VarbyteEncoder()
    elif encoding == 'simple9':
        encoder = Simple9Encoder()
    else:
        raise Exception("Unsupported encoding!")

    ct = time.clock()
    for doc in reader:
        url = doc.url
        words = set([w.encode('utf-8') for w in extract_words(doc.text)])
        encoder.add_document(url, words)

    encoder.write_to_file("index.txt")
    print "Time for index creation: {}".format(1000 * (time.clock() - ct))
Example #15
def index_data(reader, archive_type):
    global doc_id, dic
    # num = 0

    for doc in reader:
        text = extract_words(doc.text)
        index_doc(doc.url, text)
        # num += 1
        # if not (num % 10000):
        #    print sys.getsizeof(dic) / (1024 * 1024)
        # if sys.getsizeof(dic) > 5e2:
        #    save_data(num)
        #    dic.clear()
        #    num += 1
        # save_data(num)

    code_data(dic, archive_type)
    save_obj([archive_type, dic], 'Data/compressed_dict.pckl')
    save_obj([archive_type, doc_id], 'Data/compressed_id.pckl')
def main(variant):
    with open('variant', 'w') as f:
        f.write(variant)

    encoder = Coder(variant)
    paths = []
    chunk_num = 0
    max_chunk_num = 2

    while True:
        tokens = {}
        i = 1
        if chunk_num == max_chunk_num:
            break

        documents = docreader.DocumentStreamReader(
            docreader.parse_command_line().files)
        for doc in documents:
            if chunk_num == 0:
                paths.append(doc.url)

            words = doc2words.extract_words(doc.text)

            for word in set(words):
                if word in tokens:
                    tokens[word].append(i)
                elif len(word) % max_chunk_num == chunk_num:
                    tokens[word] = array('l', [i])

            i += 1

        for token in tokens:
            tokens[token] = encoder.encode(tokens[token])

        with open('index{}.pkl'.format(chunk_num), 'wb') as f:
            pickle.dump(tokens, f)

        chunk_num += 1

    with open('paths.pkl', 'wb') as f:
        pickle.dump(paths, f)
Example #17
    def make_dict(self, doc_reader):
        for docID, doc in enumerate(doc_reader):
            url = doc.url  # not building a separate DocID <=> doc.url mapping
            body = doc.body  # none of the documents actually has a body
            text = doc.text
            words = doc2words.extract_words(text)
            self.urls[docID] = url
#             if docID == 21: 
#                 print doc.url
#                 print "BODY\n", doc.body
#                 print "TEXT\n", doc.text
#                 for word in words:
#                     if self.hash(word) == self.hash(u'сша'): print word
            for word in words:
                termID = self.hash(word)
                if termID in self.dict:
                    if docID not in self.dict[termID]:
                        self.dict[termID] = np.concatenate((self.dict[termID], [docID]))
                else:
                    self.dict[termID] = np.asarray([docID])
Example #18
def build_index_files(paths, max_doc_ids_per_index_file=10 ** 6):
    reader = docreader.DocumentStreamReader(paths)

    doc_urls = dict()

    next_file_id = 0
    # number of new (word, doc_id) postings added since the last dump
    last_doc_ids = 0
    file_names = []

    index = dict()
    for doc_id, doc in enumerate(reader):
        doc_urls[doc_id] = doc.url
        words = doc2words.extract_words(doc.text)
        for word in words:
            doc_ids = index.get(word, set())
            index[word] = doc_ids
            if doc_id not in doc_ids:
                last_doc_ids += 1
            doc_ids.add(doc_id)

            if last_doc_ids >= max_doc_ids_per_index_file:
                index_file_name = __get_index_file_name(next_file_id)
                file_names.append(index_file_name)
                dump_index_to_file(index, index_file_name)
                next_file_id += 1
                last_doc_ids = 0
                index.clear()

    if index:
        index_file_name = __get_index_file_name(next_file_id)
        file_names.append(index_file_name)
        dump_index_to_file(index, index_file_name)
        next_file_id += 1
        index.clear()

    return doc_urls, file_names
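
__get_index_file_name and dump_index_to_file are referenced but not shown. Minimal sketches, under the assumption that each index part is a numbered file holding a pickled word -> set-of-doc_ids mapping:

import pickle

def __get_index_file_name(file_id):
    # Hypothetical naming scheme for index part files.
    return 'index_part_{:06d}.pkl'.format(file_id)

def dump_index_to_file(index, file_name):
    # Hypothetical dump: pickle the word -> set(doc_id) mapping as-is.
    with open(file_name, 'wb') as f:
        pickle.dump(index, f, pickle.HIGHEST_PROTOCOL)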
def create_dict_simple9(dict_file_name, support_file_name, info_file_name, url_file_name, reader):
    url_file = open(url_file_name, 'a')

    # term hash -> uncompressed list of deltas between docids
    dict_ = defaultdict(list)
    # term hash -> last docid appended to its posting list
    last_docid = defaultdict(int)

    # largest docid seen so far
    max_docid = 0
    # total size of all compressed posting lists
    lists_size = 0

    for docid, doc in enumerate(reader):
        max_docid = docid + 1
        url_file.write(doc.url + '\n')
        create_dict_part_simple9(dict_, last_docid, extract_words(doc.text), docid + 1)
    url_file.close()

    for term_hash, list_ in dict_.iteritems():
        dict_[term_hash] = compress_list_simple9(list_)
        lists_size += len(dict_[term_hash])

    write_stuff(dict_file_name, support_file_name, info_file_name,
                dict_, lists_size, max_docid, encoding_simple9_code)
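
compress_list_simple9 is assumed here but not shown. A minimal sketch of Simple-9 packing, where each 32-bit word carries a 4-bit selector plus 28 payload bits split into equally sized slots (the layout table and function body are an illustration, not the original implementation):

SIMPLE9_LAYOUTS = [(28, 1), (14, 2), (9, 3), (7, 4), (5, 5), (4, 7), (3, 9), (2, 14), (1, 28)]

def compress_list_simple9(numbers):
    # Greedy packing: for each output word pick the densest layout whose slot
    # count is fully available and whose slot width fits every value in it.
    words = []
    i = 0
    while i < len(numbers):
        for selector, (count, bits) in enumerate(SIMPLE9_LAYOUTS):
            chunk = numbers[i:i + count]
            if len(chunk) == count and all(n < (1 << bits) for n in chunk):
                word = selector << 28
                for j, n in enumerate(chunk):
                    word |= n << (j * bits)
                words.append(word)
                i += count
                break
        else:
            raise ValueError('value does not fit into 28 bits')
    return words

Decoding reverses the process: read the selector from the top four bits, then extract that layout's count of values, each of its slot width.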
import sys
import codecs
import docreader
import pickle
from doc2words import extract_words
from collections import defaultdict


def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


sys.stdout = codecs.getwriter('utf8')(sys.stdout)
sys.stderr = codecs.getwriter('utf8')(sys.stderr)
reader = docreader.DocumentStreamReader(docreader.parse_command_line().files)
encoder_type = docreader.parse_command_line().encoder
fd = open("encoder.txt", "w")
fd.write(encoder_type)
fd.close()

URLs = {}
InvIndex = defaultdict(list)
for idx, doc in enumerate(reader):
    URLs[idx] = doc.url
    Terms = list(sorted(set(extract_words(doc.text))))
    for term in Terms:
        InvIndex[term].append(idx)

save_obj(InvIndex, "index")
save_obj(URLs, "urls")
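
The pickled index and urls above can be queried afterwards. A short sketch of a conjunctive (AND) lookup over them, assuming the query words are already normalised with extract_words and that a load_obj counterpart of save_obj exists:

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

def and_query(words):
    # Hypothetical lookup: intersect the posting lists of all query words
    # and map the surviving doc ids back to URLs.
    index = load_obj('index')
    urls = load_obj('urls')
    postings = [set(index.get(w, [])) for w in words]
    if not postings:
        return []
    docs = set.intersection(*postings)
    return [urls[i] for i in sorted(docs)]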
Example #21
def code_varbyte(docids):
    # varbyte-encode the delta list and prefix it with its total byte length
    res = bytearray()
    for id in docids:
        res += code_to_byte(id)
    res = struct.pack('I', len(res)) + res
    return res


if __name__ == '__main__':
    reader = docreader.DocumentStreamReader(
        docreader.parse_command_line().files)
    mdict = defaultdict(lambda: [])
    urls = []

    for doc in reader:
        urls.append(doc.url)
        for word in extract_words(doc.text):
            mdict[word].append(len(urls))
    for term in mdict.keys():
        docids = mdict[term]
        for i in reversed(range(1, len(docids))):
            docids[i] = docids[i] - docids[i - 1]
        docids = filter(lambda x: x != 0, docids)
        mdict[term] = docids

    for word in mdict.keys():
        mdict[word] = code_varbyte(mdict[word])

    id_url = {}
    term_position = {}
    with open('index', 'wb') as f:
        for term, coded_ids in mdict.iteritems():

if encoder_str == 'varbyte':
    encoder = varbyte
elif encoder_str == 'simple9':
    encoder = simple9

#for i in urls:
#    print i.text.encode("utf-8")
#    break

term_dictionary = {}
url_list = []

doc_id = 0
for url in reader:
    doc_id += 1
    url_list.append(url.url)
    words = doc2words.extract_words(url.text)
    uniq_words = list(set(words))

    for word in uniq_words:
        term_hash = abs(mmh3.hash(word.encode("utf-8")))
        term_dictionary.setdefault(term_hash, []).append(doc_id)


# sanity check: print the postings of a sample term
print term_dictionary[abs(mmh3.hash("энергоносители"))]
for key in term_dictionary:
    term_dictionary[key] = encoder.compress(term_dictionary[key])
Example #23
if encoder_arg == 'varbyte':
    encoder = varbyte
else:
    print "Unsupported encoder"
    exit()

dictionary = {}
urls = []
""" Reading dataset file """
counter = 0
for entry in docreader.DocumentStreamReader(archive_args):
    urls.append(entry.url)
    counter += 1

    for word in set(doc2words.extract_words(entry.text)):
        # lowercase the unicode word before encoding so non-ASCII letters are lowered too
        term_hash = abs(mmh3.hash(word.lower().encode("utf-8")))
        dictionary.setdefault(term_hash, []).append(counter)
""" Compressing dictionary """
dictionary = {
    entry: [encoder.encode(id) for id in dictionary[entry]]
    for entry in dictionary
}
""" Storing index in memory """
desc = open("./index_encoder", "w")
desc.write(encoder_arg)
desc.close()
Example #24
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import doc2words
import operator

image = {}
web = {}

with open('image_queries.txt', 'r') as f:
    for line in f:
        words = doc2words.extract_words(line)
        for word in words:
            image[word] = image.get(word, 0) + 1

with open('web_queries.txt', 'r') as f:
    for line in f:
        words = doc2words.extract_words(line)
        for word in words:
            web[word] = web.get(word, 0) + 1

proba = {}

for word in image:
    if word in web:
Example #25
        pickle.dump('varbyte', open('coding_type.txt', 'w'))
    elif coding == 'simple9':
        ''' simple9 works very slowly here and cannot pass test 25, so it is used only for indexing, not for searching
        coding = simple9
        encoding = simple9_decode
        pickle.dump('simple9', open('coding_type.txt', 'w'))
        #'''
        coding = varbyte
        encoding = varbyte_decode
        pickle.dump('varbyte', open('coding_type.txt', 'w'))
        #'''
    else:
        raise NotImplementedError

    reader = DocumentStreamReader(files)
    d = defaultdict(list)
    docid = -1
    docids = dict()
    for doc in reader:
        docid += 1
        words = extract_words(doc.text)
        for word in words:
            d[word].append(docid)
        docids[docid] = doc.url
    Pack(d, docids, coding)
    '''
    for key in docids.keys():
        print(key)
        print(docids[key])
    #'''
Example #26

compression_type = argv[1]
assert (compression_type == 'varbyte' or compression_type == 'simple9')

files_gz = argv[2:]


docReader = docreader.DocumentStreamReader(files_gz)

# parse texts and create index
print 'parse text'
index_dict = {}
index_url = []
for idx, doc in enumerate(docReader):
    words = doc2words.extract_words(doc.text)
    index_url.append(doc.url)

    for word in words:
        if word in index_dict:
            if index_dict[word][-1] != idx:
                index_dict[word].append(idx)
        else:
            index_dict[word] = [idx]


# save to pickle
cPickle.dump(index_dict, open('./pickle/index_dict.p', 'wb'))


# write urls
Example #27
    path = './index/'
    if not os.path.exists(path):
        os.makedirs(path)

    index, urls = {}, []

    reader = DocumentStreamReader(files)
    n_files, batch_size = 0, 5e4

    need_to_dump = False

    for doc_i, doc in enumerate(reader):
        need_to_dump = True

        urls.append(doc.url + '\n')
        terms = set(extract_words(doc.text))
        for term in terms:
            try:
                index[hash(term)].append(doc_i)
            except KeyError:
                # docid 0 is a sentinel ("fake" document): every posting list starts
                # from it, which keeps the delta encoding consistent when the index
                # is split across multiple part files
                index[hash(term)] = [0, doc_i]

        if (doc_i + 1) % batch_size == 0:
            dump_index_part(path + 'part_{0:03d}'.format(n_files), index,
                            encoding)
            for key in index.keys():
                # keep only the last docid so the next part's deltas start correctly
                index[key] = [index[key][-1]]
            n_files += 1
            need_to_dump = False
Example #28
def __init__(self, query):
    self.query = query
    self.words = doc2words.extract_words(query)
Example #29
#!/usr/bin/env python

from docindex import Docindex
from doc2words import extract_words

doc_texts = [
    {
        'url': '/get-set-update',
        'text': 'update get set'
    },
    {
        'url': '/get-set',
        'text': 'get set'
    },
    {
        'url': '/set',
        'text': 'set'
    },
]

if __name__ == '__main__':
    di = Docindex()
    for doc in doc_texts:
        print "%s\t%d bytes" % (doc['url'], len(doc['text']))
        words = extract_words(doc['text'])
        di.add_doc(doc['url'], words)
    di.to_file('index.pickle')
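
Docindex itself is not included in this listing. A minimal sketch of such a class, under the assumption that it keeps a plain word -> list-of-URLs mapping and pickles it in to_file:

import pickle
from collections import defaultdict

class Docindex(object):
    # Hypothetical minimal Docindex matching the calls above.
    def __init__(self):
        self.index = defaultdict(list)

    def add_doc(self, url, words):
        for word in set(words):
            self.index[word].append(url)

    def to_file(self, path):
        with open(path, 'wb') as f:
            pickle.dump(dict(self.index), f, pickle.HIGHEST_PROTOCOL)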