Example #1
def get_data():
    reader = DocumentStreamReader(docreader.parse_command_line().files)
    terms = defaultdict(list)
    ind = 0
    urls = []
    for doc in reader:
        for word in set(doc2words.extract_words(doc.text)):
            terms[word].append(ind)
        ind += 1
        urls.append(doc.url)
    return terms, urls
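
A quick usage sketch for the index this returns (the query words 'foo' and 'bar' are hypothetical; get_data and urls come from the example itself): each posting list holds ascending document ids, so a boolean AND query is a set intersection followed by an id-to-url lookup.

terms, urls = get_data()
matching = set(terms.get('foo', [])) & set(terms.get('bar', []))
for doc_id in sorted(matching):
    print urls[doc_id]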
Example #2
def create_index(args):
    reader = DocumentStreamReader(args[2:])
    if args[1] == 'varbyte':
        # the original snippet constructed Vocabulary(Simple9) in both branches;
        # a Varbyte codec class is assumed here to match this branch
        vocabulary = Vocabulary(Varbyte)
    elif args[1] == 'simple9':
        vocabulary = Vocabulary(Simple9)
    else:
        raise AssertionError('Expected varbyte|simple9 as a compressor')

    for doc in reader:
        for word in extract_words(doc.text):
            vocabulary.append(word, doc.url)

    dump(args[0], vocabulary)
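
The Simple9 codec used above is not shown; the sketch below is the classic textbook scheme, not this example's actual Simple9 class. Each 32-bit word spends 4 bits on a selector (an index into one of nine layouts, hence the name) and packs 1 to 28 small integers into the remaining 28 bits:

# (count, bits per value) layouts; the selector is the index into this table
SIMPLE9_LAYOUTS = [(28, 1), (14, 2), (9, 3), (7, 4), (5, 5),
                   (4, 7), (3, 9), (2, 14), (1, 28)]

def simple9_encode(numbers):
    words = []
    i = 0
    while i < len(numbers):
        for selector, (count, bits) in enumerate(SIMPLE9_LAYOUTS):
            chunk = numbers[i:i + count]
            # a layout fits if it packs exactly `count` values, each within `bits` bits
            if len(chunk) == count and all(x < (1 << bits) for x in chunk):
                word = selector << 28
                for j, x in enumerate(chunk):
                    word |= x << (j * bits)
                words.append(word)
                i += count
                break
        else:
            raise ValueError('value does not fit in 28 bits')
    return words

def simple9_decode(words):
    out = []
    for word in words:
        count, bits = SIMPLE9_LAYOUTS[word >> 28]
        mask = (1 << bits) - 1
        for j in range(count):
            out.append((word >> (j * bits)) & mask)
    return out

Trailing values that cannot fill a wide layout fall through to a narrower one, so every emitted word is fully used and the decoder needs no padding logic.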
Example #3
def main():
    encMode = sys.argv[1]
    enc.changeMode(encMode)
    denc.changeMode(encMode)    
    with open(func.PATH + "urls.list", "a") as urls:
        docId = 0
        for path in sys.argv[2:]:
            for doc in DocumentStreamReader(path):
                urls.write(doc.url + "\n")
                parse(doc.text, docId)
                docId += 1
    flush()
    with open(func.PATH + "preDict.data", 'ab') as dct:
        dct.write(struct.pack('B', 1 if encMode == 'simple9' else 0))
Example #4
def make_dictionary_urlid():
    id_url = {}
    term_doc = {}
    reader = DocumentStreamReader(parse_command_line().files)
    i = 0
    for doc in reader:
        id_url[str(i)] = doc.url
        for word in extract_words(doc.text):
            if word not in term_doc:
                term_doc[word] = [i]
            elif term_doc[word][-1] != i:
                term_doc[word].append(i)
        i += 1
    return term_doc, id_url
Example #5
def main(encoding, paths):
    reader = DocumentStreamReader(paths)

    if encoding == 'varbyte':
        encoder = VarbyteEncoder()
    elif encoding == 'simple9':
        encoder = Simple9Encoder()
    else:
        raise Exception("Unsupported encoding!")

    ct = time.clock()
    for doc in reader:
        url = doc.url
        words = set([w.encode('utf-8') for w in extract_words(doc.text)])
        encoder.add_document(url, words)

    encoder.write_to_file("index.txt")
    print "Time for index creation: {}".format(1000 * (time.clock() - ct))
Example #6
def main():
    mhc = MinshinglesCounter()

    signatures = []
    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):
            signatures.append((doc.url, mhc.count(doc.text)))
    size = len(signatures)
    for i in range(size):
        for j in range(i + 1, size):
            sig1 = signatures[i]
            sig2 = signatures[j]
            if sig1[1] is None or sig2[1] is None:
                continue
            shset1 = set(sig1[1])
            shset2 = set(sig2[1])
            jaccard = len(shset1 & shset2) / float(len(shset1 | shset2))
            if jaccard > 0.75:
                print ' '.join([sig1[0], sig2[0], str(jaccard)])
Example #7
def main():
    mhc = MinshinglesCounter()

    res = {}
    doc_names = {}
    counter = 0
    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):

            counter += 1
            doc_names[counter] = doc.url
            minsh = mhc.count(doc.text)

            if minsh is None:
                continue

            for sh in minsh:
                res.setdefault(sh, []).append(counter)

    count_pairs = {}
    for l in res:
        curr = res[l]
        for i in range(0, len(curr)):
            for j in range(i + 1, len(curr)):

                tmp = (curr[i], curr[j])

                if curr[i] > curr[j]:
                    tmp = (curr[j], curr[i])

                count_pairs[tmp] = count_pairs.get(tmp, 0) + 1

    for (a, b) in count_pairs:

        m = count_pairs.get((a, b))
        if m is None or a == b or doc_names.get(a) == doc_names.get(b):
            continue
        l = (1.0 * m) / (mhc.n + mhc.n - m)
        if l > 0.75:
            print doc_names.get(a) + " " + doc_names.get(b) + " " + str(l)
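
A note on the similarity formula in this and the following examples: each document keeps n minshingles (mhc.n here, hard-coded as 20 below). If m of them coincide between two documents, the code treats the two sketches as if they were the full shingle sets, so |A ∩ B| ≈ m and |A ∪ B| ≈ n + n − m, giving the Jaccard estimate m / (2n − m). With the 0.75 threshold this requires m ≥ 18, which is exactly the cnt > 17 test in Example #9.

def jaccard_estimate(m, n=20):
    # m matching minshingles out of n kept per document
    return float(m) / (2 * n - m)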
Example #8
def main():
    data = []
    mhc = MinshinglesCounter()
    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):
            mhc_c = mhc.count(doc.text)
            data.append((doc.url, mhc_c))

    for i in range(len(data)):
        if data[i][1] is None:
            continue
        for j in range(i + 1, len(data)):
            a = data[i][1]
            b = data[j][1]
            if b is None or data[i][0] == data[j][0]:
                continue
            ok = 0
            for x in a:
                if x in b:
                    ok += 1
            score = ok / float(ok + 2 * (20 - ok))
            if score >= 0.75:
                print("{} {} {}".format(data[i][0], data[j][0], score))
Example #9
def main():
    mhc = MinshinglesCounter()

    ids = []
    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):
            lst = mhc.count(doc.text)
            if lst is not None:
                ids.append((doc.url, lst))

    # compare all signatures once, after every path has been read
    for i in range(len(ids)):
        for j in range(i + 1, len(ids)):
            sh1 = ids[i][1]
            sh2 = ids[j][1]
            u1 = ids[i][0]
            u2 = ids[j][0]
            if u1 == u2:
                continue
            cnt = 0
            for x in sh1:
                if x in sh2:
                    cnt += 1
            if cnt > 17:
                print "%s %s %f" % (u1, u2, cnt / (cnt + 2 * (20 - cnt) + 0.0))
Example #10
urls = {}  # assumed: module-level dict mapping doc.url -> id; used by main() below but not defined in the snippet

def main():
    minshingle_dim = 20
    mhc = MinshinglesCounter()
    ind_url = 0
    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):
            if doc.url not in urls:
                urls[doc.url] = ind_url
                ind_url += 1
                mhc.count(doc.text, doc.url)
                #print "%s (text length: %d, minhashes: %s)" % (doc.url, len(doc.text), mhc.count(doc.text, doc.url))

    global urls_intersect
    urls_intersect = numpy.zeros((len(urls), len(urls)))
    broder_merge()
    result_definition(minshingle_dim)
    """
    You may examine content of given files this way (as example):

    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):
            print "%s (text length: %d, minhashes: %s)" % (doc.url, len(doc.text), mhc.count(doc.text))
    """
    """
Example #11
import docreader
from docreader import DocumentStreamReader
import index_creation
import bitstream
import cPickle
import mmhash
import dict_hash

if __name__ == '__main__':
    reader = DocumentStreamReader(docreader.parse_command_line().files)
    index = index_creation.Url_Index()
    for doc in reader:
        index.scan_text(doc)
    blob = []
    term = dict()
    for k, v in index.terms.iteritems():
        prev_len = len(blob)
        compr = bitstream.compress_varbyte(v)
        blob.extend(compr)
        term[mmhash.get_unsigned_hash(
            k.encode('utf8'))] = [prev_len, len(compr)]

    with open("index.txt", "wb") as index_file:
        index_file.write(bytearray(blob))

    with open("url_file.txt", "wb") as url_file:
        cPickle.dump(index.url, url_file)

    dict_hash.store_dict(term)
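
bitstream.compress_varbyte is not shown in this example; the sketch below is an assumption about what a varbyte codec for posting lists typically looks like (delta encoding plus a high-bit terminator), not necessarily this module's actual implementation:

def compress_varbyte(numbers):
    # delta-encode ascending doc ids, then emit each gap as 7-bit groups,
    # low bits first; the high bit marks the last byte of a gap
    out = bytearray()
    prev = 0
    for n in numbers:
        gap = n - prev
        prev = n
        while True:
            byte = gap & 0x7F
            gap >>= 7
            if gap:
                out.append(byte)
            else:
                out.append(byte | 0x80)
                break
    return out

def decompress_varbyte(data):
    numbers, current, shift, prev = [], 0, 0, 0
    for byte in bytearray(data):
        current |= (byte & 0x7F) << shift
        shift += 7
        if byte & 0x80:      # last byte of this gap
            prev += current  # undo the delta encoding
            numbers.append(prev)
            current, shift = 0, 0
    return numbers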
Example #12
    pass


def get_wordid(term):
    res = words.get_from_dict(term)
    if res is None:
        global word_count
        word_count += 1
        words.add(term, word_count)
        return word_count
    return res


if __name__ == '__main__':
    cmd = parse_command_line()
    reader = DocumentStreamReader(cmd.files)
    if cmd.code[0] == "varbyte":
        index = VarByte("docindex")
    else:
        index = Simple9("docindex")

    doc_count = -1

    for doc in reader:
        doc_count += 1
        add_doc(doc.url)

        terms = set(extract_words(doc.text))

        for term in terms:
            tmp = get_wordid(term)
Example #13
#!/usr/bin/env python
import argparse
import document_pb2
import struct
import gzip
import sys

from docreader import DocumentStreamReader
from index_creation import Index


def parse_command_line():
    parser = argparse.ArgumentParser(description='compressed documents reader')
    parser.add_argument('args',
                        nargs='+',
                        help='Input files (.gz or plain) to process')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_command_line().args
    compression = args.pop(0)
    reader = DocumentStreamReader(args)
    indexer = Index()
    for doc_id, doc in enumerate(reader):
        indexer.handle_doc(doc, doc_id)
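
None of these examples show DocumentStreamReader itself, but the imports in this one (struct, gzip, document_pb2) suggest a stream of length-prefixed protobuf messages. Below is a minimal sketch under that assumption; the message name document_pb2.document and the 4-byte little-endian length prefix are guesses, not the actual reader:

class SimpleDocumentStreamReader(object):
    def __init__(self, paths):
        self.paths = paths

    @staticmethod
    def open_stream(path):
        # transparently handle both .gz and plain files
        return gzip.open(path, 'rb') if path.endswith('.gz') else open(path, 'rb')

    def __iter__(self):
        for path in self.paths:
            with self.open_stream(path) as stream:
                while True:
                    header = stream.read(4)
                    if len(header) < 4:
                        break  # end of this file
                    size = struct.unpack('<i', header)[0]
                    doc = document_pb2.document()  # hypothetical message name
                    doc.ParseFromString(stream.read(size))
                    yield doc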
Example #14
    parsed_line = parse_command_line().files

    try:
        os.remove('varbyte.bin')
    except OSError:
        pass

    try:
        os.remove('simple9.bin')
    except OSError:
        pass

    if parsed_line[0] == 'varbyte':
        with open('varbyte.bin', 'wb') as f:
            f.write('a')
    else:
        with open('simple9.bin', 'wb') as f:
            f.write('a')

    reader = DocumentStreamReader(parsed_line[1:])

    cnt = 0
    for doc in reader:
        expand_back_index(doc, cnt)
        cnt += 1

    fout = open('back_index.bin', 'w')
    fids = open('ids.bin', 'w')
    save(fout, fids)
    fout.close()
    fids.close()
Example #15
def count_docs(argv):
    urls_count = defaultdict(int)
    for path in argv[1:]:
        for doc in DocumentStreamReader(path):
            urls_count[doc.url] += 1
    return urls_count
def read_docs(self, pathlist):
    for path in pathlist:
        for doc in DocumentStreamReader(path):
            yield (doc.url, doc.text)
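
A hypothetical driver for count_docs above (the duplicate-report loop is an assumption about its intended use, and sys is assumed to be imported):

if __name__ == '__main__':
    for url, n in count_docs(sys.argv).iteritems():
        if n > 1:
            print "%s appears %d times" % (url, n)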