Example #1
import sys
from collections import Counter
from itertools import combinations, groupby
from operator import itemgetter

from docreader import DocumentStreamReader
# MinshinglesCounter comes from the task's helper code; its import is omitted
# in these snippets.


def main():
    mhc = MinshinglesCounter()
    URLs = []
    pairs = []
    docid = 0
    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):
            URLs.append(doc.url)
            minshingles = mhc.count(doc.text)
            if minshingles:
                for msh in minshingles:
                    pairs.append((msh, docid))
            docid += 1

    pairs.sort()
    grouped_docs = []
    for _, group in groupby(pairs, itemgetter(0)):
        lst = list(group)
        if len(lst) >= 2:
            grouped_docs.append(sorted({docid for _, docid in lst}))

    counts = Counter()
    for doc in grouped_docs:
        for c in combinations(doc, 2):
            counts[c] += 1

    for key, N in counts.iteritems():
        # N matching minshingles out of 20 gives estimated Jaccard N / (40 - N)
        similarity = N / float(N + 2 * (20 - N))
        if similarity >= 0.75:
            print "%s %s %f" % (URLs[key[0]], URLs[key[1]], similarity)
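A note on the similarity formula above (the same estimate motivates the cnt > 17 checks in Examples #4 and #16): with 20 minshingles per document and N of them matching, the intersection is estimated as N and the union as N + 2 * (20 - N), so similarity = N / (40 - N). The sketch below is illustrative, not part of any solution; it shows the 0.75 threshold is first reached at N = 18.

def estimated_similarity(N, n=20):
    # |A & B| ~ N, |A | B| ~ N + 2 * (n - N)
    return N / float(N + 2 * (n - N))

for N in range(15, 21):
    print N, estimated_similarity(N)  # crosses 0.75 between N = 17 and N = 18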
Example #2
def main():
    mhc = MinshinglesCounter()
    """
    You may examine content of given files this way (as example):

    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):
            print "%s (text length: %d, minhashes: %s)" % (doc.url, len(doc.text), mhc.count(doc.text))
    """
    """
    Write your actual code here.
    Good luck!
    """
    urls = []
    mshs = {}
    i = 0
    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):
            if doc.url not in urls:
                mshs_loc = mhc.count(doc.text)
                if mshs_loc:
                    urls.append(doc.url)
                    for j_msh in mshs_loc:
                        mshs.setdefault(j_msh, []).append(i)
                    i += 1
    counter = dict()
    for item in (x for x in mshs.values() if len(x) > 1):
        for pair in (p for p in combinations(item, 2) if p[0] < p[1]):
            counter[pair] = counter.get(pair, 0) + 1
    for item, value in counter.items():
        if value > LIMIT:  # LIMIT and POWER are module-level constants defined elsewhere in the solution
            print urls[item[0]], urls[item[1]], float(value) / POWER
Example #3
def main():
    files = sys.argv[1:]  # all input files in current test case
    minshingles_counter = MinshinglesCounter(window=5, n=20)
    reader = DocumentStreamReader(files)

    # docid -> set of "Msh_pos"
    groups = []
    # docid -> url
    urls = []

    for idx, doc in enumerate(reader):
        shs = minshingles_counter.count(normalize_text(doc.text))
        urls.append(doc.url)
        sh_idx = set()
        if shs:
            for jdx in xrange(20):
                sh_idx.add(str(shs[jdx]) + '_' + str(jdx))
        groups.append(sh_idx)

    # O(n^2) pairwise comparison of the position-tagged minshingle sets
    for idx, i_group in enumerate(groups):
        for jdx, j_group in enumerate(groups):
            if idx < jdx:
                measure = len(i_group & j_group) / 20.0
                if measure > 0.75:
                    print urls[idx], urls[jdx], measure
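A short aside on the str(shs[jdx]) + '_' + str(jdx) trick above: tagging each minshingle with its position makes the set intersection count only the positions where two signatures agree. A tiny illustration with made-up shingle values:

sig_a = set(['101_0', '202_1', '303_2'])
sig_b = set(['101_0', '999_1', '303_2'])
print len(sig_a & sig_b)  # 2: positions 0 and 2 agree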
Example #4
def main():
    mhc = MinshinglesCounter()

    ids = []
    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):
            #            print "%s (text length: %d, minhashes: %s)" % (doc.url, len(doc.text), mhc.count(doc.text))
            lst = mhc.count(doc.text)
            if lst != None:
                for x in lst:
                    ids.append((x, doc.url))
#    print(len(ids))
    ids = list(set(ids))
    ids.sort()
    #    print(len(ids))
    ids2 = []

    # mm[i] = length of the run of equal shingles starting at position i
    mm = []
    for i in range(len(ids)):
        cur_sh = ids[i][0]
        cnt = 1
        while i + cnt < len(ids) and ids[i + cnt][0] == cur_sh:
            cnt += 1
        mm.append(cnt)

    mm.append(0)
    mm.sort()

    # heuristic cutoff: ignore shingles shared by a huge number of documents
    bb = max(mm[-2], 1900)

    # for every shingle run below the cutoff, emit all (url, url) pairs in the run
    i = 0
    while i < len(ids):
        cur_sh = ids[i][0]
        cur_id = ids[i][1]
        cnt = 1
        while i + cnt < len(ids) and ids[i + cnt][0] == cur_sh:
            cnt += 1
        if cnt >= bb:
            i += cnt
            continue
        j = i + 1
        while j < len(ids) and ids[j][0] == cur_sh:
            ids2.append((cur_id, ids[j][1]))
            j += 1
        i += 1


    ids2.sort()
    i = 0
    while i < len(ids2):
        pos = i + 1
        while pos < len(ids2) and ids2[pos] == ids2[i]:
            pos += 1
        (u1, u2, cnt) = (ids2[i][0], ids2[i][1], pos - i)
        if cnt > 17:  # 18+ matches means estimated similarity >= 0.75
            print "%s %s %f" % (u1, u2, cnt / float(cnt + 2 * (20 - cnt)))
        i = pos
Example #5
def get_docs(argv):
    urls_set = set()
    for path in argv[1:]:
        for doc in DocumentStreamReader(path):
            if doc.url not in urls_set:
                urls_set.add(doc.url)
                yield doc
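A minimal usage sketch for the get_docs generator above, assuming the same sys.argv convention as the other solutions: it streams documents while skipping URLs it has already yielded.

import sys

for doc in get_docs(sys.argv):
    # every doc.url is yielded at most once
    print doc.url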
Example #6
def main():
    mhc = MinshinglesCounter(window=WINDOW, n=N)

    docurls = []
    docshingles = []
    for path in sys.argv[1:]:
        reader = DocumentStreamReader(path)
        for doc in reader:
            docurls.append(doc.url)
            index = len(docurls) - 1

            minshingles = mhc.count(doc.text)
            docshingles.append(minshingles)

    number = 0
    for i in range(len(docurls)):
        for j in range(i + 1, len(docurls)):
            if (docshingles[i] is not None) and (docshingles[j] is not None):
                # s = how many of doc i's minshingles occur in doc j's signature
                s = 0
                for k in range(N):
                    if docshingles[i][k] in docshingles[j]:
                        s += 1

                temp = s / float(s + 2 * (N - s))
                if temp > 0.75:
                    number += 1
                    print "{} {} {}".format(docurls[i], docurls[j], temp)

    print "Total = " + str(number)
Example #7
def get_data():
    reader = DocumentStreamReader(docreader.parse_command_line().files)
    terms = defaultdict(list)
    ind = 0
    urls = []
    for doc in reader:
        for word in set(doc2words.extract_words(doc.text)):
            terms[word].append(ind)
        ind += 1
        urls.append(doc.url)
    return terms, urls
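The terms dict built above is a small in-memory inverted index: each word maps to the list of ids of documents containing it, in increasing order. A hedged sketch of a two-word AND query over it (the query words 'foo' and 'bar' are hypothetical):

terms, urls = get_data()
hits = set(terms.get('foo', [])) & set(terms.get('bar', []))
for doc_id in sorted(hits):
    print urls[doc_id]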
Example #8
def create_index(args):
    reader = DocumentStreamReader(args[2:])
    if args[1] == 'varbyte':
        vocabulary = Vocabulary(Simple9)  # NB: the varbyte branch also gets Simple9 here, most likely a bug
    elif args[1] == 'simple9':
        vocabulary = Vocabulary(Simple9)
    else:
        raise AssertionError('Expected varbyte|simple9 as a compressor')

    for doc in reader:
        for word in extract_words(doc.text):
            vocabulary.append(word, doc.url)

    dump(args[0], vocabulary)
Example #9
def main():
    encMode = sys.argv[1]
    enc.changeMode(encMode)
    denc.changeMode(encMode)
    with open(func.PATH + "urls.list", "a") as urls:
        docId = 0
        for path in sys.argv[2:]:
            for doc in DocumentStreamReader(path):
                urls.write(doc.url + "\n")
                parse(doc.text, docId)
                docId += 1
    flush()
    with open(func.PATH + "preDict.data", 'ab') as dct:
        # trailing flag byte records the encoding: 1 = simple9, 0 = varbyte
        dct.write(struct.pack('B', 1 if encMode == 'simple9' else 0))
Example #10
def make_dictionary_urlid():
    id_url = {}
    term_doc = {}
    reader = DocumentStreamReader(parse_command_line().files)
    i = 0
    for doc in reader:
        id_url[str(i)] = doc.url
        for word in extract_words(doc.text):
            # append each doc id at most once per document
            postings = term_doc.setdefault(word, [])
            if not postings or postings[-1] != i:
                postings.append(i)
        i += 1
    return term_doc, id_url
Example #11
def main(encoding, paths):
    reader = DocumentStreamReader(paths)

    if encoding == 'varbyte':
        encoder = VarbyteEncoder()
    elif encoding == 'simple9':
        encoder = Simple9Encoder()
    else:
        raise Exception("Unsupported encoding!")

    ct = time.clock()
    for doc in reader:
        url = doc.url
        words = {w.encode('utf-8') for w in extract_words(doc.text)}
        encoder.add_document(url, words)

    encoder.write_to_file("index.txt")
    print "Time for index creation: {}".format(1000 * (time.clock() - ct))
Example #12
def main():
    mhc = MinshinglesCounter()

    signatures = []
    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):
            signatures.append((doc.url, mhc.count(doc.text)))
    size = len(signatures)
    for i in range(size):
        for j in range(i + 1, size):
            sig1 = signatures[i]
            sig2 = signatures[j]
            if sig1[1] is None or sig2[1] is None:
                continue
            # plain set Jaccard over the two signatures (shingle positions are ignored)
            shset1 = set(sig1[1])
            shset2 = set(sig2[1])
            jaccard = len(shset1 & shset2) / float(len(shset1 | shset2))
            if jaccard > 0.75:
                print ' '.join([sig1[0], sig2[0], str(jaccard)])
Example #13
def main():
    mhc = MinshinglesCounter()

    res = {}
    doc_names = {}
    counter = 0
    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):

            counter += 1
            doc_names[counter] = doc.url
            minsh = mhc.count(doc.text)

            if minsh is None:
                continue

            for sh in minsh:
                res.setdefault(sh, []).append(counter)

    count_pairs = {}
    for curr in res.values():
        for i in range(0, len(curr)):
            for j in range(i + 1, len(curr)):
                tmp = (curr[i], curr[j])
                if curr[i] > curr[j]:
                    tmp = (curr[j], curr[i])
                count_pairs[tmp] = count_pairs.get(tmp, 0) + 1

    for (a, b), m in count_pairs.items():
        if a == b or doc_names.get(a) == doc_names.get(b):
            continue
        # m shared minshingles out of n per signature => estimated Jaccard m / (2n - m)
        l = (1.0 * m) / (mhc.n + mhc.n - m)
        if l > 0.75:
            print doc_names.get(a) + " " + doc_names.get(b) + " " + str(l)
Example #14
def main():
    data = []
    mhc = MinshinglesCounter()
    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):
            mhc_c = mhc.count(doc.text)
            data.append((doc.url, mhc_c))

    for i in range(len(data)):
        if data[i][1] is None:
            continue
        for j in range(i + 1, len(data)):
            a = data[i][1]
            b = data[j][1]
            if b is None or data[i][0] == data[j][0]:
                continue
            # count minshingles present in both signatures
            ok = 0
            for x in a:
                if x in b:
                    ok += 1
            score = ok / float(ok + 2 * (20 - ok))
            if score >= 0.75:
                print("{} {} {}".format(data[i][0], data[j][0], score))
Example #15
def main():
    minshingle_dim = 20
    mhc = MinshinglesCounter()
    ind_url = 0
    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):
            if doc.url not in urls:  # urls is a module-level dict: url -> index
                urls[doc.url] = ind_url
                ind_url += 1
                mhc.count(doc.text, doc.url)

    global urls_intersect
    urls_intersect = numpy.zeros((len(urls), len(urls)))
    broder_merge()  # helper defined elsewhere in this solution
    result_definition(minshingle_dim)  # helper defined elsewhere in this solution
    """
    You may examine content of given files this way (as example):

    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):
            print "%s (text length: %d, minhashes: %s)" % (doc.url, len(doc.text), mhc.count(doc.text))
    """
    """
Example #16
def main():
    mhc = MinshinglesCounter()

    ids = []
    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):
            lst = mhc.count(doc.text)
            if lst is not None:
                ids.append((doc.url, lst))

    # compare signatures once, after all input files have been read
    for i in range(len(ids)):
        for j in range(i + 1, len(ids)):
            sh1 = ids[i][1]
            sh2 = ids[j][1]
            u1 = ids[i][0]
            u2 = ids[j][0]
            if u1 == u2:
                continue
            cnt = 0
            for x in sh1:
                if x in sh2:
                    cnt += 1
            if cnt > 17:  # 18+ matches means estimated similarity >= 0.75
                print "%s %s %f" % (u1, u2, cnt / float(cnt + 2 * (20 - cnt)))
Example #17
    def read_docs(self, pathlist):
        for path in pathlist:
            for doc in DocumentStreamReader(path):
                yield (doc.url, doc.text)
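A hedged usage sketch for the read_docs method above; the enclosing class is not shown in the snippet, so Indexer is a hypothetical name for it:

import sys

indexer = Indexer()  # hypothetical class that defines read_docs
for url, text in indexer.read_docs(sys.argv[1:]):
    print url, len(text)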
Example #18
import docreader
from docreader import DocumentStreamReader
import index_creation
import bitstream
import cPickle
import mmhash
import dict_hash

if __name__ == '__main__':
    reader = DocumentStreamReader(docreader.parse_command_line().files)
    index = index_creation.Url_Index()
    for doc in reader:
        index.scan_text(doc)
    # blob holds all compressed postings concatenated;
    # term maps word hash -> [offset, length] into the blob
    blob = []
    term = dict()
    for k, v in index.terms.iteritems():
        prev_len = len(blob)
        compr = bitstream.compress_varbyte(v)
        blob.extend(compr)
        term[mmhash.get_unsigned_hash(
            k.encode('utf8'))] = [prev_len, len(compr)]

    index_file = open("index.txt", "wb")
    index_file.write(bytearray(blob))

    url_file = open("url_file.txt", "wb")
    cPickle.dump(index.url, url_file)

    dict_hash.store_dict(term)
Example #19
def count_docs(argv):
    urls_count = defaultdict(int)
    for path in argv[1:]:
        for doc in DocumentStreamReader(path):
            urls_count[doc.url] += 1
    return urls_count
Example #20
    pass


def get_wordid(term):
    # return the term's id, assigning the next sequential id on first sight
    res = words.get_from_dict(term)
    if res is None:
        global word_count
        word_count += 1
        words.add(term, word_count)
        return word_count
    return res


if __name__ == '__main__':
    cmd = parse_command_line()
    reader = DocumentStreamReader(cmd.files)
    if cmd.code[0] == "varbyte":
        index = VarByte("docindex")
    else:
        index = Simple9("docindex")

    doc_count = -1

    for doc in reader:
        doc_count += 1
        add_doc(doc.url)

        terms = set(extract_words(doc.text))

        for term in terms:
            tmp = get_wordid(term)
Example #21
#!/usr/bin/env python
import argparse
import document_pb2
import struct
import gzip
import sys

from docreader import DocumentStreamReader
from index_creation import Index


def parse_command_line():
    parser = argparse.ArgumentParser(description='compressed documents reader')
    parser.add_argument('args',
                        nargs='+',
                        help='Input files (.gz or plain) to process')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_command_line().args
    compression = args.pop(0)  # first positional argument names the codec; the rest are input files
    reader = DocumentStreamReader(args)
    indexer = Index()
    for doc_id, doc in enumerate(reader):
        indexer.handle_doc(doc, doc_id)
Example #22
    parsed_line = parse_command_line().files

    try:
        os.remove('varbyte.bin')
    except OSError:
        pass

    try:
        os.remove('simple9.bin')
    except OSError:
        pass

    # leave a marker file whose presence records the chosen codec
    if parsed_line[0] == 'varbyte':
        with open('varbyte.bin', 'wb') as f:
            f.write('a')
    else:
        with open('simple9.bin', 'wb') as f:
            f.write('a')

    reader = DocumentStreamReader(parsed_line[1:])

    cnt = 0
    for doc in reader:
        expand_back_index(doc, cnt)
        cnt += 1

    with open('back_index.bin', 'w') as fout, open('ids.bin', 'w') as fids:
        save(fout, fids)