def get_data():
    """Build a term -> list-of-doc-ids mapping and a parallel list of document URLs."""
    reader = DocumentStreamReader(docreader.parse_command_line().files)
    terms = defaultdict(list)
    ind = 0
    urls = []
    for doc in reader:
        for word in set(doc2words.extract_words(doc.text)):
            terms[word].append(ind)
        ind += 1
        urls.append(doc.url)
    return terms, urls
def create_index(args):
    reader = DocumentStreamReader(args[2:])
    if args[1] == 'varbyte':
        # Assuming a Varbyte codec class analogous to Simple9; the original passed
        # Simple9 in both branches, which looks like a copy-paste bug.
        vocabulary = Vocabulary(Varbyte)
    elif args[1] == 'simple9':
        vocabulary = Vocabulary(Simple9)
    else:
        raise AssertionError('Expected varbyte|simple9 as a compressor')
    for doc in reader:
        for word in extract_words(doc.text):
            vocabulary.append(word, doc.url)
    dump(args[0], vocabulary)
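# A minimal sketch of the varbyte scheme the compressor names above refer to
# (illustrative only; the repo's Varbyte/Simple9 classes are not shown here).
# Each integer is split into 7-bit groups, low group first; the high bit of a
# byte marks the last group of a number.
def varbyte_encode(numbers):
    out = bytearray()
    for n in numbers:
        while n >= 128:
            out.append(n & 0x7F)
            n >>= 7
        out.append(n | 0x80)  # terminator byte carries the final 7 bits
    return bytes(out)


def varbyte_decode(data):
    numbers, n, shift = [], 0, 0
    for b in bytearray(data):
        if b & 0x80:
            numbers.append(n | ((b & 0x7F) << shift))
            n, shift = 0, 0
        else:
            n |= b << shift
            shift += 7
    return numbers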
def main():
    encMode = sys.argv[1]
    enc.changeMode(encMode)
    denc.changeMode(encMode)
    with open(func.PATH + "urls.list", "a") as urls:
        docId = 0
        for path in sys.argv[2:]:
            for doc in DocumentStreamReader(path):
                urls.write(doc.url + "\n")
                parse(doc.text, docId)
                docId += 1
    flush()
    # Record the chosen encoding at the end of the dictionary file (1 = simple9, 0 = varbyte).
    with open(func.PATH + "preDict.data", 'ab') as dct:
        dct.write(struct.pack('B', 1 if encMode == 'simple9' else 0))
def make_dictionary_urlid():
    """Return (term -> list of doc ids, doc id (as str) -> url) mappings."""
    id_url = {}
    term_doc = {}
    reader = DocumentStreamReader(parse_command_line().files)
    i = 0
    for doc in reader:
        id_url[str(i)] = doc.url
        for word in extract_words(doc.text):
            if word not in term_doc:
                term_doc[word] = [i]
            elif term_doc[word][-1] != i:
                term_doc[word].append(i)
        i += 1
    return term_doc, id_url
def main(encoding, paths):
    reader = DocumentStreamReader(paths)
    if encoding == 'varbyte':
        encoder = VarbyteEncoder()
    elif encoding == 'simple9':
        encoder = Simple9Encoder()
    else:
        raise Exception("Unsupported encoding!")
    ct = time.clock()
    for doc in reader:
        url = doc.url
        words = set([w.encode('utf-8') for w in extract_words(doc.text)])
        encoder.add_document(url, words)
    encoder.write_to_file("index.txt")
    print "Time for index creation: {}".format(1000 * (time.clock() - ct))
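# A minimal sketch of the Simple9 idea behind Simple9Encoder (illustrative, not the
# repo's implementation): each 32-bit word holds a 4-bit selector plus 28 data bits,
# and the densest mode whose pending values all fit is chosen greedily.
SIMPLE9_MODES = [(28, 1), (14, 2), (9, 3), (7, 4), (5, 5), (4, 7), (3, 9), (2, 14), (1, 28)]


def simple9_pack(numbers):
    words, i = [], 0
    while i < len(numbers):
        for selector, (count, bits) in enumerate(SIMPLE9_MODES):
            chunk = numbers[i:i + count]
            if len(chunk) == count and all(x < (1 << bits) for x in chunk):
                word = selector
                for pos, x in enumerate(chunk):
                    word |= x << (4 + pos * bits)
                words.append(word)
                i += count
                break
        else:
            # Reached only when the next value needs more than 28 bits.
            raise ValueError("value does not fit into 28 bits")
    return words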
def main():
    mhc = MinshinglesCounter()
    signatures = []
    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):
            signatures.append((doc.url, mhc.count(doc.text)))
    size = len(signatures)
    # Pairwise comparison of minshingle signatures; report pairs with Jaccard above 0.75.
    for i in range(size):
        for j in range(i + 1, size):
            sig1 = signatures[i]
            sig2 = signatures[j]
            if sig1[1] is None or sig2[1] is None:
                continue
            shset1 = set(sig1[1])
            shset2 = set(sig2[1])
            jaccard = len(shset1 & shset2) / float(len(shset1 | shset2))
            if jaccard > 0.75:
                print ' '.join([sig1[0], sig2[0], str(jaccard)])
def main():
    mhc = MinshinglesCounter()
    res = {}        # minshingle value -> list of doc ids that contain it
    doc_names = {}  # doc id -> url
    counter = 0
    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):
            counter += 1
            doc_names[counter] = doc.url
            minsh = mhc.count(doc.text)
            if minsh is None:
                continue
            for sh in minsh:
                tmp = res.pop(sh, [])
                tmp.append(counter)
                res[sh] = tmp
    # Count how many minshingles each pair of documents shares.
    count_pairs = {}
    for l in res:
        curr = res[l]
        for i in range(0, len(curr)):
            for j in range(i + 1, len(curr)):
                tmp = (curr[i], curr[j])
                if curr[i] > curr[j]:
                    tmp = (curr[j], curr[i])
                count_pairs[tmp] = count_pairs.pop(tmp, 0) + 1
    for (a, b) in count_pairs:
        m = count_pairs.get((a, b))
        if m is None or a == b or doc_names.get(a) == doc_names.get(b):
            continue
        # Jaccard estimate from m shared minshingles out of mhc.n per document.
        l = (1.0 * m) / (mhc.n + mhc.n - m)
        if l > 0.75:
            print doc_names.get(a) + " " + doc_names.get(b) + " " + str(l)
def main():
    data = []
    mhc = MinshinglesCounter()
    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):
            mhc_c = mhc.count(doc.text)
            data.append((doc.url, mhc_c))
    for i in range(len(data)):
        if data[i][1] is None:
            continue
        for j in range(i + 1, len(data)):
            a = data[i][1]
            b = data[j][1]
            if b is None or data[i][0] == data[j][0]:
                continue
            ok = 0
            for x in a:
                if x in b:
                    ok += 1
            # Jaccard estimate from ok matching minshingles out of 20 per document.
            score = ok / float(ok + 2 * (20 - ok))
            if score >= 0.75:
                print("{} {} {}".format(data[i][0], data[j][0], score))
def main():
    mhc = MinshinglesCounter()
    ids = []
    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):
            lst = mhc.count(doc.text)
            if lst is not None:
                ids.append((doc.url, lst))
    for i in range(len(ids)):
        for j in range(i + 1, len(ids)):
            sh1 = ids[i][1]
            sh2 = ids[j][1]
            u1 = ids[i][0]
            u2 = ids[j][0]
            if u1 == u2:
                continue
            cnt = 0
            for x in sh1:
                if x in sh2:
                    cnt += 1
            # More than 17 of 20 matching minshingles corresponds to an estimated Jaccard above 0.75.
            if cnt > 17:
                print "%s %s %f" % (u1, u2, cnt / (cnt + 2 * (20 - cnt) + 0.0))
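# The similarity formula used above (and in the two preceding scripts) can be read as a
# sketch-based Jaccard estimate: with n minshingles per document and m of them shared,
# the intersection is approximated by m and the union by 2n - m, giving J ~ m / (2n - m).
# Illustrative helper only; n = 20 is an assumption matching the constants above.
def estimate_jaccard(m, n=20):
    return float(m) / (2 * n - m)

# Example: 18 of 20 shared minshingles -> 18 / 22 ~ 0.818, above the 0.75 threshold;
# 17 of 20 -> 17 / 23 ~ 0.739, below it.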
def main():
    minshingle_dim = 20
    mhc = MinshinglesCounter()
    ind_url = 0
    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):
            if doc.url not in urls:
                urls[doc.url] = ind_url
                ind_url += 1
            mhc.count(doc.text, doc.url)
            # print "%s (text length: %d, minhashes: %s)" % (doc.url, len(doc.text), mhc.count(doc.text, doc.url))
    global urls_intersect
    urls_intersect = numpy.zeros((len(urls), len(urls)))
    broder_merge()
    result_definition(minshingle_dim)

"""
You may examine content of given files this way (as example):
    for path in sys.argv[1:]:
        for doc in DocumentStreamReader(path):
            print "%s (text length: %d, minhashes: %s)" % (doc.url, len(doc.text), mhc.count(doc.text))
"""
import docreader
from docreader import DocumentStreamReader
import index_creation
import bitstream
import cPickle
import mmhash
import dict_hash

if __name__ == '__main__':
    reader = DocumentStreamReader(docreader.parse_command_line().files)
    index = index_creation.Url_Index()
    for doc in reader:
        index.scan_text(doc)
    blob = []
    term = dict()
    # For every term store (offset, length) of its varbyte-compressed posting list inside blob,
    # keyed by the term's unsigned hash.
    for k, v in index.terms.iteritems():
        prev_len = len(blob)
        compr = bitstream.compress_varbyte(v)
        blob.extend(compr)
        term[mmhash.get_unsigned_hash(k.encode('utf8'))] = [prev_len, len(compr)]
    with open("index.txt", "wb") as index_file:
        index_file.write(bytearray(blob))
    with open("url_file.txt", "wb") as url_file:
        cPickle.dump(index.url, url_file)
    dict_hash.store_dict(term)
def get_wordid(term):
    """Return the id of term, assigning a new one if it has not been seen before."""
    res = words.get_from_dict(term)
    if res is None:
        global word_count
        word_count += 1
        words.add(term, word_count)
        return word_count
    return res


if __name__ == '__main__':
    cmd = parse_command_line()
    reader = DocumentStreamReader(cmd.files)
    if cmd.code[0] == "varbyte":
        index = VarByte("docindex")
    else:
        index = Simple9("docindex")
    doc_count = -1
    for doc in reader:
        doc_count += 1
        add_doc(doc.url)
        terms = set(extract_words(doc.text))
        for term in terms:
            tmp = get_wordid(term)
#!/usr/bin/env python
import argparse
import document_pb2
import struct
import gzip
import sys

from docreader import DocumentStreamReader
from index_creation import Index


def parse_command_line():
    parser = argparse.ArgumentParser(description='compressed documents reader')
    parser.add_argument('args', nargs='+', help='Input files (.gz or plain) to process')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_command_line().args
    compression = args.pop(0)  # first positional argument selects the compression method
    reader = DocumentStreamReader(args)
    indexer = Index()
    for doc_id, doc in enumerate(reader):
        indexer.handle_doc(doc, doc_id)
parsed_line = parse_command_line().files

# Remove stale marker files; the one recreated below records which compression was chosen.
try:
    os.remove('varbyte.bin')
except OSError:
    pass
try:
    os.remove('simple9.bin')
except OSError:
    pass

if parsed_line[0] == 'varbyte':
    with open('varbyte.bin', 'wb') as f:
        f.write('a')
else:
    with open('simple9.bin', 'wb') as f:
        f.write('a')

reader = DocumentStreamReader(parsed_line[1:])
cnt = 0
for doc in reader:
    expand_back_index(doc, cnt)
    cnt += 1

fout = open('back_index.bin', 'w')
fids = open('ids.bin', 'w')
save(fout, fids)
fout.close()
fids.close()
def count_docs(argv):
    """Count how many times each URL occurs across all input files."""
    urls_count = defaultdict(int)
    for path in argv[1:]:
        for doc in DocumentStreamReader(path):
            urls_count[doc.url] += 1
    return urls_count
def read_docs(self, pathlist):
    """Yield (url, text) pairs for every document in the given files."""
    for path in pathlist:
        for doc in DocumentStreamReader(path):
            yield (doc.url, doc.text)