for line in fp:
    fields = line.strip().split('\t')
    fp_post_id_dict[long(fields[1])] = fields[0]
    fp_arr.append(long(fields[1]))
comment = []
with open('/home/lin.xiong/lsh_data/lsh.data', 'r') as comment_file:
    for line in comment_file:
        comment.append(line.strip().split('$&&$')[1])
fp_comment_tup = zip(fp_arr, comment)
fp_comment_dict = dict(fp_comment_tup)
if mode == '-s':
    print 'Matching by Simhash + hamming distance'
    # Time a brute-force scan: Hamming distance from the query fingerprint to every stored one
    tmp_dic = {}
    start_millis = int(round(time.time() * 1000))
    for fp in fp_arr:
        dist = hamming_distance(doc_fl_1.fingerprint, fp)
        tmp_dic[fp] = dist
    end_millis = int(round(time.time() * 1000))
    print end_millis - start_millis
    # Sort candidates by ascending Hamming distance and inspect the 100 closest
    dict_sorted = sorted(tmp_dic.items(), key=lambda d: d[1])
    concat = 0
    for fp_dist_tup in dict_sorted:
        if concat <= 99:
            bin_doc_1 = list(bin(doc_fl_1.fingerprint))
            print len(bin_doc_1),
            bin_doc_2 = list(bin(fp_dist_tup[0]))
            print len(bin_doc_2),
            bin_zip = zip(bin_doc_1, bin_doc_2)
            cnt = 0
            for bin1_bin2_tup in bin_zip:
                # Assumed loop body: count the positions where the two bit strings agree
                if bin1_bin2_tup[0] == bin1_bin2_tup[1]:
                    cnt += 1
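# Illustrative sketch: the scan above depends on the project's hamming_distance()
# helper, which is assumed here to XOR the two integer fingerprints and count the
# differing bits over a 64-bit width.
def hamming_distance_sketch(fp_a, fp_b, bits=64):
    diff = (fp_a ^ fp_b) & ((1 << bits) - 1)   # mask to the fingerprint width
    return bin(diff).count('1')                # number of differing bit positions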
# Build unicode string word dict
word_dict = {}
for idx, ascword in enumerate(word_list):
    word_dict[ascword.decode('utf8')] = idx
# Build nonzero feature vectors for both documents
fb = FeatureBuilder(word_dict)
doc_feat_1 = fb.compute(doc_token_1)
doc_feat_2 = fb.compute(doc_token_2)
# Init simhash builder and load both documents
smb = SimhashBuilder(word_list)
doc_fl_1 = DocFeatLoader(smb, doc_feat_1)
doc_fl_2 = DocFeatLoader(smb, doc_feat_2)
if mode == '-c':
    print 'Matching by VSM + cosine distance'
    dist = cosine_distance_nonzero(doc_fl_1.feat_vec, doc_fl_2.feat_vec, norm=False)
    # Higher cosine value means closer documents
    if dist > float(threshold):
        print 'Matching Result:\t<True:%s>' % dist
    else:
        print 'Matching Result:\t<False:%s>' % dist
elif mode == '-s':
    print 'Matching by Simhash + hamming distance'
    dist = hamming_distance(doc_fl_1.fingerprint, doc_fl_2.fingerprint)
    # Lower Hamming distance means closer fingerprints
    if dist < float(threshold):
        print 'Matching Result:\t<True:%s>' % dist
    else:
        print 'Matching Result:\t<False:%s>' % dist
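# Illustrative sketch: cosine_distance_nonzero() above is assumed to take sparse
# (term index, weight) feature vectors; with norm=False the vectors are not
# pre-normalised, so the computation is roughly the following. Despite the name,
# larger values indicate closer documents in the matching code above.
import math

def cosine_similarity_sketch(vec_a, vec_b):
    weights_a = dict(vec_a)                                   # {term index: weight}
    dot = sum(w * weights_a.get(i, 0.0) for i, w in vec_b)    # shared terms only
    norm_a = math.sqrt(sum(w * w for _, w in vec_a))
    norm_b = math.sqrt(sum(w * w for _, w in vec_b))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)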
    doc_list.append(line.strip())    # collect the raw reference documents
# Detection process begins
min_sim = 64
min_docid = 0
with open(sys.argv[5], 'r') as ins:
    for lineidx, line in enumerate(ins.readlines()):
        if lineidx != 642:    # debug filter: only process line 642
            continue
        # Tokenize
        tokens = jt.tokens(line.strip().decode('utf8'))
        # Compute text feature
        feature = fb.compute(tokens)
        # Compute simhash
        fingerprint = smb.sim_hash(feature)
        # Rank every known fingerprint by Hamming distance to this line
        result_list = []
        for idx, fp in enumerate(fingerprint_list):
            sim = hamming_distance(fingerprint, fp, 64)
            result_list.append((sim, idx))
        result_list = sorted(result_list, key=lambda x: x[0])
        if result_list[0][0] < min_sim:
            min_sim, min_docid = result_list[0][0], lineidx
        # Dump the query line followed by all ranked matches
        with open(sys.argv[6], 'w') as outs:
            outs.write(line.strip() + os.linesep)
            for sim, idx in result_list:
                outs.write('%s\t%s%s' % (sim, doc_list[idx], os.linesep))
        #if lineidx == 2:
        #    break
print min_sim, min_docid
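# Illustrative sketch: SimhashBuilder.sim_hash() used above is assumed to follow
# the standard simhash scheme -- hash each token, add or subtract its weight per
# bit position, then keep the sign of each accumulated component as the
# fingerprint bit. The md5-based token hash is an assumption for illustration.
import hashlib

def sim_hash_sketch(features, bits=64):
    # features: iterable of (token, weight) pairs
    v = [0.0] * bits
    for token, weight in features:
        h = int(hashlib.md5(token.encode('utf8')).hexdigest(), 16)
        for i in range(bits):
            if (h >> i) & 1:
                v[i] += weight
            else:
                v[i] -= weight
    fingerprint = 0
    for i in range(bits):
        if v[i] > 0:
            fingerprint |= 1 << i
    return fingerprint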