Example #1
class FeatureContainer:
    def __init__(self, word_dict, keyword_dict=None):
        # Load word list
        self.word_list = []
        self.word_dict = {}
        # Order the words by their dict value in descending order and re-index from 0
        pairs = sorted([(value, key) for key, value in word_dict.items()], reverse=True)
        for idx, (value, key) in enumerate(pairs):
            self.word_list.append(key)
            self.word_dict[key.decode('utf8')] = idx

        self.fb = FeatureBuilder(self.word_dict, keyword_dict)
        self.smb = SimhashBuilder(self.word_list)
        self.mnb = MinhashBuilder()
        print 'FeatureContainer OK'

    def compute_feature(self, token_list):
        new_words = []
        for token in token_list:
            if token not in self.word_dict:
                new_words.append(token)
        if len(new_words) != 0:
            # Update word_list and word_dict
            self.fb.update_words(new_words)
            self.smb.update_words([word.encode('utf8') for word in new_words])
            self.word_dict = self.fb.word_dict
            self.word_list.extend([word.encode('utf8') for word in new_words])
        feature_vec = self.fb.compute(token_list)
        sim_hash, hash_vec = self.smb.sim_hash_nonzero(feature_vec)
        min_hash = self.mnb.min_hash(hash_vec)
        return feature_vec, sim_hash, min_hash
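
A minimal usage sketch for FeatureContainer, written in Python 2 like the examples. The word_dict contents and the token list below are made up for illustration only; FeatureBuilder, SimhashBuilder, and MinhashBuilder are assumed to come from the surrounding project and are not shown here.

    # Hypothetical word_dict: byte-string words mapped to integer scores (assumed format)
    word_dict = {'hello': 10, 'world': 7, 'simhash': 3}
    fc = FeatureContainer(word_dict)

    # compute_feature expects a list of unicode tokens; an unseen token ('dedup')
    # triggers an update of the internal word_list/word_dict before hashing
    tokens = [u'hello', u'world', u'dedup']
    feature_vec, sim_hash, min_hash = fc.compute_feature(tokens)
    print 'sparse feature:', feature_vec
    print 'simhash fingerprint:', sim_hash
    print 'minhash:', min_hash
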
Example #2
class FeatureContainer:
    def __init__(self, word_dict_path):
        # Load word list
        self.word_dict_path = word_dict_path
        self.word_list = []
        with open(word_dict_path, 'r') as ins:
            for line in ins:
                # The second whitespace-separated field on each line is the word
                self.word_list.append(line.split()[1])
        self.word_dict = {}
        for idx, ascword in enumerate(self.word_list):
            self.word_dict[ascword.decode('utf8')] = idx
        self.fb = FeatureBuilder(self.word_dict)
        self.smb = SimhashBuilder(self.word_list)
        print 'Loaded ', len(self.word_list), 'words'

    def compute_feature(self, token_list):
        new_words = []
        for token in token_list:
            if token not in self.word_dict:
                new_words.append(token)
        if len(new_words) != 0:
            # Update word_list and word_dict
            self.fb.update_words(new_words)
            self.smb.update_words([word.encode('utf8') for word in new_words])
            self.word_dict = self.fb.word_dict
            self.word_list.extend([word.encode('utf8') for word in new_words])
        feature_vec = self.fb.compute(token_list)
        return feature_vec, self.smb.sim_hash(feature_vec)
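
For reference, a minimal sketch of how a simhash fingerprint can be derived from the nonzero (idx, value) feature vector that FeatureBuilder.compute returns. This is the standard weighted bit-voting scheme, not necessarily what the project's SimhashBuilder does internally; hashing each word to 64 bits with md5 is an assumption made only for this sketch.

    import hashlib

    def simhash_sketch(nonzero_feature, word_list, f=64):
        # nonzero_feature: list of (idx, weight) pairs with weight > 0
        # word_list: idx -> word (byte string), as maintained above
        v = [0.0] * f
        for idx, weight in nonzero_feature:
            # Hash the word to an f-bit integer (md5 is just an assumption here)
            h = int(hashlib.md5(word_list[idx]).hexdigest(), 16) & ((1 << f) - 1)
            for bit in range(f):
                if (h >> bit) & 1:
                    v[bit] += weight
                else:
                    v[bit] -= weight
        # Bits with a positive vote become 1 in the fingerprint
        fingerprint = 0
        for bit in range(f):
            if v[bit] > 0:
                fingerprint |= (1 << bit)
        return fingerprint
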
Example #4
    # Tokenize: tokens() returns the list of tokens
    doc_token_1 = jt.tokens(doc_data_1)
    print 'Loading word dict...'
    # Load the word list and build the word dict
    word_list = []
    with open(word_dict, 'r') as ins:
        for line in ins.readlines():
            word_list.append(line.split()[1])
    word_dict = {}
    for idx, ascword in enumerate(word_list):
        word_dict[ascword.decode('utf8')] = idx
    # Build the nonzero feature vector
    fb = FeatureBuilder(word_dict)
    doc_feat_1 = fb.compute(
        doc_token_1
    )  # feature_nonzero: a sparse vector of (idx, value) pairs with value > 0
    # Give every word in the dict its own hash value
    smb = SimhashBuilder(word_list)
    doc_fl_1 = DocFeatLoader(smb, doc_feat_1)
    # Test file, used to investigate the algorithm
    out_file = open('/home/lin.xiong/lsh_data/out.file', 'w')
    #fp_set = set()
    fp_arr = []
    fp_post_id_dict = {}
    with open('/home/lin.xiong/lsh_data/lsh_clear.fingerprint', 'r') as fp:
        for line in fp:
            post_id, fp_value = line.split('\t')[0], long(line.split('\t')[1])
            fp_post_id_dict[fp_value] = post_id
            fp_arr.append(fp_value)
    comment = []
    with open('/home/lin.xiong/lsh_data/lsh.data', 'r') as comment_file:
Example #5
    doc_token_2 = jt.tokens(doc_data_2)

    print 'Loading word dict...'
    # Load word list from word_dict
    word_list = []
    with open(word_dict, 'r') as ins:
        for line in ins.readlines():
            word_list.append(line.split()[1])

    # Build unicode string word dict
    word_dict = {}
    for idx, ascword in enumerate(word_list):
        word_dict[ascword.decode('utf8')] = idx

    # Build nonzero feature vectors
    fb = FeatureBuilder(word_dict)
    doc_feat_1 = fb.compute(doc_token_1)
    doc_feat_2 = fb.compute(doc_token_2)

    # Init simhash_builder
    smb = SimhashBuilder(word_list)

    doc_fl_1 = DocFeatLoader(smb, doc_feat_1)
    doc_fl_2 = DocFeatLoader(smb, doc_feat_2)

    if mode == '-c':
        print 'Matching by VSM + cosine distance'
        dist = cosine_distance_nonzero(doc_fl_1.feat_vec, doc_fl_2.feat_vec, norm=False)
        if dist > float(threshold):
            print 'Matching Result:\t<True:%s>' % dist
        else:
            print 'Matching Result:\t<False:%s>' % dist
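
The -c branch above appears to treat cosine_distance_nonzero as a similarity score, since a larger value counts as a match (dist > float(threshold)). Below is a minimal sketch of such a cosine computation over the sparse (idx, value) vectors; the real function may handle the norm flag and edge cases differently.

    import math

    def cosine_similarity_nonzero(feat1, feat2):
        # feat1, feat2: lists of (idx, value) pairs with value > 0
        d1, d2 = dict(feat1), dict(feat2)
        dot = sum(v * d2[i] for i, v in d1.items() if i in d2)
        n1 = math.sqrt(sum(v * v for v in d1.values()))
        n2 = math.sqrt(sum(v * v for v in d2.values()))
        if n1 == 0.0 or n2 == 0.0:
            return 0.0
        return dot / (n1 * n2)
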
Example #6
 # For exp: load document content
 doc_list = []
 with open(sys.argv[4], 'r') as ins:
     for line in ins.readlines():
         doc_list.append(line.strip())
 # Detection process begins
 min_sim = 64
 min_docid = 0
 with open(sys.argv[5], 'r') as ins:
     for lineidx, line in enumerate(ins.readlines()):
         # Hard-coded: only process line 642 for this experiment
         if lineidx != 642:
             continue
         # Tokenize
         tokens = jt.tokens(line.strip().decode('utf8'))
         # Compute text feature
         feature = fb.compute(tokens)
         # Compute simhash
         fingerprint = smb.sim_hash(feature)
         result_list = []
         for idx, fp in enumerate(fingerprint_list):
             sim = hamming_distance(fingerprint, fp, 64)
             result_list.append((sim, idx))
         result_list = sorted(result_list, key=lambda x: x[0])
         if result_list[0][0] < min_sim:
             min_sim, min_docid = result_list[0][0], lineidx
         #'''
         with open(sys.argv[6], 'w') as outs:
             outs.write(line.strip()+os.linesep)
             for sim, idx in result_list:
                 outs.write('%s\t%s%s' %(sim, doc_list[idx], os.linesep)) 
         #'''
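
The loop above ranks candidate documents by hamming_distance between 64-bit simhash fingerprints. A minimal sketch of that distance, assuming both fingerprints are non-negative integers and the third argument is the fingerprint width in bits:

    def hamming_distance_sketch(fp1, fp2, f=64):
        # Number of bit positions in which the two f-bit fingerprints differ
        x = (fp1 ^ fp2) & ((1 << f) - 1)
        dist = 0
        while x:
            dist += 1
            x &= x - 1  # clear the lowest set bit
        return dist
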