Example #1
class FeatureContainer:
    def __init__(self, word_dict_path):
        # Load word list
        self.word_dict_path = word_dict_path
        self.word_list = []
        with open(word_dict_path, 'r') as ins:
            for line in ins.readlines():
                self.word_list.append(line.split()[1])
        self.word_dict = {}
        for idx, ascword in enumerate(self.word_list):
            self.word_dict[ascword.decode('utf8')] = idx
        self.fb = FeatureBuilder(self.word_dict)
        self.smb = SimhashBuilder(self.word_list)
        print 'Loaded ', len(self.word_list), 'words'

    def compute_feature(self, token_list):
        new_words = []
        for token in token_list:
            if token not in self.word_dict:
                new_words.append(token)
        if len(new_words) != 0:
            # Update word_list and word_dict
            self.fb.update_words(new_words)
            self.smb.update_words([word.encode('utf8') for word in new_words])
            self.word_dict = self.fb.word_dict
            self.word_list.extend([word.encode('utf8') for word in new_words])
        feature_vec = self.fb.compute(token_list)
        return feature_vec, self.smb.sim_hash(feature_vec)
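
To make this example easier to try, here is a short usage sketch that is not part of the original snippet. Note that the code on this page is Python 2 (print statements, str.decode, long). The sketch assumes the word-dict file has lines of the form "<id> <word>" (the loader reads column 1), that FeatureBuilder and SimhashBuilder are importable from the surrounding project, and that tokens come from a tokenizer such as JiebaTokenizer; the path and token list below are purely illustrative.

# Usage sketch -- path and tokens are hypothetical placeholders.
fc = FeatureContainer('word_dict.txt')
tokens = [u'simhash', u'near', u'duplicate', u'detection']
feature_vec, fingerprint = fc.compute_feature(tokens)
print 'Feature vector:', feature_vec
print 'Simhash fingerprint:', fingerprint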
Example #3
def __init__(self, word_dict_path):
    # Load word list
    self.word_dict_path = word_dict_path
    self.word_list = []
    with open(word_dict_path, 'r') as ins:
        for line in ins.readlines():
            self.word_list.append(line.split()[1])
    self.word_dict = {}
    for idx, ascword in enumerate(self.word_list):
        self.word_dict[ascword.decode('utf8')] = idx
    self.fb = FeatureBuilder(self.word_dict)
    self.smb = SimhashBuilder(self.word_list)
    print 'Loaded ', len(self.word_list), 'words'
Example #4
def preProcessingData(filename):
    loadData(filename)
    jt_time = time.time()
    global jt
    jt = JiebaTokenizer(stopwords_path, 'c')
    end_jt_time = time.time()
    print('JiebaTokenizer time: %s' % str(end_jt_time - jt_time))
    # Build the word list and word dict from all labeled data
    wordList, wordDict = buildWords(jt, labelContents)
    end_build_time = time.time()
    print('buildWords time: %s' % str(end_build_time - end_jt_time))
    # Generate feature vectors
    global fb
    fb = FeatureBuilder(wordDict)
    end_fb_build_time = time.time()
    print('FeatureBuilder time: %s' % str(end_fb_build_time - end_build_time))
    # Generate fingerprints
    global smb
    smb = SimhashBuilder(wordList)
    end_smb_build_time = time.time()
    print('SimhashBuilder time: %s' %
          str(end_smb_build_time - end_fb_build_time))
    # Generate feature vectors for all labeled data
    for flowId, processLabelDataMap in processFlowMap.items():
        processFlowMap[flowId] = generateDocFeatureVector(
            processLabelDataMap, jt, fb, smb)
    end_docFV_time = time.time()
    print('generateDocFeatureVector time: %s' %
          str(end_docFV_time - end_smb_build_time))
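
The helper buildWords is referenced above but not shown. As a rough, hypothetical sketch only (the real implementation may differ), it would collect every token from the labeled contents into a word list and a word-to-index dict; the method name jt.tokenize below is an assumption, since the tokenizer's interface is not shown in this example.

# Hypothetical sketch of buildWords -- not the original implementation.
def buildWords(jt, labelContents):
    wordList = []
    wordDict = {}
    for text in labelContents:
        for token in jt.tokenize(text):  # tokenizer method name is an assumption
            if token not in wordDict:
                wordDict[token] = len(wordList)
                wordList.append(token.encode('utf8'))
    return wordList, wordDict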
Example #6
print 'Loading word dict...'
# Load the word file and build the word dict
word_list = []
with open(word_dict, 'r') as ins:
    for line in ins.readlines():
        word_list.append(line.split()[1])
word_dict = {}
for idx, ascword in enumerate(word_list):
    word_dict[ascword.decode('utf8')] = idx
# Build the non-zero feature vector
fb = FeatureBuilder(word_dict)
doc_feat_1 = fb.compute(
    doc_token_1
)  # returns feature_nonzero: a non-zero-length vector of (idx, value) pairs with value > 0
# give every word in the dict a hash value
smb = SimhashBuilder(word_list)
doc_fl_1 = DocFeatLoader(smb, doc_feat_1)
# test file, used to investigate the algorithm
out_file = open('/home/lin.xiong/lsh_data/out.file', 'w')
#fp_set = set()
fp_arr = []
fp_post_id_dict = {}
with open('/home/lin.xiong/lsh_data/lsh_clear.fingerprint', 'r') as fp:
    for line in fp:
        fp_post_id_dict[long(line.split('\t')[1])] = line.split('\t')[0]
        fp_arr.append(long(line.split('\t')[1]))
comment = []
with open('/home/lin.xiong/lsh_data/lsh.data', 'r') as comment_file:
    for line in comment_file:
        comment.append(line.strip().split('$&&$')[1])
fp_comment_tup = zip(fp_arr, comment)
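
The snippet ends after pairing fingerprints with comment texts. A minimal sketch of how such 64-bit simhash fingerprints are typically compared follows; the hamming_distance helper and the 3-bit threshold are illustrative assumptions, not part of the original code.

# Hamming distance between two 64-bit simhash fingerprints (illustrative helper).
def hamming_distance(fp1, fp2):
    x = (fp1 ^ fp2) & ((1 << 64) - 1)
    dist = 0
    while x:
        dist += 1
        x &= x - 1  # clear the lowest set bit
    return dist

# Example: report stored comments whose fingerprint is within 3 bits of the query document.
query_fp = smb.sim_hash(doc_feat_1)
for fp, text in fp_comment_tup:
    if hamming_distance(query_fp, fp) <= 3:
        out_file.write(text + '\n')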
Example #7
    print "Usage:\tlaunch.py word_dict_path stop_words_path fingerprint_path documents_path test_path result_path"
    exit(-1)
# Load word list
word_list = []
with open(sys.argv[1], 'r') as ins:
    for line in ins.readlines():
        word_list.append(line.split()[1])
# Init tokenizer
jt = JiebaTokenizer(sys.argv[2], 'c')
# Init feature_builder
word_dict = {}
for idx, ascword in enumerate(word_list):
    word_dict[ascword.decode('utf8')] = idx
fb = FeatureBuilder(word_dict)
# Init simhash_builder
smb = SimhashBuilder(word_list)
# Load fingerprint list
fingerprint_list = []
with open(sys.argv[3], 'r') as ins:
    for line in ins.readlines():
        fingerprint_list.append(int(line))
# For exp: load document content
doc_list = []
with open(sys.argv[4], 'r') as ins:
    for line in ins.readlines():
        doc_list.append(line.strip())
# Detection process begins
min_sim = 64
min_docid = 0
with open(sys.argv[5], 'r') as ins:
    for lineidx, line in enumerate(ins.readlines()):
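
The detection loop is cut off here. Given min_sim = 64 (the maximum Hamming distance between 64-bit fingerprints), the body presumably searches for the stored fingerprint closest to the test line's fingerprint. The following self-contained sketch of that search is hypothetical and is not the original loop body.

# Hypothetical nearest-fingerprint search over 64-bit simhash values.
def nearest_fingerprint(query_fp, fingerprint_list):
    min_sim, min_docid = 64, 0
    for docid, fp in enumerate(fingerprint_list):
        dist = bin((query_fp ^ fp) & ((1 << 64) - 1)).count('1')  # Hamming distance
        if dist < min_sim:
            min_sim, min_docid = dist, docid
    return min_docid, min_sim

With the objects built above, each test line would then be tokenized, turned into a fingerprint via fb.compute and smb.sim_hash, and passed to a search like this to find the most similar stored document in doc_list.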