Ejemplo n.º 1
0
class FeatureContainer:
    """Bundle a vocabulary with the feature and simhash builders that share it.

    The vocabulary is kept in two parallel forms: ``word_list`` holds the
    words as UTF-8 byte strings in index order, and ``word_dict`` maps the
    decoded form of each word to its index in ``word_list``.

    NOTE(review): this targets Python 2 -- entries read from the file are
    byte strings that are ``.decode('utf8')``-ed into dictionary keys.
    """

    def __init__(self, word_dict_path):
        """Load the vocabulary file and create the builders.

        Args:
            word_dict_path: Path to a text file with one entry per line; the
                second whitespace-separated field of each line is the word.
                Blank lines are ignored.

        Raises:
            IndexError: If a non-blank line has fewer than two fields.
        """
        self.word_dict_path = word_dict_path
        self.word_list = []
        with open(word_dict_path, 'r') as ins:
            # Iterate the file directly instead of loading every line up
            # front with readlines().
            for line in ins:
                fields = line.split()
                if not fields:
                    # A blank (e.g. trailing) line carries no entry; do not
                    # let it abort the whole load with an IndexError.
                    continue
                self.word_list.append(fields[1])
        # Decoded word -> position in word_list.
        self.word_dict = {}
        for idx, ascword in enumerate(self.word_list):
            self.word_dict[ascword.decode('utf8')] = idx
        self.fb = FeatureBuilder(self.word_dict)
        self.smb = SimhashBuilder(self.word_list)
        # One parenthesised argument prints the same text as the former
        # multi-item print statement, and also parses on Python 3.
        print(' '.join(['Loaded ', str(len(self.word_list)), 'words']))

    def compute_feature(self, token_list):
        """Compute the feature vector and simhash for a token sequence.

        Tokens missing from ``word_dict`` are first registered with both
        builders and appended (UTF-8 encoded) to ``word_list``. Each unseen
        token is registered exactly once, in order of first appearance, even
        if it occurs several times in ``token_list``; registering repeats
        would append the same word to ``word_list`` more than once and break
        the one-index-per-word pairing with ``word_dict``.

        Args:
            token_list: Sequence of tokens; they are looked up in
                ``word_dict`` and must support ``.encode('utf8')``.

        Returns:
            A ``(feature_vec, sim_hash)`` tuple: the result of
            ``self.fb.compute(token_list)`` and of ``self.smb.sim_hash``
            applied to that vector.
        """
        new_words = []
        seen = set()
        for token in token_list:
            if token not in self.word_dict and token not in seen:
                seen.add(token)
                new_words.append(token)
        if new_words:
            # Encode once, before touching any state, and reuse the result
            # for both the simhash builder and word_list.
            encoded = [word.encode('utf8') for word in new_words]
            self.fb.update_words(new_words)
            self.smb.update_words(encoded)
            # presumably update_words may rebind fb.word_dict, so re-read it
            # rather than assuming the same object -- TODO confirm
            self.word_dict = self.fb.word_dict
            self.word_list.extend(encoded)
        feature_vec = self.fb.compute(token_list)
        return feature_vec, self.smb.sim_hash(feature_vec)
Ejemplo n.º 2
0
class FeatureContainer:
    """Bundle a vocabulary with feature, simhash and minhash builders.

    ``word_list`` holds the words (as given, UTF-8 byte strings) in index
    order and ``word_dict`` maps the decoded form of each word to its index
    in ``word_list``.

    NOTE(review): this targets Python 2 -- the keys of the incoming mapping
    are byte strings that are ``.decode('utf8')``-ed into dictionary keys.
    """

    def __init__(self, word_dict, keyword_dict=None):
        """Build the indexed vocabulary and create the builders.

        Args:
            word_dict: Mapping of word -> sortable value. Words are indexed
                in descending order of ``(value, word)``. Presumably the
                value is a frequency or weight -- TODO confirm with caller.
            keyword_dict: Passed through unchanged to ``FeatureBuilder``.
        """
        self.word_list = []
        self.word_dict = {}
        # Sort on (value, word) pairs so ties on value fall back to the word
        # itself; reverse=True puts the largest pair at index 0.
        ranked = sorted(
            ((value, key) for key, value in word_dict.items()),
            reverse=True,
        )
        for idx, (_, key) in enumerate(ranked):
            self.word_list.append(key)
            self.word_dict[key.decode('utf8')] = idx

        self.fb = FeatureBuilder(self.word_dict, keyword_dict)
        self.smb = SimhashBuilder(self.word_list)
        self.mnb = MinhashBuilder()
        # Parenthesised single argument: same output on Python 2, and the
        # line also parses on Python 3.
        print('FeatureContainer OK')

    def compute_feature(self, token_list):
        """Compute the feature vector, simhash and minhash for the tokens.

        Tokens missing from ``word_dict`` are first registered with the
        feature and simhash builders and appended (UTF-8 encoded) to
        ``word_list``. Each unseen token is registered exactly once, in order
        of first appearance, even if it occurs several times in
        ``token_list``; registering repeats would append the same word to
        ``word_list`` more than once and break the one-index-per-word pairing
        with ``word_dict``.

        Args:
            token_list: Sequence of tokens; they are looked up in
                ``word_dict`` and must support ``.encode('utf8')``.

        Returns:
            A ``(feature_vec, sim_hash, min_hash)`` tuple, where
            ``feature_vec`` comes from ``self.fb.compute``, ``sim_hash`` is
            the first item returned by ``self.smb.sim_hash_nonzero`` and
            ``min_hash`` is ``self.mnb.min_hash`` of its second item.
        """
        new_words = []
        seen = set()
        for token in token_list:
            if token not in self.word_dict and token not in seen:
                seen.add(token)
                new_words.append(token)
        if new_words:
            # Encode once, before touching any state, and reuse the result
            # for both the simhash builder and word_list.
            encoded = [word.encode('utf8') for word in new_words]
            self.fb.update_words(new_words)
            self.smb.update_words(encoded)
            # presumably update_words may rebind fb.word_dict, so re-read it
            # rather than assuming the same object -- TODO confirm
            self.word_dict = self.fb.word_dict
            self.word_list.extend(encoded)
        feature_vec = self.fb.compute(token_list)
        sim_hash, hash_vec = self.smb.sim_hash_nonzero(feature_vec)
        min_hash = self.mnb.min_hash(hash_vec)
        return feature_vec, sim_hash, min_hash
Ejemplo n.º 3
0
class FeatureContainer:
    """Hold a word vocabulary plus the builders that derive features from it.

    Two views of the vocabulary are maintained side by side: ``word_list``
    (UTF-8 byte strings, ordered by index) and ``word_dict`` (decoded word ->
    index into ``word_list``).

    NOTE(review): this targets Python 2 -- words read from the file are byte
    strings and are ``.decode("utf8")``-ed to form the dictionary keys.
    """

    def __init__(self, word_dict_path):
        """Read the vocabulary file and set up the builders.

        Args:
            word_dict_path: Path to a text file holding one entry per line.
                The word is the second whitespace-separated field of the
                line. Blank lines are skipped.

        Raises:
            IndexError: If a non-blank line has fewer than two fields.
        """
        self.word_dict_path = word_dict_path
        self.word_list = []
        with open(word_dict_path, "r") as ins:
            # Stream the file line by line; readlines() would hold the whole
            # file in memory for no benefit.
            for line in ins:
                fields = line.split()
                if not fields:
                    # Blank lines (commonly a trailing newline) have no word
                    # and must not crash the load with an IndexError.
                    continue
                self.word_list.append(fields[1])
        # Decoded word -> position in word_list.
        self.word_dict = {}
        for idx, ascword in enumerate(self.word_list):
            self.word_dict[ascword.decode("utf8")] = idx
        self.fb = FeatureBuilder(self.word_dict)
        self.smb = SimhashBuilder(self.word_list)
        # A single parenthesised argument yields the same output as the old
        # multi-item print statement while also parsing on Python 3.
        print(" ".join(["Loaded ", str(len(self.word_list)), "words"]))

    def compute_feature(self, token_list):
        """Return the feature vector and simhash of a token sequence.

        Any token not yet in ``word_dict`` is registered with both builders
        and appended (UTF-8 encoded) to ``word_list`` before the features are
        computed. An unseen token is registered only once, at its first
        occurrence, no matter how often it repeats in ``token_list``;
        otherwise ``word_list`` would receive the same word several times and
        stop lining up one-to-one with ``word_dict``.

        Args:
            token_list: Sequence of tokens; they are looked up in
                ``word_dict`` and must support ``.encode("utf8")``.

        Returns:
            A ``(feature_vec, sim_hash)`` tuple: the result of
            ``self.fb.compute(token_list)`` and of ``self.smb.sim_hash``
            applied to that vector.
        """
        new_words = []
        seen = set()
        for token in token_list:
            if token not in self.word_dict and token not in seen:
                seen.add(token)
                new_words.append(token)
        if new_words:
            # Encode a single time, ahead of any state change, and share the
            # result between the simhash builder and word_list.
            encoded = [word.encode("utf8") for word in new_words]
            self.fb.update_words(new_words)
            self.smb.update_words(encoded)
            # presumably update_words may rebind fb.word_dict, so re-read it
            # rather than assuming the same object -- TODO confirm
            self.word_dict = self.fb.word_dict
            self.word_list.extend(encoded)
        feature_vec = self.fb.compute(token_list)
        return feature_vec, self.smb.sim_hash(feature_vec)