import re
from collections import Counter
from heapq import heappushpop
from math import log2

from sklearn.preprocessing import LabelEncoder

# Module-level helpers the class relies on. The concrete definitions are
# assumptions -- the original module defines `r` and `stopwords` elsewhere.
r = re.compile(r'\W+')  # tokenizer: split on runs of non-word characters
stopwords = set()  # e.g. populate from nltk.corpus.stopwords.words('english')


class Features:
    def __init__(self, x, y, min_word_length=0, min_word_freq=0,
                 num_of_features_of_each_class=100, sw=True):
        self.weights = {'tf', 'tfidf'}
        self.num_of_docs = len(x)
        self.classes = set(y)
        word_count = self.read_in_all_words(x, min_word_length, min_word_freq, sw)
        self.class_word_count = self.get_class_word_count(x, y, word_count)
        self.features = self.select_features(word_count, self.class_word_count,
                                             num_of_features_of_each_class)
        self.x_le = LabelEncoder().fit(list(self.features))  # fit on the words, not the dict
        self.y_le = LabelEncoder().fit(y)

    def read_in_all_words(self, x, min_word_len=0, min_to_count=0, sw=True):
        """Collect the vocabulary that is eligible to become features.

        :param x: a list of file paths to read
        :param min_word_len: ignore words shorter than this
        :param min_to_count: ignore words with a total frequency below this
        :param sw: if True, filter out stopwords
        :return: a dict mapping word -> [total frequency, document frequency]
        """
        stopwords.add('')  # splitting can yield empty strings; treat them as stopwords
        vocabularies = {}  # word -> [total frequency, document frequency]
        voc = {}  # the words that pass the frequency threshold
        for name in x:
            with open(name, 'r', errors='ignore') as file:  # some files have decoding errors
                word_mark = set()  # words already counted toward this document's frequency
                for line in file:
                    for word in r.split(line):  # strip the punctuation
                        word = word.lower()
                        if (sw and word in stopwords) or len(word) < min_word_len:
                            continue  # skip stopwords and words that are too short
                        if word not in vocabularies:
                            vocabularies[word] = [1, 1]
                        else:
                            vocabularies[word][0] += 1
                            if word not in word_mark:
                                vocabularies[word][1] += 1
                        word_mark.add(word)  # count each document at most once
                        if vocabularies[word][0] >= min_to_count:  # drop low-frequency words
                            voc[word] = vocabularies[word]
        return voc

    @staticmethod
    def cal_chi_square(n11, n10, n01, n00):
        denominator = (n11 + n01) * (n11 + n10) * (n10 + n00) * (n01 + n00)
        if denominator == 0:  # degenerate contingency table
            return 0
        return (n11 + n10 + n01 + n00) * (n11 * n00 - n10 * n01) ** 2 / denominator

    def get_class_word_count(self, x, y, voc=None):
        """Count the word frequency of every class.

        :param x: files to be counted
        :param y: the class labels corresponding to x
        :param voc: if given, only count words in this vocabulary
        :return: a dict mapping class -> Counter of word frequencies
        """
        class_word_count = {key: Counter() for key in set(y)}
        for ii in range(len(x)):
            with open(x[ii], 'r', errors='ignore') as file:
                for line in file:
                    for word in r.split(line):
                        word = word.lower()
                        if not voc or word in voc:
                            class_word_count[y[ii]][word] += 1
        return class_word_count

    def select_features(self, word_count, class_word_count, num):
        """Select the top `num` features per class by chi-square score.

        :param word_count: word -> [total frequency, document frequency]
        :param class_word_count: class -> Counter of word frequencies
        :param num: number of features to keep for each class
        :return: a dict mapping the selected words to their document frequency
        """
        features = {}
        total_word_count = {}
        for cls in self.classes:
            features[cls] = [(0, '')] * num  # min-heap of (score, word) pairs
            total_word_count[cls] = sum(class_word_count[cls].values())
        for word in word_count:
            for cls in self.classes:
                n11 = class_word_count[cls][word]  # this word, in this class
                n10 = word_count[word][0] - n11    # this word, in other classes
                n01 = total_word_count[cls] - n11  # other words, in this class
                n00 = 0                            # other words, in other classes
                for c in self.classes:
                    if c != cls:
                        n00 += total_word_count[c] - class_word_count[c][word]
                chi_s = Features.cal_chi_square(n11, n10, n01, n00)
                if chi_s > features[cls][0][0]:  # beats the current minimum
                    heappushpop(features[cls], (chi_s, word))
        feature_set = {}
        for cls in self.classes:
            for c_s, _word in features[cls]:
                if _word:  # skip the (0, '') placeholders
                    feature_set[_word] = word_count[_word][1]  # keep document frequency for idf
        return feature_set

    def read_file(self, name):
        word_set = set()
        with open(name, 'r', errors='ignore') as file:
            for line in file:
                for word in r.split(line):
                    word = word.lower()
                    if word in self.features:
                        word_set.add(word)
        return word_set

    def get_x_vector(self, name, weight='tf'):
        if weight not in self.weights:
            raise ValueError("weight must be one of %s" % self.weights)
        with open(name, 'r', errors='ignore') as file:
            word_array = self.cal_tf(file)
        if weight == 'tfidf':
            word_array = self.cal_tfidf(word_array)
        return word_array

    def cal_tf(self, file):
        word_array = [0] * len(self.features)
        for line in file:
            for word in r.split(line):
                word = word.lower()
                if word in self.features:
                    word_array[self.x_transform(word)] += 1
        return word_array

    def cal_tfidf(self, word_array):
        # tf-idf with sublinear tf: (1 + log2(tf)) * log2(N / df)
        for i in range(len(word_array)):
            if word_array[i]:
                df = self.features[self.x_le.inverse_transform([i])[0]]
                word_array[i] = (log2(word_array[i]) + 1) * log2(self.num_of_docs / df)
        return word_array

    def x_transform(self, thing):
        # LabelEncoder expects array-like input; wrap the single label
        return self.x_le.transform([thing])[0]

    def y_transform(self, thing):
        return self.y_le.transform([thing])[0]
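
# A minimal usage sketch (the file names and labels below are illustrative,
# not part of the original module): build the feature set from labelled
# documents, then produce a tf-idf vector for a new file.
if __name__ == '__main__':
    train_files = ['doc1.txt', 'doc2.txt', 'doc3.txt']  # hypothetical corpus
    labels = ['spam', 'ham', 'spam']                    # class label per file
    feats = Features(train_files, labels,
                     min_word_length=3, min_word_freq=2,
                     num_of_features_of_each_class=50)
    vec = feats.get_x_vector('test.txt', weight='tfidf')  # hypothetical input file
    print(len(vec), 'features;', 'encoded label:', feats.y_transform('spam'))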