from collections import Counter

from sklearn.base import BaseEstimator, TransformerMixin

# The helpers below are assumed to come from the sematch package; adjust the
# import paths if your tokenizer/lemmatizer and similarity classes live elsewhere.
from sematch.semantic.similarity import WordNetSimilarity, WordVecSimilarity
from sematch.nlp import word_tokenize, lemmatization


class TextPreprocessor(BaseEstimator, TransformerMixin):

    """Transform input text into a feature representation."""

    def __init__(self, corpus, feature_num=10, model='onehot', wn_method='path',
                 vec_file='models/GoogleNews-vectors-negative300.bin', binary=True):
        """
        :param corpus: iterable of (sentence, category) pairs used to select feature words
        :param feature_num: number of most frequent words kept per category
        :param model: 'onehot', 'wordnet', 'word2vec', or 'both'
        :param wn_method: WordNet similarity metric name, e.g. 'path'
        :param vec_file: path to pretrained word vectors in word2vec format
        :param binary: whether vec_file is a binary word2vec file
        """
        self._model = model
        self._wn_method = wn_method
        self._features = self.extract_features(corpus, feature_num)
        # Only load the similarity models the chosen representation actually needs.
        self._wns = WordNetSimilarity() if model in ('wordnet', 'both') else None
        self._wvs = WordVecSimilarity(vec_file, binary) if model in ('word2vec', 'both') else None

    def fit(self, X, y=None):
        # Feature words are fixed at construction time; nothing to fit.
        return self

    def inverse_transform(self, X):
        return X

    def extract_features(self, corpus, feature_num=10):
        """Collect the feature_num most frequent lemmas of each category."""
        cat_word = {}
        for sent, cat in corpus:
            cat_word.setdefault(cat, []).extend(lemmatization(word_tokenize(sent)))
        features = {cat: Counter(words) for cat, words in cat_word.items()}
        feature_words = set()
        for cat, counter in features.items():
            words, _counts = zip(*counter.most_common(feature_num))
            feature_words.update(words)
        return feature_words

    def similarity(self, tokens, feature, method='wordnet'):
        """Return the highest similarity between a feature word and any token."""
        if method == 'wordnet':
            sim = lambda x: self._wns.word_similarity(feature, x, self._wn_method)
        else:
            sim = lambda x: self._wvs.word_similarity(feature, x)
        # default=0.0 guards against an empty token list.
        return max(map(sim, tokens), default=0.0)

    def unigram_features(self, tokens):
        words = set(tokens)
        return {'contains({})'.format(f): f in words for f in self._features}

    def wordnet_features(self, tokens):
        words = set(tokens)
        return {'wns({})'.format(f): self.similarity(words, f) for f in self._features}

    def word2vec_features(self, tokens):
        words = set(tokens)
        return {'w2v({})'.format(f): self.similarity(words, f, method='word2vec')
                for f in self._features}

    def semantic_features(self, tokens):
        words = set(tokens)
        features = {}
        for f in self._features:
            features['wns({})'.format(f)] = self.similarity(words, f)
            features['w2v({})'.format(f)] = self.similarity(words, f, method='word2vec')
        return features

    def transform(self, X):
        tokenize = lambda x: lemmatization(word_tokenize(x))
        X_tokens = [tokenize(x) for x in X]
        if self._model == 'onehot':
            return [self.unigram_features(t) for t in X_tokens]
        elif self._model == 'wordnet':
            return [self.wordnet_features(t) for t in X_tokens]
        elif self._model == 'word2vec':
            return [self.word2vec_features(t) for t in X_tokens]
        elif self._model == 'both':
            return [self.semantic_features(t) for t in X_tokens]
        raise ValueError('unknown model: {!r}'.format(self._model))
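
# --- Usage sketch (illustrative, not part of the class) ----------------------
# TextPreprocessor.transform returns a list of {feature_name: value} dicts, so
# it pairs naturally with sklearn's DictVectorizer inside a Pipeline. The toy
# (sentence, category) corpus and the LinearSVC classifier below are
# assumptions made for this demo; any labelled corpus and sklearn classifier
# would work the same way.

if __name__ == '__main__':
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.pipeline import Pipeline
    from sklearn.svm import LinearSVC

    # Hypothetical toy corpus of (sentence, category) pairs.
    corpus = [
        ('the striker scored a goal in the match', 'sport'),
        ('the team won the championship game', 'sport'),
        ('the bank raised its interest rate', 'finance'),
        ('stock markets fell after the earnings report', 'finance'),
    ]
    sentences, labels = zip(*corpus)

    pipeline = Pipeline([
        ('features', TextPreprocessor(corpus, feature_num=5, model='onehot')),
        ('vectorizer', DictVectorizer()),  # feature dicts -> sparse matrix
        ('classifier', LinearSVC()),
    ])
    pipeline.fit(list(sentences), list(labels))
    print(pipeline.predict(['the goalkeeper saved the penalty']))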