def train(cls, corpus, sim_metric, feature_num=5, sim_model='weighted'):
    '''
    Extract categories, features, and feature weights from the corpus.
    The weight of each feature token in a category is computed as
    token_count / total_feature_count.
    '''
    print "Training..."
    cat_word = {}
    for sent, cat in corpus:
        cat_word.setdefault(cat, []).extend(word_process(word_tokenize(sent)))
    features = {cat: Counter(cat_word[cat]) for cat in cat_word}
    labels = features.keys()
    cat_features = {}
    feature_weights = {}
    for c, f in features.iteritems():
        w_c_pairs = f.most_common(feature_num)
        words, counts = zip(*w_c_pairs)
        cat_features[c] = words
        total_count = float(sum(counts))
        word_weights = []
        for w, count in w_c_pairs:
            word_weights.append((w, count / total_count))
        feature_weights[c] = word_weights
    return cls(labels, cat_features, feature_weights, sim_metric, sim_model)
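# A self-contained sketch of the weighting scheme in train(): each of a
# category's top-N tokens is weighted by its share of the top-N counts
# (token_count / total_feature_count). Toy numbers, not real corpus data.
from collections import Counter

toy_counts = Counter({'ball': 6, 'goal': 3, 'team': 1})
w_c_pairs = toy_counts.most_common(2)                      # feature_num = 2
total = float(sum(count for _, count in w_c_pairs))
weights = [(w, count / total) for w, count in w_c_pairs]
# weights == [('ball', 0.666...), ('goal', 0.333...)]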
def transform(self, X):
    tokenize = lambda x: word_process(word_tokenize(x))
    X_tokens = map(tokenize, X)
    if self._model == 'onehot':
        return map(self.unigram_features, X_tokens)
    else:
        return map(self.sim_features, X_tokens)
def extract_features(self, corpus, feature_num=10):
    cat_word = {}
    for sent, cat in corpus:
        cat_word.setdefault(cat, []).extend(word_process(word_tokenize(sent)))
    features = {cat: Counter(cat_word[cat]) for cat in cat_word}
    feature_words = []
    for c, f in features.iteritems():
        words, counts = zip(*f.most_common(feature_num))
        feature_words.extend(list(words))
    return set(feature_words)
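# Hypothetical usage of extract_features(); the corpus format matches train()
# above, an iterable of (sentence, category) pairs. 'extractor' stands in for
# an instance of the class that owns this method.
toy_corpus = [('The striker scored a late goal', 'sports'),
              ('Parliament passed the budget bill', 'politics')]
vocab = extractor.extract_features(toy_corpus, feature_num=10)
# vocab is a set: the union of each category's top-10 tokens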
def transform(self, X):
    tokenize = lambda x: lemmatization(word_tokenize(x))
    X_tokens = map(tokenize, X)
    if self._model == 'onehot':
        return map(self.unigram_features, X_tokens)
    elif self._model == 'wordnet':
        return map(self.wordnet_features, X_tokens)
    elif self._model == 'word2vec':
        return map(self.word2vec_features, X_tokens)
    elif self._model == 'both':
        return map(self.semantic_features, X_tokens)
def train(cls, X, y, classifier=LinearSVC, model='bow'):
    """
    :param X: iterable of raw training sentences
    :param y: iterable of category labels
    :param classifier: scikit-learn classifier class or instance
    :param model: 'bow' or 'tfidf'
    :return: a trained instance of cls
    """
    tokenize = lambda x: lemmatization(word_tokenize(x))
    labels = LabelEncoder()
    y_train = labels.fit_transform(y)
    vectorizer = CountVectorizer(tokenizer=tokenize) \
        if model == 'bow' else TfidfVectorizer(tokenizer=tokenize)
    X_train = vectorizer.fit_transform(X)
    if isinstance(classifier, type):
        classifier = classifier()
    # classifiers such as LinearSVC have no fit_transform; use fit
    classifier.fit(X_train, y_train)
    return cls(labels, vectorizer, classifier)
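# A hedged usage sketch for the scikit-learn training path above. The class
# name TextClassifier and the attribute names are assumptions; prediction is
# shown through the stored objects because no predict method appears here.
X = ['the team won the match', 'the senate passed the bill']
y = ['sports', 'politics']
clf = TextClassifier.train(X, y, classifier=LinearSVC, model='tfidf')
X_new = clf._vectorizer.transform(['a late goal won the game'])
y_pred = clf._labels.inverse_transform(clf._classifier.predict(X_new))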
def classify_single(self, sent, feature_model='max'):
    """
    Compare the input's feature words to each category by semantic
    similarity and combine the per-word scores into a category score.
    The category with the highest score is returned.
    :param sent: the input sentence
    :param feature_model: score combination model, 'max' or 'sum'. Default is 'max'
    :return: the predicted category label
    """
    feature_words = list(set(word_process(word_tokenize(sent))))
    score = {}
    for c in self._categories:
        if feature_model == 'max':
            score[c] = max([self.category_similarity(w, c) for w in feature_words] + [0.0])
        else:
            score[c] = sum([self.category_similarity(w, c) for w in feature_words] + [0.0])
    return Counter(score).most_common(1)[0][0]
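# Hypothetical calls on an instance built by the similarity-based train()
# above. 'max' scores a category by its single best-matching feature word,
# while 'sum' rewards sentences with many moderately similar words.
label_max = clf.classify_single('The goalkeeper saved a penalty')
label_sum = clf.classify_single('The goalkeeper saved a penalty', feature_model='sum')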
def disambiguate_graph(self, sentence):
    words_origin = word_tokenize(sentence)
    # extract words that have a synset in WordNet; currently supports NOUN
    words = [w for w in words_origin if self._wn_sim.word2synset(w)]
    # map words to their candidate synsets
    words_synsets = {w: self._wn_sim.word2synset(w) for w in words}
    # flatten the candidate synsets into a single list
    synsets = list(itertools.chain.from_iterable([words_synsets[w] for w in words]))
    # remove duplicate synsets
    synsets = list(set(synsets))
    # define the semantic similarity metric
    sim_metric = lambda x, y: self._wn_sim.similarity(x, y, self._sim_name)
    # construct the similarity graph
    sim_graph = SimGraph(synsets, sim_metric)
    # get PageRank scores of the synsets
    rank_scores = sim_graph.page_rank()
    results = []
    for w in words_origin:
        if w in words:
            # pick the candidate synset with the highest PageRank score
            candidate_scores = {s: rank_scores[s] for s in words_synsets[w]}
            results.append((w, Counter(candidate_scores).most_common(1)[0][0]))
        else:
            results.append((w, None))
    return results
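# Sketch of the graph-based WSD output, assuming a disambiguator instance
# ('wsd') built elsewhere in this module. Each token maps to its top-ranked
# synset, or None when WordNet has no (noun) synset for it. The synsets
# shown are illustrative.
results = wsd.disambiguate_graph('He deposited cash at the bank')
# e.g. [('He', None), ('deposited', None), ('cash', Synset('cash.n.01')),
#       ('at', None), ('the', None),
#       ('bank', Synset('depository_financial_institution.n.01'))]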
def extract_words(self, text):
    return lemmatization(word_tokenize(text))
def gloss_overlap(self, c1, c2):
    # compare the stemmed lemmas of the two synsets' dictionary glosses
    gloss1 = lemmatization(word_tokenize(c1.definition()))
    gloss2 = lemmatization(word_tokenize(c2.definition()))
    gloss1 = set(map(porter.stem, gloss1))
    gloss2 = set(map(porter.stem, gloss2))
    return len(gloss1.intersection(gloss2))
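# Illustrative call to gloss_overlap() on two WordNet synsets; 'sim' stands
# in for an instance of the class that owns this method, and the helpers
# porter, lemmatization and word_tokenize are assumed from this module.
from nltk.corpus import wordnet as wn

overlap = sim.gloss_overlap(wn.synset('car.n.01'), wn.synset('truck.n.01'))
# counts how many stemmed gloss lemmas the two definitions share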
# head of this Similarity call inferred from the LSA branch below
tfidf_index = similarities.Similarity(save_dir + 'tfidf_index/shard', tfidf_corpus,
                                      num_features=tfidf_corpus.num_terms)
tfidf_index.num_best = top_N
tfidf_index.save(save_dir + 'tfidf_index/tfidf.index')
if model == 'lsa':
    lsa = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=topic_n)
    lsa.save(save_dir + 'lsa.model')
    lsa_index = similarities.Similarity(save_dir + 'lsa_index/shard', lsa[tfidf_corpus],
                                        num_features=topic_n)
    lsa_index.num_best = top_N
    lsa_index.save(save_dir + 'lsa_index/lsa.index')
    return cls(text_process, model, dictionary, tfidf, tfidf_index, lsa, lsa_index)
return cls(text_process, model, dictionary, tfidf, tfidf_index, None, None)


try:
    data = read_data(DATA_FILE)
except:
    # fall back to regenerating the data file
    prepare_entities()
    data = read_data(DATA_FILE)

TextAnalysis.train([d['abstract'] for d in data],
                   lambda x: word_process(word_tokenize(x)))
text_tfidf = TextAnalysis.load(lambda x: word_process(word_tokenize(x)))
text_lsa = TextAnalysis.load(lambda x: word_process(word_tokenize(x)), model='lsa')
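# Hedged sketch of querying the trained gensim indices. TextAnalysis's own
# query method is not shown, so this goes through the stored objects
# directly; the attribute names (_dictionary, _tfidf, _tfidf_index) are
# assumptions about what cls(...) stores.
tokens = word_process(word_tokenize('semantic similarity of abstracts'))
query_bow = text_tfidf._dictionary.doc2bow(tokens)
hits = text_tfidf._tfidf_index[text_tfidf._tfidf[query_bow]]
# hits is a list of (document_id, cosine_similarity) pairs, best top_N first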
def tokenize(x):
    return word_process(word_tokenize(x))

X_tokens = map(tokenize, X)
def context2words(self, sent):
    words = word_tokenize(sent.lower())
    # keep only words longer than two characters
    words = [w for w in words if len(w) > 2]
    return lemmatization(words)