def train_word2vec(categories, comments, n_dim):
    """Train a word2vec model plus an SGD classifier on top of its vectors.

    Every comment is tokenized with ``tokenize_document`` (default options),
    the token lists and ``n_dim`` are handed to ``word2vec_model``, and the
    same token lists are converted to feature vectors by ``w2vectorize``.
    An ``SGDClassifier(loss='log', penalty='l1')`` is then fitted on those
    vectors against ``categories``.

    Returns:
        The tuple ``(model, classifier)``.
    """
    from feature_extraction import tokenize_document
    from feature_extraction import word2vec_model
    from sklearn.linear_model import SGDClassifier

    tokenized = []
    for comment in comments:
        tokenized.append(tokenize_document(comment))

    embedding = word2vec_model(tokenized, n_dim)
    features = w2vectorize(tokenized, embedding, n_dim)

    sgd = SGDClassifier(loss='log', penalty='l1')
    sgd.fit(features, categories)
    return embedding, sgd
# NOTE(review): this fragment starts mid-block -- ``model``, ``comments``,
# ``config_parser`` and the other free names are bound outside the visible
# span, and the original nesting level of these statements is not visible
# here; confirm the indentation against the full script.

# Pretty-print whatever ``model["sound"].samples()` returns.
print "\nSamples: "
import pprint
printer = pprint.PrettyPrinter(indent=4)
printer.pprint(model["sound"].samples())
print "\n"

# Rebind ``model`` to a word2vec model built from ``comments`` and print the
# similarity it reports for one fixed word pair.
model = word2vec_model(comments)
print model.similarity('retarded', 'loser')

# Optional stage, enabled by the 'WordVec' flag of the execution section:
# train on the training data, vectorize the test comments, predict, report.
if config_parser.getboolean(EXECUTION_SECTION, 'WordVec'):
    from train import train_word2vec
    from train import w2vectorize
    from feature_extraction import tokenize_document
    # 500 is passed as ``n_dim`` here and again to ``w2vectorize`` below; the
    # two values must stay in sync.
    model, classifier = train_word2vec(categories, comments, 500)
    # NOTE(review): test comments are tokenized with stopwords='english';
    # verify that training uses the same tokenizer options.
    test_documents = [
        tokenize_document(document, stopwords='english')
        for document in test_comments
    ]
    test_vecs = w2vectorize(test_documents, model, 500)
    predictions = classifier.predict(test_vecs)
    print "\nWord2Vec Model Result\n"
    prediction_info(predictions, test_categories)

# Optional stage, enabled by the 'Final' flag: the classifier returned by
# ``train_assembling`` predicts directly on the raw test comments.
if config_parser.getboolean(EXECUTION_SECTION, 'Final'):
    from train import train_assembling
    classifier = train_assembling(categories, comments, badwords)
    predictions = classifier.predict(test_comments)
    print "\nFeature Ensemble Model Result\n"
    prediction_info(predictions, test_categories)
def tokenize_collection(collection, lowercase=True, stopwords=True, min_length=3):
    """Tokenize every document of ``collection`` into one flat token list.

    ``lowercase``, ``stopwords`` and ``min_length`` are forwarded unchanged
    to ``tokenize_document`` for each document. The tokens of all documents
    are concatenated in document order.

    Returns:
        A list with the tokens of all documents; empty for an empty
        collection.
    """
    options = dict(lowercase=lowercase, stopwords=stopwords,
                   min_length=min_length)
    words = []
    for document in collection:
        words.extend(tokenize_document(document, **options))
    return words
def transform(self, documents):
    """Turn raw comments into a matrix of hand-crafted numeric features.

    One row is produced per document. Columns, in order:

    ``n_words``, ``n_chars``, ``n_dwords`` (tokens accepted by the en_US
    enchant dictionary), ``n_you_re`` (number of ``self.__matcher`` hits),
    ``exclamation`` (count of "!"), ``all_caps`` (upper-case words),
    ``addressing`` (count of "@"), ``bad_ratio``, ``n_bad`` (occurrences of
    the literal 'fakeinsult'), ``allcaps_ratio``, ``dic_ratio`` and
    ``sent_pos`` (mean of the first lexicon score over the scored tokens).

    Args:
        documents: re-iterable sequence of comment strings (it is traversed
            more than once).

    Returns:
        A NumPy array with one row per document and 12 columns.

    Notes:
        ``np`` and ``nlp`` are module-level names. ``nlp`` is presumably a
        spaCy pipeline -- TODO confirm. The part-of-speech dispatch below
        relies on spaCy's fine-grained ``tag_`` attribute.
    """
    import enchant
    import sentlex
    from feature_extraction import tokenize_document

    dictionary = enchant.Dict("en_US")
    swn = sentlex.SWN3Lexicon()

    # Penn Treebank tag prefix -> lexicon lookup for that word class.
    sentiment_lookups = (
        ('RB', swn.getadverb),
        ('NN', swn.getnoun),
        ('JJ', swn.getadjective),
        ('VB', swn.getverb),
    )

    n_dwords = [
        sum(1 for word in tokenize_document(document)
            if dictionary.check(word))
        for document in documents
    ]

    n_words = []
    n_chars = []
    all_caps = []  # number of uppercase words
    n_bad = []
    exclamation = []
    addressing = []
    sent_pos = []
    # NOTE(review): the negative score is accumulated like the positive one
    # but is not part of the returned matrix; adding it would change the
    # output shape, so it is only kept here for a future decision.
    sent_neg = []
    n_you_re = []

    for comment in documents:
        words = comment.split()
        n_words.append(len(words))
        n_chars.append(len(comment))
        all_caps.append(sum(1 for word in words if word.isupper()))
        n_bad.append(comment.count('fakeinsult'))
        exclamation.append(comment.count("!"))
        addressing.append(comment.count("@"))

        doc = nlp(comment)
        count = 0.
        pos_sum = 0.
        neg_sum = 0.
        for token in doc:
            if token.text == 'fakeinsult':
                # The placeholder is scored as fully negative without a
                # lexicon lookup.
                neg_sum += 1.
                count += 1.
                continue
            # ``tag_`` carries the fine-grained (Penn Treebank) tag. The
            # coarse ``pos_`` values (ADV, NOUN, ADJ, VERB, ...) never start
            # with RB/NN/JJ/VB, so matching on ``pos_`` scored nothing.
            tag = token.tag_
            for prefix, lookup in sentiment_lookups:
                if tag.startswith(prefix):
                    sentiment = lookup(token.text)
                    pos_sum += sentiment[0]
                    neg_sum += sentiment[1]
                    count += 1.
                    break
        if count != 0:
            pos_sum /= count
            neg_sum /= count
        sent_neg.append(neg_sum)
        sent_pos.append(pos_sum)

        n_you_re.append(len(self.__matcher(doc)))

    # ``float`` instead of the removed ``np.float`` alias. Clamping to 1
    # keeps the ratios finite for comments without any words (their
    # numerators are 0, so the ratio becomes 0 instead of NaN).
    denominators = np.maximum(np.array(n_words, dtype=float), 1.)
    allcaps_ratio = np.array(all_caps) / denominators
    bad_ratio = np.array(n_bad) / denominators
    dic_ratio = np.array(n_dwords) / denominators

    return np.array([n_words, n_chars, n_dwords, n_you_re, exclamation,
                     all_caps, addressing, bad_ratio, n_bad, allcaps_ratio,
                     dic_ratio, sent_pos]).T
def transform(self, documents):
    """Turn raw comments into a matrix of hand-crafted numeric features.

    One row is produced per document. Columns, in order:

    ``n_words``, ``n_chars``, ``n_dwords`` (tokens accepted by the en_US
    enchant dictionary), ``n_you_re`` (matches of ``self.__you_re``),
    ``n_you`` (matches of ``self.__you``), ``exclamation`` (count of "!"),
    ``allcaps`` (upper-case words), ``addressing`` (count of "@"),
    ``bad_ratio``, ``n_bad`` (occurrences of any ``self.__badwords`` entry
    in the lower-cased comment), ``allcaps_ratio``, ``dic_ratio`` and
    ``sent_pos`` (mean of the first lexicon score over the scored words).

    Args:
        documents: re-iterable sequence of comment strings (it is traversed
            more than once).

    Returns:
        A NumPy array with one row per document and 13 columns.
    """
    import enchant
    import re
    import sentlex
    from pattern.en import tag as tagger
    from feature_extraction import tokenize_document

    dictionary = enchant.Dict("en_US")
    lexicon = sentlex.SWN3Lexicon()

    # Tag prefix -> lexicon lookup for that word class.
    sentiment_lookups = (
        ('RB', lexicon.getadverb),
        ('NN', lexicon.getnoun),
        ('JJ', lexicon.getadjective),
        ('VB', lexicon.getverb),
    )

    n_words = [len(comment.split()) for comment in documents]
    n_chars = [len(comment) for comment in documents]
    n_dwords = [
        sum(1 for word in tokenize_document(document)
            if dictionary.check(word))
        for document in documents
    ]

    sent_pos = []
    # NOTE(review): the negative score is accumulated like the positive one
    # but is not part of the returned matrix; adding it would change the
    # output shape, so it is only kept here for a future decision.
    sent_neg = []
    for comment in documents:
        count = 0.
        pos_sum = 0.
        neg_sum = 0.
        for word, tag in tagger(comment.lower()):
            if word == 'fakeinsult':
                # The placeholder is scored as fully negative without a
                # lexicon lookup.
                neg_sum += 1.
                count += 1.
                continue
            for prefix, lookup in sentiment_lookups:
                if tag.startswith(prefix):
                    sentiment = lookup(word)
                    pos_sum += sentiment[0]
                    neg_sum += sentiment[1]
                    count += 1.
                    break
        if count != 0:
            pos_sum /= count
            neg_sum /= count
        sent_neg.append(neg_sum)
        sent_pos.append(pos_sum)

    n_you_re = [len(re.findall(self.__you_re, document))
                for document in documents]
    n_you = [len(re.findall(self.__you, document))
             for document in documents]

    # number of uppercase words
    allcaps = [sum(1 for word in comment.split() if word.isupper())
               for comment in documents]

    # number of bad-word occurrences; lower-case each comment only once
    n_bad = []
    for comment in documents:
        lowered = comment.lower()
        n_bad.append(sum(lowered.count(bad) for bad in self.__badwords))

    exclamation = [comment.count("!") for comment in documents]
    addressing = [comment.count("@") for comment in documents]

    # ``float`` instead of the removed ``np.float`` alias. Clamping to 1
    # keeps the ratios finite for comments without any words instead of
    # producing NaN/inf from a division by zero.
    denominators = np.maximum(np.array(n_words, dtype=float), 1.)
    allcaps_ratio = np.array(allcaps) / denominators
    bad_ratio = np.array(n_bad) / denominators
    dic_ratio = np.array(n_dwords) / denominators

    return np.array([n_words, n_chars, n_dwords, n_you_re, n_you,
                     exclamation, allcaps, addressing, bad_ratio, n_bad,
                     allcaps_ratio, dic_ratio, sent_pos]).T
def transform(self, documents):
    """Turn raw comments into a matrix of hand-crafted numeric features.

    One row is produced per document. Columns, in order:

    ``n_words``, ``n_chars``, ``n_dwords`` (tokens accepted by the en_US
    enchant dictionary), ``n_you_re`` (matches of ``self.__you_re``),
    ``n_you`` (matches of ``self.__you``), ``exclamation`` (count of "!"),
    ``allcaps`` (upper-case words), ``addressing`` (count of "@"),
    ``bad_ratio``, ``n_bad`` (occurrences of any ``self.__badwords`` entry
    in the lower-cased comment), ``allcaps_ratio``, ``dic_ratio`` and
    ``sent_pos`` (mean of the first lexicon score over the scored words).

    Args:
        documents: re-iterable sequence of comment strings (it is traversed
            more than once).

    Returns:
        A NumPy array with one row per document and 13 columns.
    """
    import enchant
    import re
    import sentlex
    from pattern.en import tag as tagger
    from feature_extraction import tokenize_document

    dictionary = enchant.Dict("en_US")
    lexicon = sentlex.SWN3Lexicon()

    # Tag prefix -> lexicon lookup for that word class.
    sentiment_lookups = (
        ('RB', lexicon.getadverb),
        ('NN', lexicon.getnoun),
        ('JJ', lexicon.getadjective),
        ('VB', lexicon.getverb),
    )

    n_words = [len(comment.split()) for comment in documents]
    n_chars = [len(comment) for comment in documents]
    n_dwords = [
        sum(1 for word in tokenize_document(document)
            if dictionary.check(word))
        for document in documents
    ]

    sent_pos = []
    # NOTE(review): the negative score is accumulated like the positive one
    # but is not part of the returned matrix; adding it would change the
    # output shape, so it is only kept here for a future decision.
    sent_neg = []
    for comment in documents:
        count = 0.
        pos_sum = 0.
        neg_sum = 0.
        for word, tag in tagger(comment.lower()):
            if word == 'fakeinsult':
                # The placeholder is scored as fully negative without a
                # lexicon lookup.
                neg_sum += 1.
                count += 1.
                continue
            for prefix, lookup in sentiment_lookups:
                if tag.startswith(prefix):
                    sentiment = lookup(word)
                    pos_sum += sentiment[0]
                    neg_sum += sentiment[1]
                    count += 1.
                    break
        if count != 0:
            pos_sum /= count
            neg_sum /= count
        sent_neg.append(neg_sum)
        sent_pos.append(pos_sum)

    n_you_re = [len(re.findall(self.__you_re, document))
                for document in documents]
    n_you = [len(re.findall(self.__you, document))
             for document in documents]

    # number of uppercase words
    allcaps = [sum(1 for word in comment.split() if word.isupper())
               for comment in documents]

    # number of bad-word occurrences; lower-case each comment only once
    n_bad = []
    for comment in documents:
        lowered = comment.lower()
        n_bad.append(sum(lowered.count(bad) for bad in self.__badwords))

    exclamation = [comment.count("!") for comment in documents]
    addressing = [comment.count("@") for comment in documents]

    # ``float`` instead of the removed ``np.float`` alias. Clamping to 1
    # keeps the ratios finite for comments without any words instead of
    # producing NaN/inf from a division by zero.
    denominators = np.maximum(np.array(n_words, dtype=float), 1.)
    allcaps_ratio = np.array(allcaps) / denominators
    bad_ratio = np.array(n_bad) / denominators
    dic_ratio = np.array(n_dwords) / denominators

    return np.array([n_words, n_chars, n_dwords, n_you_re, n_you,
                     exclamation, allcaps, addressing, bad_ratio, n_bad,
                     allcaps_ratio, dic_ratio, sent_pos]).T
# NOTE(review): ``comments``, ``config_parser`` and the other free names are
# bound outside the visible span, and the original nesting level of these
# statements is not visible here; confirm the indentation against the full
# script.

# Build a language model from ``comments`` and pretty-print whatever
# ``model["sound"].samples()`` returns.
model = language_model(comments)
print "\nSamples: "
import pprint
printer = pprint.PrettyPrinter(indent=4)
printer.pprint(model["sound"].samples())
print "\n"

# Rebind ``model`` to a word2vec model built from ``comments`` and print the
# similarity it reports for one fixed word pair.
model = word2vec_model(comments)
print model.similarity('retarded', 'loser')

# Optional stage, enabled by the 'WordVec' flag of the execution section:
# train on the training data, vectorize the test comments, predict, report.
if config_parser.getboolean(EXECUTION_SECTION, 'WordVec'):
    from train import train_word2vec
    from train import w2vectorize
    from feature_extraction import tokenize_document
    # 500 is passed as ``n_dim`` here and again to ``w2vectorize`` below; the
    # two values must stay in sync.
    model, classifier = train_word2vec(categories, comments, 500)
    # NOTE(review): test comments are tokenized with stopwords='english';
    # verify that training uses the same tokenizer options.
    test_documents = [tokenize_document(document, stopwords='english')
                      for document in test_comments]
    test_vecs = w2vectorize(test_documents, model, 500)
    predictions = classifier.predict(test_vecs)
    print "\nWord2Vec Model Result\n"
    prediction_info(predictions, test_categories)

# Optional stage, enabled by the 'Final' flag: the classifier returned by
# ``train_assembling`` predicts directly on the raw test comments.
if config_parser.getboolean(EXECUTION_SECTION, 'Final'):
    from train import train_assembling
    classifier = train_assembling(categories, comments, badwords)
    predictions = classifier.predict(test_comments)
    print "\nFeature Ensemble Model Result\n"
    prediction_info(predictions, test_categories)