class TestSentenceTokenizer(unittest.TestCase):
    """Tests for SentenceTokenizer and the module-level sent_tokenize helper."""

    def setUp(self):
        # Shared fixture: a fresh tokenizer and a two-sentence sample text.
        self.tokenizer = SentenceTokenizer()
        self.text = "Beautiful is better than ugly. Simple is better than complex."

    def test_tokenize(self):
        # tokenize() returns the sentences as a list, in order.
        expected = [
            "Beautiful is better than ugly.",
            "Simple is better than complex.",
        ]
        assert_equal(self.tokenizer.tokenize(self.text), expected)

    @attr("skip")  # This is a known problem with the sentence tokenizer.
    def test_tokenize_with_multiple_punctuation(self):
        # Runs of terminal punctuation must stay attached to their sentence.
        text = "Hello world. How do you do?! My name's Steve..."
        expected = ["Hello world.", "How do you do?!", "My name's Steve..."]
        assert_equal(self.tokenizer.tokenize(text), expected)

        text2 = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(text2)
        assert_equal(len(tokens), 2)
        assert_equal(tokens, ["OMG!", "I am soooo LOL!!!"])

    def test_itokenize(self):
        # itokenize() yields the same sentences one at a time.
        sentence_iter = self.tokenizer.itokenize(self.text)
        assert_equal(next(sentence_iter), "Beautiful is better than ugly.")
        assert_equal(next(sentence_iter), "Simple is better than complex.")

    def test_sent_tokenize(self):
        # sent_tokenize() is lazy but agrees with SentenceTokenizer.tokenize().
        tokens = sent_tokenize(self.text)
        assert_true(is_generator(tokens))  # It's a generator
        assert_equal(list(tokens), self.tokenizer.tokenize(self.text))
class TestSentenceTokenizer(unittest.TestCase):
    """Tests for SentenceTokenizer and the module-level sent_tokenize helper."""

    def setUp(self):
        # Shared fixture: a fresh tokenizer and a two-sentence sample text.
        self.tokenizer = SentenceTokenizer()
        self.text = "Beautiful is better than ugly. Simple is better than complex."

    def test_tokenize(self):
        # tokenize() returns the sentences as a list, in order.
        expected = [
            "Beautiful is better than ugly.",
            "Simple is better than complex.",
        ]
        assert_equal(self.tokenizer.tokenize(self.text), expected)

    @attr("skip")  # This is a known problem with the sentence tokenizer.
    def test_tokenize_with_multiple_punctuation(self):
        # Runs of terminal punctuation must stay attached to their sentence.
        text = "Hello world. How do you do?! My name's Steve..."
        expected = ["Hello world.", "How do you do?!", "My name's Steve..."]
        assert_equal(self.tokenizer.tokenize(text), expected)

        text2 = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(text2)
        assert_equal(len(tokens), 2)
        assert_equal(tokens, ["OMG!", "I am soooo LOL!!!"])

    def test_itokenize(self):
        # itokenize() yields the same sentences one at a time.
        sentence_iter = self.tokenizer.itokenize(self.text)
        assert_equal(next(sentence_iter), "Beautiful is better than ugly.")
        assert_equal(next(sentence_iter), "Simple is better than complex.")

    def test_sent_tokenize(self):
        # sent_tokenize() is lazy but agrees with SentenceTokenizer.tokenize().
        tokens = sent_tokenize(self.text)
        assert_true(is_generator(tokens))  # It's a generator
        assert_equal(list(tokens), self.tokenizer.tokenize(self.text))
def extract_global_bag_of_words_processed(df_comments):
    """Build one space-joined string of lemmatized words per comment.

    For every row of ``df_comments`` the ``comment_content`` value is decoded
    as ASCII (undecodable bytes are dropped), split into sentences, lowercased
    and POS-tagged.  English stop words are removed, each remaining token is
    reduced to its first run of ``[a-z]`` characters (tokens without one are
    dropped), lemmatized with the WordNet tag derived from its Penn tag, and
    kept only if the lemma is longer than one character.

    Args:
        df_comments: DataFrame with a ``comment_content`` column holding
            byte strings (``.decode`` is called on each value).

    Returns:
        list of str: one entry per row, in iteration order.
    """
    lemmatizer = WordNetLemmatizer()
    tb = Blobber(pos_tagger=PerceptronTagger())
    sentencer = SentenceTokenizer()

    # Loop-invariant work hoisted out of the per-token path: the stop-word
    # list used to be re-read and linearly scanned for every single token.
    stop_words = set(stopwords.words('english'))
    alpha_run = re.compile('[a-z]+')

    corpus = []
    for processed, (_, row) in enumerate(df_comments.iterrows(), 1):
        comm = row['comment_content']
        tokens = []
        for sent in sentencer.tokenize(comm.decode('ascii', 'ignore')):
            for word, tag in tb(sent.lower()).tags:
                # Remove stops
                if word in stop_words:
                    continue
                # Remove punctuation: keep only the first alphabetic run
                match = alpha_run.search(word.lower())
                if match is None:
                    continue
                # Lemmatize
                lemma = lemmatizer.lemmatize(match.group(0), penn_to_wn(tag))
                if len(lemma) > 1:
                    tokens.append(lemma)
        corpus.append(' '.join(tokens))
        if processed % 1000 == 0:
            # Single formatted argument prints identically on Python 2 and 3.
            print("%d words processed for Ngrams" % processed)
    return corpus
class LexicalBigramUnigramAnalyzer(object):
    """Callable analyzer emitting lemmatized POS-filtered bigrams and unigrams.

    Calling an instance with a document returns a flat token list.  For each
    sentence the selected bigrams come first, followed by every non-stop-word
    unigram of that sentence.
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.tb = Blobber(pos_tagger=PerceptronTagger())
        self.sentencer = SentenceTokenizer()

    def __call__(self, doc):
        """Tokenize ``doc`` into lemmatized bigrams and unigrams.

        Args:
            doc: byte string; it is decoded as ASCII and undecodable bytes
                are dropped.

        Returns:
            list of str: bigrams are two lemmas joined by a single space.
        """
        # Build the stop-word set once per document instead of re-reading the
        # list for every token.
        stop_words = set(stopwords.words('english'))
        lemmatize = self.lemmatizer.lemmatize

        tokens = []
        for sent in self.sentencer.tokenize(doc.decode('ascii', 'ignore')):
            tagged = [(word, penn_to_wn(tag))
                      for word, tag in self.tb(sent.lower()).tags]
            tagged = [pair for pair in tagged if pair[0] not in stop_words]

            # Materialise the adjacent pairs: they are scanned once per rule,
            # which would exhaust a lazy zip after the first rule.
            ng = list(zip(tagged, tagged[1:]))

            # Adjective followed by noun.
            rule1 = [(a, b) for a, b in ng
                     if a[1] == wn.ADJ and b[1] == wn.NOUN]
            # Adverb/verb in either order.
            rule2 = [(a, b) for a, b in ng
                     if (a[1] == wn.ADV and b[1] == wn.VERB)
                     or (a[1] == wn.VERB and b[1] == wn.ADV)]
            # Verb followed by verb.
            rule3 = [(a, b) for a, b in ng
                     if a[1] == wn.VERB and b[1] == wn.VERB]
            # Noun followed by noun.
            rule4 = [(a, b) for a, b in ng
                     if a[1] == wn.NOUN and b[1] == wn.NOUN]
            filtered_list = rule1 + rule2 + rule3 + rule4

            # Lemmatize
            tokens.extend(
                lemmatize(a[0], a[1]) + ' ' + lemmatize(b[0], b[1])
                for a, b in filtered_list
            )
            tokens.extend(lemmatize(word, pos) for word, pos in tagged)
        return tokens
def comment_to_sentences(comment, remove_stops=False):
    """Split ``comment`` into sentences and convert each one to a word list.

    Args:
        comment: text handed to ``SentenceTokenizer.tokenize``.
        remove_stops: forwarded unchanged to ``comment_to_wordlist``.

    Returns:
        list: one ``comment_to_wordlist`` result per non-empty sentence,
        in sentence order.
    """
    splitter = SentenceTokenizer()
    return [
        comment_to_wordlist(sentence, remove_stops)
        for sentence in splitter.tokenize(comment)
        if len(sentence) > 0
    ]
def comment_to_sentences(comment, remove_stops=False):
    """Turn a comment into a list of per-sentence word lists.

    Args:
        comment: text handed to ``SentenceTokenizer.tokenize``.
        remove_stops: forwarded unchanged to ``comment_to_wordlist``.

    Returns:
        list: the ``comment_to_wordlist`` result for every sentence whose
        length is greater than zero, in sentence order.
    """
    word_lists = []
    for sentence in SentenceTokenizer().tokenize(comment):
        if len(sentence) == 0:
            # Nothing to convert for an empty sentence.
            continue
        word_lists.append(comment_to_wordlist(sentence, remove_stops))
    return word_lists
class TestSentenceTokenizer(unittest.TestCase):
    """Tests for SentenceTokenizer.tokenize and SentenceTokenizer.itokenize."""

    def setUp(self):
        # Shared fixture: a fresh tokenizer and a two-sentence sample text.
        self.tokenizer = SentenceTokenizer()
        self.text = "Beautiful is better than ugly. Simple is better than complex."

    def test_tokenize(self):
        # tokenize() returns the sentences as a list, in order.
        expected = ["Beautiful is better than ugly.",
                    "Simple is better than complex."]
        assert_equal(self.tokenizer.tokenize(self.text), expected)

    def test_tokenize_with_multiple_punctuation(self):
        # Runs of terminal punctuation must stay attached to their sentence.
        text = "Hello world. How do you do?! My name's Steve..."
        expected = ["Hello world.", "How do you do?!", "My name's Steve..."]
        assert_equal(self.tokenizer.tokenize(text), expected)

        text2 = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(text2)
        assert_equal(len(tokens), 2)
        assert_equal(tokens, ["OMG!", "I am soooo LOL!!!"])

    def test_itokenize(self):
        # itokenize() yields the same sentences one at a time.
        sentence_iter = self.tokenizer.itokenize(self.text)
        assert_equal(next(sentence_iter), "Beautiful is better than ugly.")
        assert_equal(next(sentence_iter), "Simple is better than complex.")
class CharacterAnalyzer(object):
    """Callable analyzer producing character n-grams for each sentence.

    ``min`` and ``max`` are the inclusive bounds on the n-gram length.
    """

    def __init__(self):
        self.sentencer = SentenceTokenizer()
        self.max = 8
        self.min = 2

    def __call__(self, doc):
        """Return every character n-gram of ``doc``, sentence by sentence.

        The document is lowercased and split into sentences, and
        ``string.punctuation`` characters are removed from each sentence.
        For each length from ``self.min`` to ``self.max``, all contiguous
        substrings of that length are emitted in left-to-right order.
        """
        grams = []
        for sentence in self.sentencer.tokenize(doc.lower()):
            stripped = ''.join(
                char for char in sentence if char not in string.punctuation
            )
            for size in range(self.min, self.max + 1):
                # A non-positive start count yields an empty range, so
                # sentences shorter than ``size`` contribute nothing.
                start_count = len(stripped) - size + 1
                grams.extend(
                    stripped[start:start + size] for start in range(start_count)
                )
        return grams
class TestSentenceTokenizer(unittest.TestCase):
    """Tests for SentenceTokenizer.tokenize and SentenceTokenizer.itokenize."""

    def setUp(self):
        # Shared fixture: a fresh tokenizer and a two-sentence sample text.
        self.tokenizer = SentenceTokenizer()
        self.text = "Beautiful is better than ugly. Simple is better than complex."

    def test_tokenize(self):
        # tokenize() returns the sentences as a list, in order.
        expected = [
            "Beautiful is better than ugly.",
            "Simple is better than complex.",
        ]
        assert_equal(self.tokenizer.tokenize(self.text), expected)

    def test_tokenize_with_multiple_punctuation(self):
        # Runs of terminal punctuation must stay attached to their sentence.
        text = "Hello world. How do you do?! My name's Steve..."
        expected = ["Hello world.", "How do you do?!", "My name's Steve..."]
        assert_equal(self.tokenizer.tokenize(text), expected)

        text2 = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(text2)
        assert_equal(len(tokens), 2)
        assert_equal(tokens, ["OMG!", "I am soooo LOL!!!"])

    def test_itokenize(self):
        # itokenize() yields the same sentences one at a time.
        sentence_iter = self.tokenizer.itokenize(self.text)
        assert_equal(next(sentence_iter), "Beautiful is better than ugly.")
        assert_equal(next(sentence_iter), "Simple is better than complex.")
class CharacterSkipGramAnalyzer(object):
    """Callable analyzer emitting each word plus its one-character-deleted forms."""

    def __init__(self):
        self.sentencer = SentenceTokenizer()
        self.worder = WordTokenizer()

    def __call__(self, doc):
        """Return words and single-deletion variants for ``doc``.

        The document is lowercased and split into sentences.
        ``string.punctuation`` characters are removed from each sentence
        before it is split into words.  Every word is emitted
        (whitespace-stripped).  A word longer than two characters is followed
        by one variant per character position, with that character removed.
        """
        tokens = []
        for sentence in self.sentencer.tokenize(doc.lower()):
            cleaned = ''.join(
                char for char in sentence if char not in string.punctuation
            )
            for word in self.worder.tokenize(cleaned):
                tokens.append(word.strip())
                if len(word) <= 2:
                    # Too short: deleting a character leaves almost nothing.
                    continue
                tokens.extend(
                    (word[:skip] + word[skip + 1:]).strip()
                    for skip in range(len(word))
                )
        return tokens
import string
from FeatureExtraction.mainExtractor import CharacterAnalyzer
from textblob.tokenizers import SentenceTokenizer, WordTokenizer

# Scratch script: prints every word of a sample sentence together with the
# variants obtained by deleting one character from words longer than two
# characters.

sentencer = SentenceTokenizer()
worder = WordTokenizer()
sentences = ['How are you? I am fine!']

tokens = []
for sent in sentencer.tokenize(sentences[0].lower()):
    # Drop punctuation before splitting the sentence into words.
    words = ''.join(ch for ch in sent if ch not in string.punctuation)
    words = worder.tokenize(words)
    for word in words:
        tokens.append(word.strip())
        if len(word) <= 2:
            continue
        for j in range(len(word)):
            term = word[:j] + word[j + 1:]
            tokens.append(term.strip())

print(tokens)
import string
from FeatureExtraction.mainExtractor import CharacterAnalyzer
from textblob.tokenizers import SentenceTokenizer, WordTokenizer

# Scratch script: prints every word of a sample sentence together with the
# variants obtained by deleting one character from words longer than two
# characters.

sentencer = SentenceTokenizer()
worder = WordTokenizer()
sentences = ['How are you? I am fine!']

tokens = []
for sent in sentencer.tokenize(sentences[0].lower()):
    # Drop punctuation before splitting the sentence into words.
    words = ''.join(ch for ch in sent if ch not in string.punctuation)
    words = worder.tokenize(words)
    for word in words:
        tokens.append(word.strip())
        if len(word) <= 2:
            continue
        for j in range(len(word)):
            term = word[:j] + word[j + 1:]
            tokens.append(term.strip())

print(tokens)
def _entity_chunk_tokens(sentence):
    """Return the tokens of one sentence with named-entity chunks merged.

    The sentence is split with ``words``, POS-tagged and passed through
    ``nltk.ne_chunk(..., binary=True)``.  The leaves of each chunk subtree are
    joined with spaces into a single token, and other items contribute their
    word.  Tokens are stripped of surrounding punctuation and kept only if
    more than one character remains.
    """
    tagged = nltk.pos_tag(words(sentence))
    pieces = []
    for chunk in nltk.ne_chunk(tagged, binary=True):
        if isinstance(chunk, nltk.Tree):
            pieces.append(' '.join(leaf[0] for leaf in chunk.leaves()))
        else:
            pieces.append(chunk[0])
    stripped = (piece.strip(string.punctuation) for piece in pieces)
    return [piece for piece in stripped if len(piece) > 1]


def extract_feature_matrix(df_comments, df_thread_groupby):
    """Compute the 25-column feature matrix for a set of comments.

    The row used for a comment is the index label that
    ``df_comments.iterrows()`` yields for its ``comment_id``.
    NOTE(review): this presumes ``df_comments`` has a 0..n-1 integer index;
    confirm against the caller.

    Columns, named after the helper that produces each value:
        0  timeliness            1  mktime(date) - first date in thread
        2  lengthiness           3  len(comment)
        4-6  pos_freq (verb, noun, pronoun)
        7  capital_frequency     8  sent_frequency '?'
        9  sent_frequency '!'    10 sentence_capital_frequency
        11 entropy               12 lexical_diversity
        13 missing_words count   14 missing_words / unique tokens
        15 swears count          16 swears / unique tokens
        17 mean termf            18 tf_idf
        19 F_K_score             20 onSubForumTopic (thread tokens)
        21 onSubForumTopic (article tokens)
        22 classifier prob('pos')  23 get_subjectivity
        24 get_polarity_overlap

    Args:
        df_comments: DataFrame with ``comment_id``, ``comment_content``
            (byte strings) and ``article_body`` columns.
        df_thread_groupby: iterable of ``(key, group)`` pairs whose groups
            additionally carry a ``date`` column accepted by ``mktime``.

    Returns:
        numpy.ndarray of shape ``(len(df_comments), 25)``.
    """
    print("START")

    # Sentence Tokenizer
    sentencer = SentenceTokenizer()
    clf = load_classifier(sentiment_path + 'sentiment_classifier.pickle')

    # Zero-initialised so a cell that no branch below writes (for example a
    # comment that appears in no thread group) holds 0 rather than
    # uninitialised memory.
    featureMatrix = np.zeros([df_comments.shape[0], 25])

    feature_dict = dict()
    for ix, row in df_comments.iterrows():
        feature_dict[row['comment_id']] = ix

    # ---- Per-comment features ------------------------------------------
    feature_count = 0
    for _, row in df_comments.iterrows():
        index = feature_dict[row['comment_id']]
        features = featureMatrix[index]  # view: writes land in featureMatrix

        comm = row['comment_content'].decode('ASCII', 'ignore')
        tokens = words(comm)
        unique_tokens = set(tokens)
        sentences = sentencer.tokenize(comm)

        features[3] = len(comm)

        verb_fr, noun_fr, pronoun_fr = pos_freq(tokens)
        features[4] = verb_fr
        features[5] = noun_fr
        features[6] = pronoun_fr

        features[7] = capital_frequency(tokens)
        features[8] = sent_frequency(sentences, '?')
        features[9] = sent_frequency(sentences, '!')
        features[10] = sentence_capital_frequency(sentences)

        features[11] = entropy(comm)
        features[12] = lexical_diversity(tokens)

        if len(tokens) == 0:
            # No tokens: avoid dividing by an empty vocabulary.
            features[13] = 0
            features[14] = 0
            features[15] = 0
            features[16] = 0
        else:
            spelt_wrong = missing_words(unique_tokens)
            bad_words_list = swears(unique_tokens)
            vocabulary_size = float(len(unique_tokens))

            features[13] = len(spelt_wrong)
            features[14] = len(spelt_wrong) / vocabulary_size
            features[15] = len(bad_words_list)
            features[16] = len(bad_words_list) / vocabulary_size

        features[19] = F_K_score(sentences, tokens)

        testSet = dict()
        refWords = make_full_dict(tokens)
        testSet.update(refWords)

        probDist = clf.prob_classify(testSet)
        sentiment = probDist.prob('pos')
        subj_obj = get_subjectivity(probDist)
        polarity_overlap = get_polarity_overlap(
            words(row['article_body']), tokens, clf)

        features[22] = sentiment
        features[23] = subj_obj
        features[24] = polarity_overlap

        feature_count += 1
        if feature_count % 1000 == 0:
            print(feature_count)

    print("DONE")

    # ---- Thread-level features -----------------------------------------
    feature_count = 0
    # Grouped
    for _, group in df_thread_groupby:
        thread_comments = [row['comment_content']
                           for _, row in group.iterrows()]

        # Running totals for the average gap between comments and the
        # average comment length within this thread.
        sumTime = 0
        sumLen = 0
        count = 0
        first = previous = mktime(group.iloc[0]['date'])

        thread_tokens = []

        # Within Thread
        for _, row in group.iterrows():
            index = feature_dict[row['comment_id']]
            features = featureMatrix[index]

            comm = row['comment_content'].decode('ascii', 'ignore')
            tokens = words(comm)
            sentences = sentencer.tokenize(comm)
            posted = mktime(row['date'])
            # Length features use the undecoded comment text.
            raw_tokens = words(row['comment_content'])

            # Ongoing average time
            sumTime += posted - previous
            count += 1
            avgTime = sumTime / float(count)

            # Ongoing average length
            sumLen += len(raw_tokens)
            avgLen = sumLen / float(count)

            # The cumulative thread tokens up to and including this comment.
            for sent in sentences:
                thread_tokens += _entity_chunk_tokens(sent)

            # Tokens of the whole article body.  Every sentence contributes;
            # previously only the last sentence's tokens survived because the
            # list was overwritten instead of extended.
            article_tokens = []
            for sent in sentencer.tokenize(row['article_body']):
                article_tokens += _entity_chunk_tokens(sent)

            features[0] = timeliness(posted, previous, max(avgTime, 1))
            previous = posted

            features[1] = posted - first
            features[2] = lengthiness(raw_tokens, max(avgLen, 1))

            features[17] = np.mean(
                [termf(comm.count(w), tokens) for w in set(tokens)])
            features[18] = tf_idf(comm, thread_comments)

            features[20] = onSubForumTopic(tokens, thread_tokens)
            features[21] = onSubForumTopic(tokens, article_tokens)

            feature_count += 1
            if feature_count % 1000 == 0:
                print(feature_count)

    return featureMatrix