Example no. 1
class TestSentenceTokenizer(unittest.TestCase):

    def setUp(self):
        self.tokenizer = SentenceTokenizer()
        self.text = "Beautiful is better than ugly. Simple is better than complex."

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text),
            ["Beautiful is better than ugly.", "Simple is better than complex."])

    @attr("skip")  # This is a known problem with the sentence tokenizer.
    def test_tokenize_with_multiple_punctuation(self):
        text = "Hello world. How do you do?! My name's Steve..."
        assert_equal(self.tokenizer.tokenize(text),
            ["Hello world.", "How do you do?!", "My name's Steve..."])
        text2 = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(text2)
        assert_equal(len(tokens), 2)
        assert_equal(tokens,
            ["OMG!", "I am soooo LOL!!!"])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Beautiful is better than ugly.")
        assert_equal(next(gen), "Simple is better than complex.")

    def test_sent_tokenize(self):
        tokens = sent_tokenize(self.text)
        assert_true(is_generator(tokens))  # It's a generator
        assert_equal(list(tokens), self.tokenizer.tokenize(self.text))
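These unit tests (they appear to come from TextBlob's own test suite) omit their imports. A minimal sketch of what they would need, assuming nose-style assertion helpers; the is_generator helper is hypothetical, standing in for whatever the original test module defines:

# Import sketch for the test class above (assumption: nose-style test helpers).
import inspect
import unittest

from nose.tools import assert_equal, assert_true        # assertion helpers used above
from nose.plugins.attrib import attr                    # provides the @attr("skip") marker
from textblob.tokenizers import SentenceTokenizer, sent_tokenize

def is_generator(obj):
    # Hypothetical stand-in: the test only checks that sent_tokenize returns a generator.
    return inspect.isgenerator(obj)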
Example no. 3
def extract_global_bag_of_words_processed(df_comments):
    corpus = []   
    i = 0
    lemmatizer = WordNetLemmatizer()    
    tb = Blobber(pos_tagger=PerceptronTagger())
    sentencer = SentenceTokenizer()
    for _,row in df_comments.iterrows():  
        comm = row['comment_content']
        tokens = []   
        for sent in sentencer.tokenize(comm.decode('ascii','ignore')):
            tagged = tb(sent.lower()).tags    
            # Remove stopwords
            filtered_words = [w for w in tagged if w[0] not in stopwords.words('english')]

            # Remove punctuation: keep only the first alphabetic run of each token
            filtered_words = [(re.findall('[a-z]+', w[0].lower())[0], w[1]) for w in filtered_words if len(re.findall('[a-z]+', w[0].lower())) > 0]

            # Lemmatize, using the WordNet POS mapped from the Penn Treebank tag
            filtered_words = [lemmatizer.lemmatize(w[0], penn_to_wn(w[1])) for w in filtered_words]

            # Drop single-character tokens
            filtered_words = [w for w in filtered_words if len(w) > 1]

            for word in filtered_words:
                tokens.append(word)
        corpus.append(' '.join(tokens))
        i += 1
        if i % 1000 == 0:
            print i, "words processed for Ngrams"
                
            
    return corpus
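The function above is Python 2 code (print statement, str.decode) and relies on NLTK stopword/WordNet data plus a project-specific penn_to_wn helper that is not shown here. A hedged usage sketch, assuming those are available and that df_comments is a pandas DataFrame with a comment_content column:

# Usage sketch (assumptions: Python 2, NLTK corpora installed, penn_to_wn defined in the module).
import pandas as pd

df_comments = pd.DataFrame({
    'comment_content': [
        "Beautiful is better than ugly. Simple is better than complex.",
        "OMG! I am soooo LOL!!!",
    ],
})

corpus = extract_global_bag_of_words_processed(df_comments)
# corpus holds one space-joined string of lemmatized, stopword-free tokens per comment.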
Example no. 4
class LexicalBigramUnigramAnalyzer(object):   
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()    
        self.tb = Blobber(pos_tagger=PerceptronTagger())
        self.sentencer = SentenceTokenizer()
    def __call__(self, doc):   
        tokens = []     
        for sent in self.sentencer.tokenize(doc.decode('ascii','ignore')):
            tagged = self.tb(sent.lower()).tags    
            
            tagged = [(t[0], penn_to_wn(t[1])) for t in tagged]
            tagged = [(t[0], t[1]) for t in tagged if t[0] not in stopwords.words('english')]
            # Adjacent (word, POS) pairs; list() keeps the pairs reusable across the four rules on Python 3
            ng = list(zip(tagged, tagged[1:]))
            rule1 = [(t[0], t[1]) for t in ng if t[0][1] == wn.ADJ and t[1][1] == wn.NOUN]    # adjective + noun
            rule2 = [(t[0], t[1]) for t in ng if (t[0][1] == wn.ADV and t[1][1] == wn.VERB) or (t[0][1] == wn.VERB and t[1][1] == wn.ADV)]  # adverb + verb, either order
            rule3 = [(t[0], t[1]) for t in ng if t[0][1] == wn.VERB and t[1][1] == wn.VERB]   # verb + verb
            rule4 = [(t[0], t[1]) for t in ng if t[0][1] == wn.NOUN and t[1][1] == wn.NOUN]   # noun + noun
            
            filtered_list = rule1 + rule2 + rule3 + rule4
                             
                    
            # Lemmatize
            filtered_bigrams = [self.lemmatizer.lemmatize(t[0][0], t[0][1]) + ' ' + self.lemmatizer.lemmatize(t[1][0], t[1][1]) for t in filtered_list]
            filtered_unigrams = [self.lemmatizer.lemmatize(w[0], w[1]) for w in tagged]
            for bigram in filtered_bigrams:
                tokens.append(bigram)
            for unigram in filtered_unigrams:
                tokens.append(unigram)
        return tokens
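penn_to_wn, used in Examples no. 3 and 4, is a project helper that is not shown in these listings; it presumably maps Penn Treebank tags to the WordNet POS constants that the rule filters and WordNetLemmatizer expect. A hypothetical sketch of such a mapping:

# Hypothetical penn_to_wn (assumption: the project's real helper may treat unmappable tags differently).
from nltk.corpus import wordnet as wn

def penn_to_wn(tag):
    if tag.startswith('J'):
        return wn.ADJ    # adjectives: JJ, JJR, JJS
    if tag.startswith('V'):
        return wn.VERB   # verbs: VB, VBD, VBG, ...
    if tag.startswith('R'):
        return wn.ADV    # adverbs: RB, RBR, RBS
    return wn.NOUN       # default; also what WordNetLemmatizer assumes when no POS is given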
Example no. 5
def comment_to_sentences(comment, remove_stops=False):
    sentencer = SentenceTokenizer()

    corpus = []
    for sent in sentencer.tokenize(comment):
        if len(sent) > 0:
            corpus.append(comment_to_wordlist(sent, remove_stops))

    return corpus
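comment_to_wordlist is defined elsewhere in the project; a minimal way to exercise comment_to_sentences is with a hypothetical stand-in splitter:

# Usage sketch (assumption: comment_to_wordlist roughly splits a sentence into words;
# the real helper presumably also lowercases and can drop stopwords when remove_stops=True).
def comment_to_wordlist(sentence, remove_stops=False):
    return sentence.split()

print(comment_to_sentences("Beautiful is better than ugly. Simple is better than complex."))
# [['Beautiful', 'is', 'better', 'than', 'ugly.'], ['Simple', 'is', 'better', 'than', 'complex.']]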
Example no. 7
class TestSentenceTokenizer(unittest.TestCase):
    def setUp(self):
        self.tokenizer = SentenceTokenizer()
        self.text = "Beautiful is better than ugly. Simple is better than complex."

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text), [
            "Beautiful is better than ugly.", "Simple is better than complex."
        ])

    def test_tokenize_with_multiple_punctuation(self):
        text = "Hello world. How do you do?! My name's Steve..."
        assert_equal(self.tokenizer.tokenize(text),
                     ["Hello world.", "How do you do?!", "My name's Steve..."])
        text2 = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(text2)
        assert_equal(len(tokens), 2)
        assert_equal(tokens, ["OMG!", "I am soooo LOL!!!"])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Beautiful is better than ugly.")
        assert_equal(next(gen), "Simple is better than complex.")
Example no. 8
class CharacterAnalyzer(object):   
    def __init__(self):
        self.sentencer = SentenceTokenizer()
        self.max = 8
        self.min = 2
    def __call__(self, doc):  
        tokens = []      
        for sent in self.sentencer.tokenize(doc.lower()):
            words = ''.join([ch for ch in sent if ch not in string.punctuation])
            for n in range(self.min,self.max+1):
                ngr = [words[i:i+n] for i in range(len(words)-n+1)]
                if len(ngr) > 0:
                    tokens += ngr
        return tokens
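A class with a __call__(self, doc) method like this has the shape scikit-learn expects for a custom analyzer callable, which is presumably how it is consumed in the FeatureExtraction project (Example no. 11 imports it from FeatureExtraction.mainExtractor). A hedged sketch:

# Usage sketch (assumptions: scikit-learn is the intended consumer; import path as in Example no. 11).
from sklearn.feature_extraction.text import CountVectorizer
from FeatureExtraction.mainExtractor import CharacterAnalyzer

analyzer = CharacterAnalyzer()
print(analyzer("How are you?"))   # character 2- to 8-grams of "how are you" (spaces kept)

vectorizer = CountVectorizer(analyzer=CharacterAnalyzer())
X = vectorizer.fit_transform(["How are you? I am fine!", "OMG! I am soooo LOL!!!"])
# X: sparse document-term matrix of character n-gram counts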
Example no. 9
class TestSentenceTokenizer(unittest.TestCase):

    def setUp(self):
        self.tokenizer = SentenceTokenizer()
        self.text = "Beautiful is better than ugly. Simple is better than complex."

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text),
            ["Beautiful is better than ugly.", "Simple is better than complex."])

    def test_tokenize_with_multiple_punctuation(self):
        text = "Hello world. How do you do?! My name's Steve..."
        assert_equal(self.tokenizer.tokenize(text),
            ["Hello world.", "How do you do?!", "My name's Steve..."])
        text2 = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(text2)
        assert_equal(len(tokens), 2)
        assert_equal(tokens,
            ["OMG!", "I am soooo LOL!!!"])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Beautiful is better than ugly.")
        assert_equal(next(gen), "Simple is better than complex.")
Example no. 10
class CharacterSkipGramAnalyzer(object):   
    def __init__(self):
        self.sentencer = SentenceTokenizer()
        self.worder = WordTokenizer()
    def __call__(self, doc):  
        tokens = []      
        for sent in self.sentencer.tokenize(doc.lower()):
            words = ''.join([ch for ch in sent if ch not in string.punctuation])
            words = self.worder.tokenize(words)
            
            for word in words:
                tokens.append(word.strip())
                if len(word) > 2:
                    for j in range(0,len(word)):    
                        term = word[:j] + word[j+1:] 
                        tokens.append(term.strip())
        return tokens
Example no. 11
import string

from FeatureExtraction.mainExtractor import CharacterAnalyzer
from textblob.tokenizers import SentenceTokenizer, WordTokenizer

sentencer = SentenceTokenizer()
worder = WordTokenizer()

sentences = ['How are you? I am fine!']

tokens = []
for sent in sentencer.tokenize(sentences[0].lower()):
    words = ''.join([ch for ch in sent if ch not in string.punctuation])
    words = worder.tokenize(words)

    for word in words:
        tokens.append(word.strip())
        if len(word) > 2:
            for j in range(0, len(word)):
                term = word[:j] + word[j + 1:]
                tokens.append(term.strip())

print tokens
Example no. 13
def extract_feature_matrix(df_comments, df_thread_groupby):
    print "START"
    # Sentence Tokenizer
    sentencer = SentenceTokenizer()
    
    clf = load_classifier(sentiment_path + 'sentiment_classifier.pickle')
        
    featureMatrix = np.empty([df_comments.shape[0],25])
    
    feature_dict = dict()
    for ix, row in df_comments.iterrows():
        feature_dict[row['comment_id']] = ix
    
    feature_count = 0
    
    for _,row in df_comments.iterrows():
        index = feature_dict[row['comment_id']]
        
        comm = row['comment_content'].decode('ASCII', 'ignore')
        tokens = words(comm)
        unique_tokens = set(tokens)
        sentences = sentencer.tokenize(comm)
        
        featureMatrix[index][3] =  len(comm)
        
        verb_fr, noun_fr, pronoun_fr = pos_freq(tokens)
        featureMatrix[index][4] = verb_fr
        featureMatrix[index][5] = noun_fr
        featureMatrix[index][6] = pronoun_fr
        
        featureMatrix[index][7] = capital_frequency(tokens)
        featureMatrix[index][8] = sent_frequency(sentences, '?')
        featureMatrix[index][9] = sent_frequency(sentences, '!')
        featureMatrix[index][10] = sentence_capital_frequency(sentences)
        
        featureMatrix[index][11] = entropy(comm)
        featureMatrix[index][12] = lexical_diversity(tokens)
        
        
        if len(tokens) == 0:
            featureMatrix[index][13] =  0
            featureMatrix[index][14] =  0
            featureMatrix[index][15] =  0
            featureMatrix[index][16] =  0
        else:
            spelt_wrong = missing_words(unique_tokens)
            bad_words_list = swears(unique_tokens)
            
            featureMatrix[index][13] =  len(spelt_wrong)
            featureMatrix[index][14] =  len(spelt_wrong)/float(len(unique_tokens))
            featureMatrix[index][15] =  len(bad_words_list)
            featureMatrix[index][16] =  len(bad_words_list)/float(len(unique_tokens))
            
            
        featureMatrix[index][19] =  F_K_score(sentences, tokens)
        
        testSet = dict()
        refWords = make_full_dict(tokens)
        testSet.update(refWords)
    
        probDist = clf.prob_classify(testSet)                
        sentiment = probDist.prob('pos')            
        subj_obj = get_subjectivity(probDist)
    
        polarity_overlap = get_polarity_overlap(words(row['article_body']), tokens, clf)
        featureMatrix[index][22] =  sentiment
        featureMatrix[index][23] =  subj_obj
        featureMatrix[index][24] =  polarity_overlap
        
        feature_count += 1
        if feature_count % 1000 == 0:
            print feature_count
    
    print "DONE"
    
    feature_count = 0
    # Grouped
    for _,group in df_thread_groupby:
        thread_comments = [row['comment_content'] for _,row in group.iterrows()]
        
        # Get average time
        sumTime = 0 
        count = 0                
        previous = mktime(group.iloc[0]['date'])
        first = mktime(group.iloc[0]['date'])
        
        # Average length
        sumLen = 0 
        
        
        thread_tokens = []    
        
        # Within Thread
        for _, row in group.iterrows():
            index = feature_dict[row['comment_id']]
            comm = row['comment_content'].decode('ascii','ignore')
            tokens = words(comm)
            sentences = sentencer.tokenize(comm)
            
            # Ongoing average time
            sumTime += mktime(row['date']) - previous
            count += 1            
            avgTime = sumTime/float(count)
            
            # Ongoing average length
            sumLen += len(words(row['comment_content']))
            avgLen = sumLen/float(count)
            
            ######################################################################
            # Get chunked sentences
            for sent in sentences:
                sent_tokens = words(sent)
                sent_tokens_tagged = nltk.pos_tag(sent_tokens)
                chunks = nltk.ne_chunk(sent_tokens_tagged, binary=True)
                doc = [] 
                for chunk in chunks:
                    if type(chunk) == nltk.Tree:
                        doc.append(' '.join(c[0] for c in chunk.leaves()))
                    else:
                        doc.append(chunk[0])
                doc = [word.strip(string.punctuation) for word in doc if len(word.strip(string.punctuation)) > 1]
                
                # The cumulative tokens up to this point
                thread_tokens += doc
            
            ######################################################################
            article_tokens = []
            article_sentences = sentencer.tokenize(row['article_body'])
            for sent in article_sentences:
                sent_tokens = words(sent)
                sent_tokens_tagged = nltk.pos_tag(sent_tokens)
                chunks = nltk.ne_chunk(sent_tokens_tagged, binary=True)
                doc = []
                for chunk in chunks:
                    if type(chunk) == nltk.Tree:
                        doc.append(' '.join(c[0] for c in chunk.leaves()))
                    else:
                        doc.append(chunk[0])
                # Accumulate tokens across all article sentences
                article_tokens += [word.strip(string.punctuation) for word in doc if len(word.strip(string.punctuation)) > 1]
            
            ######################################################################
            
            
            featureMatrix[index][0] = timeliness(mktime(row['date']), previous, max(avgTime, 1))
            previous = mktime(row['date'])        
            
            featureMatrix[index][1] =  mktime(row['date']) - first  
            
            featureMatrix[index][2] = lengthiness(words(row['comment_content']), max(avgLen, 1))  
            
            featureMatrix[index][17] =  np.mean([termf(comm.count(w), tokens) for w in set(tokens)])  
            featureMatrix[index][18] =  tf_idf(comm, thread_comments)     
            
            featureMatrix[index][20] =  onSubForumTopic(tokens, thread_tokens)
            featureMatrix[index][21] =  onSubForumTopic(tokens, article_tokens)
    
    
            feature_count += 1
            if feature_count % 1000 == 0:
                print feature_count
    
    return featureMatrix
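The named-entity chunking pattern that appears twice above (for thread comments and for the article body) can be exercised on its own. A minimal sketch, assuming the usual nltk data packages (punkt, averaged_perceptron_tagger, maxent_ne_chunker, words) are installed; nltk.word_tokenize stands in for the project's words() helper:

# Minimal sketch of the NE-chunking step used above.
import string
import nltk

sent = "Steve met the team in New York."
sent_tokens = nltk.word_tokenize(sent)          # stand-in for the project's words() helper
chunks = nltk.ne_chunk(nltk.pos_tag(sent_tokens), binary=True)

doc = []
for chunk in chunks:
    if type(chunk) == nltk.Tree:
        # A binary NE chunk: join its leaves into one multi-word token.
        doc.append(' '.join(c[0] for c in chunk.leaves()))
    else:
        doc.append(chunk[0])

doc = [w.strip(string.punctuation) for w in doc if len(w.strip(string.punctuation)) > 1]
print(doc)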