Example #1
# The excerpt relies on these imports (the sent_pol alias is assumed from the
# variable name used below):
from nltk.corpus import sentence_polarity as sent_pol
from sklearn.model_selection import train_test_split


def data_preparation_train_test():
    """Build the training and test sets.

    Returns:
        train_sents: list of training sentences (each a list of words).
        test_sents: list of test sentences (each a list of words).
        train_labels: list of integer classes (0: negative, 1: positive)
        for the training sentences.
        test_labels: list of integer classes (0: negative, 1: positive)
        for the test sentences.
    """
    # The category names ("pos"/"neg") come from the NLTK documentation for this corpus.
    positive_sents = sent_pol.sents(categories="pos")
    n_pos_sents = len(positive_sents)
    negative_sents = sent_pol.sents(categories="neg")
    n_neg_sents = len(negative_sents)
    # db_indexes: each position corresponds to one sentence of the corpus.
    db_indexes = [i for i in range(n_pos_sents + n_neg_sents)]
    db_sents = positive_sents + negative_sents
    # db_labels: each position holds the opinion label of the corresponding
    # sentence, aligned position by position with db_indexes.
    db_labels = [1] * n_pos_sents + [0] * n_neg_sents
    train_indexes, test_indexes, train_labels, test_labels = train_test_split(
        db_indexes, db_labels, test_size=0.2, shuffle=True, stratify=db_labels)

    train_sents = [db_sents[i] for i in train_indexes]
    test_sents = [db_sents[i] for i in test_indexes]

    return (train_sents, test_sents, train_labels, test_labels)
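A minimal usage sketch (assuming the imports above and that the sentence_polarity corpus has been downloaded with nltk.download):

# Hedged usage sketch, not part of the original example.
train_sents, test_sents, train_labels, test_labels = data_preparation_train_test()
print(len(train_sents), len(test_sents))  # roughly an 80/20 split of the 10,662 sentences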
Example #2
from nltk.corpus import sentence_polarity


def getSentPolarities():
    p = sentence_polarity.sents(categories='pos')
    n = sentence_polarity.sents(categories='neg')
    # extractWords is a helper defined elsewhere in the original source file.
    neg_sents = [(extractWords(sentence), 'neg') for sentence in n]
    pos_sents = [(extractWords(sentence), 'pos') for sentence in p]

    return (neg_sents, pos_sents)
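The excerpt does not show extractWords. A purely hypothetical stand-in that keeps lowercased alphabetic tokens might look like this; the real helper may differ:

def extractWords(sentence):
    # Hypothetical helper (not from the original source): the corpus already
    # yields tokenized sentences, so just lowercase and keep alphabetic words.
    return [w.lower() for w in sentence if w.isalpha()]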
    def __init__(self):
        """
        constructor
        """
        self.positive_sentences = []
        self.negative_sentences = []

        response1 = input(
            'Would you like to test sentiment with local text data? (Y/N) ')
        if response1.lower() in ('y', 'yes'):
            positive_file = input(
                'Input the path for the positive sentiment data: ')
            negative_file = input(
                'Input the path for the negative sentiment data: ')
            if os.path.exists(positive_file):
                # read positive sentences
                with open(positive_file, "r") as reader:
                    self.positive_sentences = reader.readlines()
                self.positive_sentences = [
                    sent.rstrip() for sent in self.positive_sentences
                ]
            if os.path.exists(negative_file):
                # read negative sentences
                with open(negative_file, "r") as reader:
                    self.negative_sentences = reader.readlines()
                self.negative_sentences = [
                    sent.rstrip() for sent in self.negative_sentences
                ]
        else:
            # use the 5331 positive and 5331 negative sentence_polarity sentences
            # as testing data; scoring the full corpus would need a huge amount of
            # lexica, so only a small sample (the first 10 of each class) is used
            response2 = input(
                'Would you like to test sentiment with data from sentence_polarity? (Y/N) '
            )
            if response2.lower() in ('y', 'yes'):
                # negative words
                self.negative_lexica = opinion_lexicon.negative()
                self.negative_lexica_size = len(self.negative_lexica)
                # positive words
                self.positive_lexica = opinion_lexicon.positive()
                self.positive_lexica_size = len(self.positive_lexica)

                # sentence sentiment categories
                self.senti_categories = sentence_polarity.categories()
                # negative sentiment sentences
                self.negative_sentences = sentence_polarity.sents(
                    categories=['neg'])[:10]  # get the first 10 sentences
                self.negative_sentences = [
                    ' '.join(sent) for sent in self.negative_sentences
                ]
                self.negative_sentences_size = len(self.negative_sentences)
                # positive sentiment sentences
                self.positive_sentences = sentence_polarity.sents(
                    categories=['pos'])[:10]  # get the first 10 sentences
                self.positive_sentences = [
                    ' '.join(sent) for sent in self.positive_sentences
                ]
                self.positive_sentences_size = len(self.positive_sentences)
Example #4
def load_sentence_polarity():
    from nltk.corpus import sentence_polarity

    vocab = Vocab.build(sentence_polarity.sents())

    train_data = [(vocab.convert_tokens_to_idx(sentence), 0) for sentence in
                  sentence_polarity.sents(categories="pos")[:4000]] \
                 + [(vocab.convert_tokens_to_idx(sentence), 1) for sentence in
                    sentence_polarity.sents(categories="neg")[:4000]]

    test_data = [(vocab.convert_tokens_to_idx(sentence), 0) for sentence in
                 sentence_polarity.sents(categories="pos")[4000:]] \
                + [(vocab.convert_tokens_to_idx(sentence), 1) for sentence in
                   sentence_polarity.sents(categories="neg")[4000:]]

    return train_data, test_data, vocab
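Vocab here is a project-specific class not included in the excerpt. A minimal, hypothetical implementation matching the two calls used above (Vocab.build and convert_tokens_to_idx) could be:

from collections import Counter

class Vocab:
    # Hypothetical minimal Vocab (not the original project's class); it only
    # supports the two methods the example above relies on.
    def __init__(self, tokens):
        self.idx_to_token = list(tokens)
        self.token_to_idx = {t: i for i, t in enumerate(self.idx_to_token)}

    @classmethod
    def build(cls, sents, min_freq=1):
        counts = Counter(tok for sent in sents for tok in sent)
        uniq = ["<unk>"] + [t for t, c in counts.items() if c >= min_freq]
        return cls(uniq)

    def convert_tokens_to_idx(self, tokens):
        unk = self.token_to_idx["<unk>"]
        return [self.token_to_idx.get(t, unk) for t in tokens]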
    # Tail of the preprocess() helper used below: lemmatize each
    # (word, POS-tag) pair, dropping single-character tokens.
    t = [
        lemmatizer.lemmatize(w[0], get_pos(w[1])) for w in pos_tags
        if len(w[0]) > 1
    ]
    # Optionally combine the tokens back into a single string
    # t = ' '.join(t)
    return t
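The fragment above depends on a get_pos helper and a lemmatizer instance that are not shown. A plausible, hypothetical version maps Penn Treebank tags to WordNet POS constants:

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def get_pos(treebank_tag):
    # Hypothetical helper: map a Penn Treebank POS tag to a WordNet POS
    # constant for the lemmatizer; nouns are the default.
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN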


# print(dataset[0][0])
# print(preprocess(dataset[0][0]))

# In[5]:

# Get the sentence corpus and look at some sentences
sentences = sentence_polarity.sents()
documents = []
for cat in sentence_polarity.categories():
    for sent in sentence_polarity.sents(categories=cat):
        documents.append((preprocess(' '.join(sent)), cat))
# print(documents)
# documents = [(sent,cat) for cat in sentence_polarity.categories() for sent in sentence_polarity.sents(categories=cat)]
random.shuffle(documents)
all_words_list = [word for (sent, cat) in documents for word in sent]
all_words = nltk.FreqDist(all_words_list)
# get the 200 most frequently appearing keywords in the corpus
word_items = all_words.most_common(200)
# print(word_items)
word_features = [word for (word, count) in word_items]
# print(word_features)
Example #6
    # Tail of the tokenize() helper used below: strip punctuation characters
    # from each token, then lemmatize what is left.
    no_punc = []
    for review in tokenized:
        line = "".join(char for char in review if char not in string.punctuation)
        no_punc.append(line)
    tokens = lemmatize(no_punc)
    return tokens

def lemmatize(tokens):
    lmtzr = WordNetLemmatizer()
    lemma = [lmtzr.lemmatize(t) for t in tokens]
    return lemma

reviews = reviews.apply(lambda x: tokenize(x))

import nltk
import random
from nltk.corpus import sentence_polarity

sentences = sentence_polarity.sents()
documents = [(sent, cat) for cat in sentence_polarity.categories()
             for sent in sentence_polarity.sents(categories=cat)]
random.shuffle(documents)
all_words_list = [word for (sent, cat) in documents for word in sent]
all_words = nltk.FreqDist(all_words_list)
word_items = all_words.most_common(100)
word_features = [word for (word, freq) in word_items]

def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features
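A short usage sketch for document_features, mirroring the classifier training that appears in Example #11 below (the 90/10 split is an arbitrary choice):

# Hedged usage sketch: build feature sets and train an NLTK Naive Bayes classifier.
featuresets = [(document_features(d, word_features), c) for (d, c) in documents]
cut = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[cut:], featuresets[:cut]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))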
Example #7
# Author: Joyce Woznica
# Lab - Week 8
#
import nltk

# get movie review
from nltk.corpus import sentence_polarity
import random

# get the sentence corpus and look at some sentences
# had to download nltk sentence_polarity
nltk.download('sentence_polarity')

sentences = sentence_polarity.sents()
print(len(sentences))
print(type(sentences))
print(sentence_polarity.categories())
# sentences are already tokenized, print the first four sentences
for sent in sentences[:4]:
    print(sent)

# look at the sentences by category to see how many are positive and how many negative
pos_sents = sentence_polarity.sents(categories='pos')
print(len(pos_sents))
neg_sents = sentence_polarity.sents(categories='neg')
print(len(neg_sents))

## set up the movie review sentences for classification
# create a list of documents; each document is one sentence (a list of words) paired with its category
documents = [(sent, cat) for cat in sentence_polarity.categories()
             for sent in sentence_polarity.sents(categories=cat)]
Example #8
#!/usr/bin/python
# -*- coding: utf-8 -*-

from nltk.corpus import sentence_polarity

print(sentence_polarity.sents())
print(sentence_polarity.categories())
print(sentence_polarity.sents()[1])
    print(len(opinion_lexicon.words()))
    ## negative lexicon
    print('negative lexicon:')
    print(opinion_lexicon.negative()[:4])
    print(len(opinion_lexicon.negative()))
    ## positive lexicon
    print('positive lexicon:')
    print(opinion_lexicon.positive()[:4])
    print(len(opinion_lexicon.positive()))
    print()

    print('-------------------------------------------------------')

    # sentence polarity
    print('all sentences:')
    print(sentence_polarity.sents())
    print(len(sentence_polarity.sents()))

    print('sentence categories:')
    print(sentence_polarity.categories())

    print('examples:')
    print(sentence_polarity.sents()[0])
    print(sentence_polarity.sents()[10661])
    print(len(sentence_polarity.sents()))
    print()

    print('-------------------------------------------------------')

    # negative sentences in sentence_polarity
    print('negative sentences:')
Example #10
Script makes use of :class:`grakel.WeisfeilerLehman`, :class:`grakel.VertexHistogram`
"""
from __future__ import print_function

print(__doc__)

import numpy as np
import time

from nltk import word_tokenize
from nltk.corpus import sentence_polarity

from grakel.kernels import WeisfeilerLehman, VertexHistogram
from grakel import Graph

sents = sentence_polarity.sents()
sents = [sent for sent in sents if len(sent) > 1]
n_sents = 3000
sents = sents[:n_sents]
print("Loaded %d sentences\n" % n_sents)

print("Creating word co-occurrence networks\n")
word_networks = list()
for sent in sents:

    node_labels = dict()
    tokens_to_ids = dict()
    for token in sent:
        if token not in tokens_to_ids:
            tokens_to_ids[token] = len(tokens_to_ids)
            node_labels[tokens_to_ids[token]] = token
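    # --- Hedged continuation sketch: the excerpt is cut off above; the exact
    # --- co-occurrence window and kernel parameters are assumptions.
    # Connect tokens that appear next to each other in the sentence.
    edges = {i: dict() for i in range(len(tokens_to_ids))}
    for a, b in zip(sent[:-1], sent[1:]):
        u, v = tokens_to_ids[a], tokens_to_ids[b]
        if u != v:
            edges[u][v] = 1.0
            edges[v][u] = 1.0
    word_networks.append(Graph(edges, node_labels=node_labels))

# Compute the Weisfeiler-Lehman subtree kernel over the co-occurrence networks.
start = time.time()
gk = WeisfeilerLehman(n_iter=2, base_graph_kernel=VertexHistogram, normalize=True)
K = gk.fit_transform(word_networks)
print("Computed %dx%d kernel matrix in %.2f seconds" % (K.shape[0], K.shape[1], time.time() - start))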
Example #11
printmeasures('neg', refneg, testneg)
printmeasures('pos', refpos, testpos)

# Using movie review corpus and top words from amazon baby review
all_sent = nltk.word_tokenize(st)
all_sent_list = [nltk.word_tokenize(sent) for sent in all_sent]
words = [w.lower() for w in all_sent if w.isalpha()]
stopwords = nltk.corpus.stopwords.words('english')
all_words_list = [word for word in words if word not in stopwords]
all_words = nltk.FreqDist(all_words_list)
word_items = all_words.most_common(2000)
word_features = [word for (word, freq) in word_items]


      
sentences = sentence_polarity.sents()
documents = [(sent, cat) for cat in sentence_polarity.categories() for sent in sentence_polarity.sents(categories=cat)]

random.shuffle(documents)

featuresets = [(document_features(d, word_features), c) for (d, c) in documents]
training_size = int(len(featuresets) * 0.5)
train_set, test_set = featuresets[training_size:], featuresets[:training_size]
classifier3 = nltk.NaiveBayesClassifier.train(train_set)
print('Testing Accuracy: ', nltk.classify.accuracy(classifier3, test_set))  # 0.63421 // 0.6450

      
reflist = []
testlist = []
for (features, label) in test_set:
    reflist.append(label)