Example #1
def get_formatted_train_test_data(category, feature_extractor=None, split=0.7):
    '''
    Helper function. Splits the data evenly across positive and negative, then formats it
    ready for Naive Bayes. You can also optionally pass in a custom feature extractor
    (see next section) and a custom split ratio.
    '''
    arcr = AmazonReviewCorpusReader()
    pos_train, pos_test = split_data(arcr.positive().category(category).documents(), split)
    neg_train, neg_test = split_data(arcr.negative().category(category).documents(), split)
    train = format_data(pos_train, "pos", feature_extractor) + format_data(neg_train, "neg", feature_extractor)
    test  = format_data(pos_test, "pos", feature_extractor) + format_data(neg_test, "neg", feature_extractor)
    return train, test  # return in the order the name promises: (train, test)
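A minimal usage sketch, assuming the Sussex NLTK Amazon data is installed and that format_data (used above) produces NLTK-ready (features, label) pairs; "dvd" is just an illustrative category:

from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

train, test = get_formatted_train_test_data("dvd", split=0.8)
nb_classifier = NaiveBayesClassifier.train(train)  # train on the formatted pairs
print(accuracy(nb_classifier, test))               # accuracy on the held-out pairs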
Example #2
def get_training_test_data(cat, ratio=0.7):
    """
    Given a category of review and a ratio, make appropriate partitions of positive and negative reviews from that category
    :param cat: A String specifying the category of review e.g., "dvd"
    :param ratio: A float specifying the proportion of training documents, default value = 0.7
    :return: A pair of lists where first element is the training set and second element is the testing set
    """
    reader = AmazonReviewCorpusReader().category(cat)
    pos_train, pos_test = split_data(reader.positive().documents(), ratio=ratio)
    neg_train, neg_test = split_data(reader.negative().documents(), ratio=ratio)
    train_data = [(review, 'P') for review in pos_train] + [(review, 'N') for review in neg_train]
    test_data = [(review, 'P') for review in pos_test] + [(review, 'N') for review in neg_test]
    return train_data, test_data
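The pairs above hold raw review objects, so they need feature dicts before NLTK's Naive Bayes can use them. A sketch of that conversion, assuming review.words() as used elsewhere on this page:

def bag_of_words(review):
    # mark each token of the review as present (review.words() as in the other examples)
    return {word: True for word in review.words()}

train_data, test_data = get_training_test_data("dvd")
train_instances = [(bag_of_words(review), label) for review, label in train_data]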
Example #4
from random import sample  # needed for the index sampling below


def split_data(data, ratio=0.7):
    """
    Randomly split data into a training and a test partition.

    :param data: A corpus generator.
    :param ratio: The proportion of training documents (default 0.7)
    :return: a pair (tuple) of lists where the first element of the
            pair is a list of the training data and the second is a list of the test data.
    """

    data = list(data)  # data is a generator, so realise all generated items in a list

    n = len(data)  # number of samples present
    train_indices = sample(range(n), int(n * ratio))  # randomly select training indices
    test_indices = list(set(range(n)) - set(train_indices))  # the remaining indices are for testing

    train = [data[i] for i in train_indices]  # use training indices to select data
    test = [data[i] for i in test_indices]  # use testing indices to select data

    return (train, test)  # return the split data


#Create an Amazon corpus reader pointing at only book reviews
book_reader = AmazonReviewCorpusReader().category("book")

for ratio in [0.1, 0.5, 0.9]:
    pos_train, pos_test = split_data(book_reader.positive().documents(), ratio)
    neg_train, neg_test = split_data(book_reader.negative().documents(), ratio)
    print(ratio, len(pos_train), len(pos_test))


#Create an Amazon corpus reader pointing at only dvd reviews
dvd_reader = AmazonReviewCorpusReader().category("dvd")

#The following two lines use the documents function on the Amazon corpus reader.
#This returns a generator over reviews in the corpus.
#Each review is an instance of a Python class called AmazonReview.
#An AmazonReview object contains all the data about a review.
pos_train, pos_test = split_data(dvd_reader.positive().documents())
neg_train, neg_test = split_data(dvd_reader.negative().documents())

#You can also combine the training data
train = pos_train + neg_train
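Each element of these lists is an AmazonReview; as a quick sanity-check sketch (rating() and words() are the methods used elsewhere on this page):

first_review = train[0]
print(first_review.rating())      # the star rating attached to the review
print(first_review.words()[:10])  # the first ten tokens of the review text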


from functools import reduce  # reduce lives in functools in Python 3


def get_all_words(amazon_reviews):
    return reduce(lambda words, review: words + review.words(), amazon_reviews, [])
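The reduce above copies the growing list at every step; an equivalent linear-time sketch using itertools:

from itertools import chain

def get_all_words_chained(amazon_reviews):
    # concatenate the token lists of all reviews without repeated list copying
    return list(chain.from_iterable(review.words() for review in amazon_reviews))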
Example #6
class RunExperiment(object):
    def __init__(self, domein):

        # path for saving result files
        self.path = "N:\\NLP_experiments\\"

        # category for the main experiments
        self.Main_reader = AmazonReviewCorpusReader().category(domein)
        self.domein = domein

        # set of readers for cross-domain validation
        self.KitchenCorpusReader = AmazonReviewCorpusReader().category("kitchen")
        self.BookCorpusReader = AmazonReviewCorpusReader().category("book")
        self.DvdCorpusReader = AmazonReviewCorpusReader().category("dvd")
        self.ElectronicsCorpusReader = AmazonReviewCorpusReader().category("electronics")

        # dictionary of corpus readers
        self.CorpusReaders = {'kitchen': self.KitchenCorpusReader,
                              'book': self.BookCorpusReader,
                              'dvd': self.DvdCorpusReader,
                              'electronics': self.ElectronicsCorpusReader}
               
                        
    def RunWordList(self, number_of_words, type_of_Feature_extractor=0, type_of_extraction=0):
        # simple word-list classifier
        # type_of_extraction: 0 for Extraxt_n_most_Freq_Words, 1 for Extraxt_words_above_count,
        # anything else for the hand-made word lists below

        if type_of_extraction in (0, 1):
            # create the word-list extractor
            FrWords = ExtractorOfWords.ExtractorOfWords(self.pos_training_data, self.neg_training_data, type_of_Feature_extractor)
            if type_of_extraction == 0:
                # extract the n most frequent words
                pos_w_list, neg_w_list = FrWords.Extraxt_n_most_Freq_Words(number_of_words)
            else:
                # extract all words above the given count
                pos_w_list, neg_w_list = FrWords.Extraxt_words_above_count(number_of_words)
        else:
            # hand-made word lists
            pos_w_list = ["nice", "cool", "awesome", "superb", "effective", "inspiring", "clever", "positive"]
            neg_w_list = ["bad", "expensive", "disappointed", "terrible", "pathetic", "ridiculous", "silly", "disappointment"]

        Classifier = SimpleClassifier.SimpleClassifier(pos_w_list, neg_w_list)

        return evaluate_wordlist_classifier(Classifier, self.pos_testing_data, self.neg_testing_data)
      
    def RunBayesNetwork(self, type_of_Feature_extractor):
        # Naive Bayes classifier; returns accuracy

        # Map the selector onto a feature extractor; any other value means the
        # default formatting (format_data treats a missing extractor as the
        # default, as in the helper of Example #1)
        extractors = {1: BNFormat.Feature_extractor1,
                      2: BNFormat.Feature_extractor2,
                      3: BNFormat.Feature_extractor3,
                      4: BNFormat.Feature_extractor4}
        extractor = extractors.get(type_of_Feature_extractor)

        # Format the positive and negative training data separately
        formatted_pos_training = BNFormat.format_data(self.pos_training_data, "pos", extractor)
        formatted_neg_training = BNFormat.format_data(self.neg_training_data, "neg", extractor)
        # Same again but for the testing data
        formatted_pos_testing = BNFormat.format_data(self.pos_testing_data, "pos", extractor)
        formatted_neg_testing = BNFormat.format_data(self.neg_testing_data, "neg", extractor)

        # Combine them
        formatted_training_data = formatted_pos_training + formatted_neg_training
        formatted_testing_data = formatted_pos_testing + formatted_neg_testing

        # Train on a list of reviews
        nb_classifier = NaiveBayesClassifier.train(formatted_training_data)

        # Print the features that the NB classifier found most informative
        nb_classifier.show_most_informative_features()

        # Test on another list of reviews
        return accuracy(nb_classifier, formatted_testing_data)
                           
    def WordList_NumberOfWords_Experiment(self, number_of_iterations, type_of_extraction, type_of_Feature_extractor):
        # set of values for the number of words to extract
        num_of_words = range(10, 210, 20)

        results = numpy.zeros([len(num_of_words), number_of_iterations])
        result_mean = numpy.zeros(len(num_of_words))
        result_std = numpy.zeros(len(num_of_words))

        # save the experiment results in a file
        file_results = open(self.path + "WordList_NumberOfWords_Experiment" + strftime("%Y-%m-%d_%H-%M", gmtime()) + '.csv', 'w', newline='')
        csv_writer = csv.writer(file_results)

        for i in range(0, number_of_iterations):
            print("Iteration " + str(i))
            self.Resample()
            for k in range(0, len(num_of_words)):
                # extract the n most frequent words
                results[k, i] = self.RunWordList(num_of_words[k], type_of_Feature_extractor, type_of_extraction)

        # calculate mean and std
        for j in range(0, len(num_of_words)):
            result_mean[j] = numpy.mean(results[j, :])
            result_std[j] = numpy.std(results[j, :])

        # write the set of word-count values used for extraction
        csv_writer.writerow(["Word list experiment. Number of tests for each case: " + str(number_of_iterations)])
        csv_writer.writerow(num_of_words)
        csv_writer.writerow([" "])

        # write mean and std for each number of extracted words
        csv_writer.writerow(result_mean)
        csv_writer.writerow(result_std)

        file_results.close()
            
    def WordList_v_NaiveBayes(self, number_of_iterations):

        # arrays for the results of all types of test
        results_BaseLine = numpy.zeros(number_of_iterations)
        results_WordList_Freq = numpy.zeros(number_of_iterations)
        results_WordList_Count = numpy.zeros(number_of_iterations)

        mean_results = numpy.zeros(3)
        mean_std = numpy.zeros(3)

        # arrays of per-iteration differences ("z-tests") between the word-list
        # approaches and the Naive Bayes baseline
        Z_WordList_Freq = numpy.zeros(number_of_iterations)
        Z_WordList_Count = numpy.zeros(number_of_iterations)

        Z_mean_results = numpy.zeros(3)
        Z_mean_std = numpy.zeros(3)

        file_results = open(self.path + "WordList_v_NaiveBayes" + strftime("%Y-%m-%d_%H-%M", gmtime()) + '.csv', 'w', newline='')
        csv_writer = csv.writer(file_results)

        for i in range(0, number_of_iterations):
            print("Iteration " + str(i))
            # resample data
            self.Resample()
            # run Naive Bayes
            results_BaseLine[i] = self.RunBayesNetwork(0)
            # the n most frequent words
            results_WordList_Freq[i] = self.RunWordList(90, 1, 0)
            # words with a count over n
            results_WordList_Count[i] = self.RunWordList(50, 0, 1)

            # per-iteration accuracy differences for the word-list approaches
            Z_WordList_Freq[i] = results_WordList_Freq[i] - results_BaseLine[i]
            Z_WordList_Count[i] = results_WordList_Count[i] - results_BaseLine[i]
        
        # compute mean and std values
        mean_results[0] = numpy.mean(results_BaseLine)
        mean_std[0] = numpy.std(results_BaseLine)   

        mean_results[1] = numpy.mean(results_WordList_Freq)
        mean_std[1] = numpy.std(results_WordList_Freq)  
        
        mean_results[2] = numpy.mean(results_WordList_Count)
        mean_std[2] = numpy.std(results_WordList_Count)
        
        #z-test value
        Z_mean_results[1] = numpy.mean(Z_WordList_Freq)
        Z_mean_std[1] = numpy.std(Z_WordList_Freq)
        
        Z_mean_results[2] = numpy.mean(Z_WordList_Count)
        Z_mean_std[2] = numpy.std(Z_WordList_Count)        
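        # Note: the two arrays above hold means and stds of per-iteration accuracy
        # differences rather than z statistics themselves; if an actual paired z
        # value is wanted, a rough sketch would be:
        #   z_freq = Z_mean_results[1] / (Z_mean_std[1] / numpy.sqrt(number_of_iterations))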
        
        
        #write domain name in file 
        csv_writer.writerow(["Name of domain: " + self.domein, "Number of tests for each case: "  + str(number_of_iterations)])     
        # header for results
        csv_writer.writerow(["Naive Bayes","N_Most_Freq","Word_count"]) 
        
        csv_writer.writerow(["Feature extractors"])   
        #mean and std values for feature extractors 
        csv_writer.writerow(mean_results)
        csv_writer.writerow(mean_std)
        
        csv_writer.writerow(["Z-tests"])  
        #mean and std values for z-tests
        csv_writer.writerow(Z_mean_results)
        csv_writer.writerow(Z_mean_std)
                     
        file_results.close()
                     
    # training-set size experiment

    def TraingSetSizeExperiment(self, number_of_iterations, type_of_feature_extractor):

        # save the experiment results in a file
        file_results = open(self.path + "TrSetExp" + strftime("%Y-%m-%d_%H-%M", gmtime()) + '.csv', 'w', newline='')
        csv_writer = csv.writer(file_results)

        # number of sampling steps for the training ratio
        number_of_steps = 10
        
        #arrays for results 
        results = numpy.zeros((number_of_steps, number_of_iterations))
        result_mean = numpy.zeros(number_of_steps)
        result_std = numpy.zeros(number_of_steps)
        
        # set of ratios for the training set
        SRatio = numpy.linspace(0.05, 1, number_of_steps)

        # write the ratios to the file
        csv_writer.writerow(["Ratio of training set"])
        csv_writer.writerow(SRatio)

        j = 0

        for ratio in SRatio:
            print(ratio)
            for i in range(0, number_of_iterations):
                # resample with a fixed test-set size; the ratio changes only the training-set size
                self.Resample_mod(ratio)
                # run the Naive Bayes classifier
                results[j,i] = self.RunBayesNetwork(type_of_feature_extractor)
            #save results for given ratio
            result_mean[j] = numpy.mean(results[j,:])    
            result_std[j] = numpy.std(results[j,:]) 
            j = j + 1
        
        #write results in file
        csv_writer.writerow(["Results mean and std"])
        csv_writer.writerow(result_mean)
        csv_writer.writerow(result_std)
               
        file_results.close()
        self.plot_regression(SRatio*1600, result_mean, "Mean of accuracy")
        self.plot_regression(SRatio*1600, result_std, "Std of accuracy")
        return results, result_mean, result_std
    
    # feature extraction experiment
    
    def FeatureExtractionExperiment(self, number_of_iterations):

        # arrays for the results of all types of feature extractor
        results_BaseLine = numpy.zeros(number_of_iterations)
        results_FeatureExt = numpy.zeros((number_of_iterations, 5))

        mean_results = numpy.zeros(5)
        mean_std = numpy.zeros(5)

        # arrays of per-iteration differences ("z-tests") between the feature
        # extractors and the plain Naive Bayes baseline
        Z_results_FExt = numpy.zeros((number_of_iterations, 5))

        Z_mean_results = numpy.zeros(5)
        Z_mean_std = numpy.zeros(5)

        file_results = open(self.path + "BayesFeatureTests" + strftime("%Y-%m-%d_%H-%M", gmtime()) + '.csv', 'w', newline='')
        csv_writer = csv.writer(file_results)

        # write the domain name to the file
        csv_writer.writerow(["Name of domain: " + self.domein, "Number of tests for each case: " + str(number_of_iterations)])

        # Hypothesis 1: filtering improves the quality of Naive Bayes
        for i in range(0, number_of_iterations):
            print("Iteration " + str(i))
            # resample data
            self.Resample()
            # run without feature extraction
            results_BaseLine[i] = self.RunBayesNetwork(0)
            # Porter stemmer
            results_FeatureExt[i, 1] = self.RunBayesNetwork(1)
            # lowercase versions of all the words
            results_FeatureExt[i, 2] = self.RunBayesNetwork(2)
            # replace all number tokens with "NUM"
            results_FeatureExt[i, 3] = self.RunBayesNetwork(3)
            # filter stop words and non-alphabetic tokens
            results_FeatureExt[i, 4] = self.RunBayesNetwork(4)

            # per-iteration accuracy differences for the feature extractors
            Z_results_FExt[i, 1] = results_FeatureExt[i, 1] - results_BaseLine[i]
            Z_results_FExt[i, 2] = results_FeatureExt[i, 2] - results_BaseLine[i]
            Z_results_FExt[i, 3] = results_FeatureExt[i, 3] - results_BaseLine[i]
            Z_results_FExt[i, 4] = results_FeatureExt[i, 4] - results_BaseLine[i]
        
        
        
        # compute mean and std values
        mean_results[0] = numpy.mean(results_BaseLine)
        mean_std[0] = numpy.std(results_BaseLine)   
          
        for k in range(1,5):
            #feature extractions value
            mean_results[k] = numpy.mean(results_FeatureExt[:,k])
            mean_std[k] = numpy.std(results_FeatureExt[:,k])
            #z-test value
            Z_mean_results[k] = numpy.mean(Z_results_FExt[:,k])
            Z_mean_std[k] = numpy.std(Z_results_FExt[:,k])
        
        # header for results
        csv_writer.writerow(["BaseLine","Porter Stemmer","Lowercase","Number replace","Stoplist and lowercase"]) 
        
        csv_writer.writerow(["Feature extractors"])  
         
        #mean and std values for feature extractors 
        csv_writer.writerow(mean_results)
        csv_writer.writerow(mean_std)
        
        csv_writer.writerow(["Z-tests"])  
        
        #mean and std values for z-tests
        csv_writer.writerow(Z_mean_results)
        csv_writer.writerow(Z_mean_std)
                     
        file_results.close()
        print(results_BaseLine)
        print(results_FeatureExt)
        
        self.plot_results(mean_results,"Naive Bayes feature extraction. Means", ["BaseLine","Porter Stemmer","Lowercase","Numbers replace","Stoplist"], "Accuracy Means")
        self.plot_results(mean_std,"Naive Bayes feature extraction. Std.", ["BaseLine","Porter Stemmer","Lowercase","Numbers replace","Stoplist"], "Accuracy Std")
    # cross-domain experiment
    
    def CrossDomainExperiment(self, number_of_iterations, type_of_feature_extractor):
        
        file_results = open(self.path +"CrossDomainTest" + strftime("%Y-%m-%d_%H-%M", gmtime()) + '.csv','wb')      
        csv_writer = csv.writer(file_results)
        results = numpy.zeros((number_of_iterations))
        mean_results = numpy.zeros(4)
        std_results  = numpy.zeros(4)
        
        # run tests for all domains
        for training_domein in ["book","dvd","kitchen","electronics","all"]:
            csv_writer.writerow(["Training domain:" + training_domein, "Number of tests for each case:: " + str(number_of_iterations)])
            k = 0
            for testing_domein in ["book","dvd","kitchen","electronics"]:                              
                for i in range(0, number_of_iterations):
                    #cross domain resample                    
                    self.Resample_cross_domain(training_domein,testing_domein)
                    # run the Naive Bayes classifier
                    results[i] = self.RunBayesNetwork(type_of_feature_extractor)
                
                # mean and std for this set of tests
                mean_results[k] = numpy.mean(results)
                std_results[k] = numpy.std(results)
                k = k + 1
                
            # header for results
            csv_writer.writerow(["book","dvd","kitchen","electronics"])   
            csv_writer.writerow(mean_results)
            csv_writer.writerow(std_results) 
                                  
        file_results.close()
        
    def Resample(self, ratio = 0.8):
        self.pos_training_data, self.pos_testing_data = Split_data.split_data(self.Main_reader.positive().documents(),ratio)
        self.neg_training_data, self.neg_testing_data = Split_data.split_data(self.Main_reader.negative().documents(),ratio)
        
    def Resample_mod(self, ratio = 0.8):
        # keep the test-set size fixed; the ratio changes only the training portion
        self.pos_training_data, self.pos_testing_data = Split_data.split_data_mod(self.Main_reader.positive().documents(),ratio)
        self.neg_training_data, self.neg_testing_data = Split_data.split_data_mod(self.Main_reader.negative().documents(),ratio)
    
    def Resample_cross_domain(self, train_domain, test_domain, ratio=0.8):

        if train_domain == test_domain:
            # in this case use the ordinary sampling
            self.Resample(ratio)

        elif train_domain == "all":
            self.pos_training_data = list()
            self.neg_training_data = list()
            # collect data from all domains
            for domain in ["book", "dvd", "kitchen", "electronics"]:
                # get the corpus reader from the dictionary of all corpus readers
                data_extractor = self.CorpusReaders[domain]
                # extract training and testing data for the test domain
                if domain == test_domain:
                    train_pos, self.pos_testing_data = Split_data.split_data(data_extractor.positive().documents(), ratio)
                    train_neg, self.neg_testing_data = Split_data.split_data(data_extractor.negative().documents(), ratio)
                # extract only training data
                else:
                    train_pos = Split_data.split_data(data_extractor.positive().documents(), ratio)[0]
                    train_neg = Split_data.split_data(data_extractor.negative().documents(), ratio)[0]
                # keep 25% of the training data to normalise the size of the training set
                new_train_pos = Split_data.split_data(train_pos, 0.25)[0]
                new_train_neg = Split_data.split_data(train_neg, 0.25)[0]

                # add to the final training set
                self.pos_training_data = self.pos_training_data + new_train_pos
                self.neg_training_data = self.neg_training_data + new_train_neg

        else:
            # extract training data from the first domain
            training_extractor = self.CorpusReaders[train_domain]
            self.pos_training_data = Split_data.split_data(training_extractor.positive().documents(), ratio)[0]
            self.neg_training_data = Split_data.split_data(training_extractor.negative().documents(), ratio)[0]

            # extract test data from the second domain
            testing_extractor = self.CorpusReaders[test_domain]
            train_pos, self.pos_testing_data = Split_data.split_data(testing_extractor.positive().documents(), ratio)
            train_neg, self.neg_testing_data = Split_data.split_data(testing_extractor.negative().documents(), ratio)
      
    #plot results
    def plot_results(self, results, title, xlabels, ylabel="Accuracy"):
        '''Plot a bar graph of results'''   
        ind = numpy.arange(len(results))    
        width = 0.4
        plt.bar(ind,results,width,color="#1AADA4")    
        plt.ylabel(ylabel)
        
        ymin = (numpy.min(results)*0.8)
        ymax=(numpy.max(results)*1.2)
        plt.ylim(ymin,ymax)  
        
        plt.xticks(ind+width/2.0,xlabels)    
        plt.title(title)    
        plt.show()
    
    #plot linear regression     
    def plot_regression(self, x, y, ylabel_):

        A = numpy.vstack([x, numpy.ones(len(x))])
        
        w = numpy.linalg.lstsq(A.T,y)[0]
        
        line = w[0]*x+w[1] # regression line
        plt.ylabel(ylabel_)
        plt.xlabel("Number documents in training set")
        plt.plot(x, line,'r-', x, y,'o')
        plt.show()
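A hedged end-to-end sketch of driving this class, assuming the Sussex NLTK Amazon data and the helper modules used above (Split_data, BNFormat, ExtractorOfWords, SimpleClassifier) are importable; "dvd" is an arbitrary category:

experiment = RunExperiment("dvd")
experiment.Resample()                    # fresh 80/20 train/test split
print(experiment.RunBayesNetwork(2))     # Naive Bayes with the lowercasing extractor
print(experiment.RunWordList(90, 0, 0))  # word-list classifier on the 90 most frequent words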
Example #7
    pyplot.grid(True)
    pyplot.xticks(range(1, num_of_ranks+1, 2), range(1, num_of_ranks+1, 2))
    pyplot.xlim([0, num_of_ranks+2])
    if show_values:
        for xi, yi in zip(x, y):
            pyplot.text(xi+0.25, yi+50, yi, verticalalignment="bottom", rotation=55, fontsize="small")
    pyplot.show()
    print("Plot complete.")

#zipf_dist plots the distribution and returns nothing, so call it directly
zipf_dist(fdist_text5)
zipf_dist(filtered_fdist_text5)


#AMAZON
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader
arcr = AmazonReviewCorpusReader()
positive_reviews = arcr.positive()
negative_reviews = arcr.negative()
dvd_reviews = arcr.category("dvd")
positive_dvd_reviews = dvd_reviews.positive()
tokens1 = positive_dvd_reviews.sample_words(102152)
print(len(tokens1))
print(len(set(tokens1)))
print(len(tokens1) / len(set(tokens1)))  # tokens per type
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))  # build the stopword set once for speed
filtered_tokens1 = [w for w in tokens1 if w.isalpha() and w not in stop_words]
print(len(filtered_tokens1))
print(len(set(filtered_tokens1)))
print(len(filtered_tokens1) / len(set(filtered_tokens1)))
X = set(filtered_tokens1)
long_wordsA = [w for w in X if len(w) > 15]
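The printed ratios are tokens per type (how often each distinct word occurs on average); a small helper, just a sketch, makes the repeated computation explicit:

def tokens_per_type(tokens):
    # average number of occurrences per distinct token
    return len(tokens) / len(set(tokens))

print(tokens_per_type(tokens1), tokens_per_type(filtered_tokens1))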
Example #8
from functools import reduce  # reduce lives in functools in Python 3

def get_all_words(amazon_reviews):
    return reduce(lambda words, review: words + review.words(), amazon_reviews, [])

from nltk.corpus import stopwords  # needed for the stopword filter below

def feature_extractor(amazon_review):
    # Extract all words from the review
    list_of_words = amazon_review.words()
    #Get lowercase versions of all the words
    lowercase_words = [word.lower() for word in list_of_words]
    #Replace all number tokens with "NUM"
    words_numbers_removed = ["NUM" if word.isdigit() else word for word in lowercase_words]
    #Filter out non-alphabetic words and stopwords; build the stopword set once for speed
    stop_words = set(stopwords.words('english'))
    words = [word for word in words_numbers_removed if word.isalpha() and word not in stop_words]
    return words
 

#Create an Amazon corpus reader pointing at only book reviews
book_reader = AmazonReviewCorpusReader().category("book")
 
#Get even random splits, where each data set is a list of AmazonReview objects.
pos_training_data, pos_testing_data = split_data(book_reader.positive().documents()) #See the note above this code snippet for a description of the "documents" method.
neg_training_data, neg_testing_data = split_data(book_reader.negative().documents())

#Get some extra book data from the unlabelled reviews, using the star rating
extra_book_positive = [r for r in book_reader.unlabeled(["book"]).documents() if r.rating() > 4.9]
extra_book_negative = [r for r in book_reader.unlabeled(["book"]).documents() if r.rating() < 1.1]
 
#You can also combine the training data
training_data = pos_training_data + neg_training_data
testing_data = pos_testing_data + neg_testing_data

#Shuffle the training data; assuming this is random.shuffle, it shuffles the list
#in place and returns None, so don't keep its return value
from random import shuffle
shuffle(training_data)
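To plug this extractor into the Naive Bayes pipeline, a hedged sketch reusing the format_data helper from Example #1 (assuming it and NLTK are importable here):

from nltk.classify import NaiveBayesClassifier

#format_data is the helper from Example #1; feature_extractor is defined above
formatted_training_data = format_data(pos_training_data, "pos", feature_extractor) + \
                          format_data(neg_training_data, "neg", feature_extractor)
nb_classifier = NaiveBayesClassifier.train(formatted_training_data)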
Example #9
'''
Created on 30 Sep 2013

@author: el271
'''
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader     #import reader class

arcr = AmazonReviewCorpusReader()         #create new reader
 
positive_reviews = arcr.positive()         #store a reader pointing at all positive reviews
negative_reviews = arcr.negative()         #pointing at all negative
dvd_reviews = arcr.category("dvd")          #pointing at all dvd
positive_dvd_reviews = dvd_reviews.positive()     #pointing at all positive dvd
 
 
from sussex_nltk.corpus_readers import ReutersCorpusReader   

rcr = ReutersCorpusReader()            #Create new reader
sport_cr = rcr.category("sport")       #Create a reader pointing at sport articles only
finance_cr = rcr.category("finance")


from sussex_nltk.corpus_readers import WSJCorpusReader   #import the corpus reader
wsjcr = WSJCorpusReader()  

from sussex_nltk.corpus_readers import MedlineCorpusReader  #import the reader
mcr = MedlineCorpusReader()         #create a new corpus reader

from sussex_nltk.corpus_readers import TwitterCorpusReader   #import the corpus reader
tcr = TwitterCorpusReader()
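As a quick sanity check that a reader is wired up, you can sample raw tokens from it; sample_words is shown on an Amazon reader in Example #7, and the sketch below assumes it behaves the same here:

tokens = positive_dvd_reviews.sample_words(50)  #50 random tokens from positive dvd reviews
print(tokens[:10])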