Ejemplo n.º 1
0
class SentimentAnalyzer():

    # @param dir folder where the training set resides, and where to put the test result
    # @param model algorithm used for sentiment analysis
    def __init__(self, dir, model):
        """Set up the analyzer and reload previously saved model artifacts.

        Artifacts are looked up under ``<dir>/training/model``; whatever is
        missing falls back to an empty default so that training can rebuild it.

        :param dir: base folder of the training/testing data. The name shadows
            the ``dir`` builtin but is kept so keyword callers keep working.
        :param model: algorithm key, also used to locate ``<model>_clf.pickle``.
        """
        self.dir = dir
        self.model = model
        self.preprocessor = Preprocessor(dir)

        # open existing files if a model has been built before. no need to reprocess
        model_dir = f"{self.dir}/training/model"
        self.classifier = self._load_pickle_or(
            f"{model_dir}/{model}_clf.pickle", None)
        self.features = self._load_pickle_or(f"{model_dir}/features.pickle", [])
        self.classes = self._load_pickle_or(f"{model_dir}/classes.pickle", [])

        words_path = f"{model_dir}/most_common_words.txt"
        if os.path.isfile(words_path):
            with open(words_path, "r", encoding="utf8") as reader:
                self.most_common_words = [w.strip() for w in reader]
        else:
            self.most_common_words = []

    @staticmethod
    def _load_pickle_or(path, default):
        """Return the object pickled at ``path``, or ``default`` if no such file."""
        if not os.path.isfile(path):
            return default
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(path, "rb") as reader:
            return pickle.load(reader)

    # remove most common words (top 1%) that appear in both positive and negative documents
    def _remove_most_common_words(self, documents):
        """Remove words that are among the most frequent in every class.

        For each class in ``self.classes`` the most frequent tokens are taken
        (as many as 1% of that class's token count). The intersection over all
        classes is stored in ``self.most_common_words``, written to
        ``most_common_words.txt`` and stripped from every document.

        :param documents: objects exposing ``name``, ``content``,
            ``sentiment`` and ``location``.
        :return: new list of ``Document`` objects without the common words.
        """
        print("Define most common words...")

        # None means "no class processed yet". Set emptiness must not be used
        # as that marker: an empty intersection would otherwise be replaced by
        # the next class's words instead of staying empty.
        shared_words = None
        for cls in self.classes:
            doc_words = [
                w for d in documents if d.sentiment == cls
                for w in word_tokenize(d.content.replace(".", ""))
            ]
            fdist = nltk.FreqDist(doc_words)
            top_words = {
                w[0] for w in fdist.most_common(int(0.01 * len(doc_words)))
            }
            if shared_words is None:
                shared_words = top_words
            else:
                shared_words &= top_words

        most_common_words = shared_words if shared_words is not None else set()

        self.most_common_words = most_common_words
        with open(f"{self.dir}/training/model/most_common_words.txt",
                  "w",
                  encoding="utf8") as writer:
            writer.writelines([f"{w}\n" for w in self.most_common_words])

        ndocs = []
        total = len(documents)
        for doc_count, d in enumerate(documents, start=1):
            ncontent = " ".join([
                w for w in word_tokenize(d.content)
                if w not in most_common_words
            ])

            ndocs.append(Document(d.name, ncontent, d.sentiment, d.location))
            print("\r", end="")
            print("Removing most common words progress",
                  int(doc_count / total * 100),
                  "%",
                  end="",
                  flush=True)
        print("")
        return ndocs

    # only keep adjectives, adverbs, and nouns
    def _reduce_dimension_by_postag(self, documents):
        """Keep only the tokens tagged as adjective, adverb or noun.

        Periods are removed before tokenizing. Documents that end up with no
        remaining token are dropped from the result.

        :param documents: objects exposing ``name``, ``content``,
            ``sentiment`` and ``location``.
        :return: list of reduced ``Document`` objects.
        """
        reduced_documents = []

        total = len(documents)
        for doc_count, doc in enumerate(documents, start=1):
            tagged = nltk.pos_tag(word_tokenize(doc.content.replace(".", "")))
            reduced_sentence = " ".join([
                p[0] for p in tagged
                if p[1] in preprocess.ADJ or p[1] in preprocess.ADV
                or p[1] in preprocess.NOUN
            ])

            # "".isspace() is False, so an isspace() test lets empty sentences
            # through; strip() rejects both empty and whitespace-only results.
            if reduced_sentence.strip():
                reduced_documents.append(
                    Document(doc.name, reduced_sentence, doc.sentiment,
                             doc.location))

            print("\r", end="")
            print("Reducing dimension in progress",
                  int(doc_count * 100 / total),
                  "%",
                  end="",
                  flush=True)
        print("")

        return reduced_documents

    def create_frequency_plot(self, words, top_k):
        """Plot the frequency distribution of ``words``, passing ``top_k`` to ``plot``."""
        distribution = nltk.FreqDist(words)
        distribution.plot(top_k)

    def _undersample(self, documents):

        # find the minimum number of documents in a class

        docs_by_class = []
        minclass_length = len(documents)
        for cls in self.classes:
            docs = [d for d in documents if d.sentiment == cls]
            docs_by_class.append(docs)

            if len(docs) < minclass_length:
                minclass_length = len(docs)

        # sample all classes based on the minimum number of documents
        undersampled_docs = []
        for docs in docs_by_class:
            random.shuffle(docs)
            undersampled_docs.extend(docs[:minclass_length])

        return undersampled_docs

    # preprocessing
    def prepare_documents(self):
        """Load the pickled training documents and run the preprocessing steps.

        Every file in ``<dir>/training/data`` is unpickled and its documents
        are collected. If no classes are known yet they are derived from the
        documents and saved to ``classes.pickle``. The documents are then
        undersampled, reduced by POS tag and stripped of the most common words.

        :return: the preprocessed documents.
        """
        documents = []

        data_dir = f"{self.dir}/training/data"
        for file in os.listdir(data_dir):
            # NOTE: unpickling can execute arbitrary code; only load trusted files.
            with open(f"{data_dir}/{file}", "rb") as reader:
                documents.extend(pickle.load(reader))

        if len(self.classes) == 0:
            self.classes = set([doc.sentiment for doc in documents])
            # context manager so the file is flushed and closed deterministically
            with open(f"{self.dir}/training/model/classes.pickle",
                      "wb") as writer:
                pickle.dump(self.classes, writer)

        print("Perform undersampling...")
        documents = self._undersample(documents)

        documents = self._reduce_dimension_by_postag(documents)

        documents = self._remove_most_common_words(documents)

        return documents

    def transform_into_featuresets(self, documents):
        """Turn documents into ``(featureset, sentiment)`` pairs.

        The vocabulary of all documents becomes ``self.features`` and is saved
        to ``features.pickle``. Each document is then mapped to a dict
        ``{token: True}`` paired with its sentiment.

        :param documents: objects exposing ``content`` and ``sentiment``.
        :return: list of ``(dict, sentiment)`` tuples.
        """
        self.features = set(
            [w for d in documents for w in set(word_tokenize(d.content))])
        # context manager so the file is flushed and closed deterministically
        with open(f"{self.dir}/training/model/features.pickle",
                  "wb") as writer:
            pickle.dump(self.features, writer)
        print("Features length:", len(self.features))
        featuresets = []

        print("Transforming into featuresets....")
        total = len(documents)
        for doc_count, doc in enumerate(documents, start=1):
            # self.features is a set, so each membership test is a hash lookup
            featuresets.append(({
                w: True
                for w in word_tokenize(doc.content) if w in self.features
            }, doc.sentiment))

            print("\r", end='')
            print("Preparing featureset in progress",
                  int(doc_count * 100 / total),
                  "%",
                  end='',
                  flush=True)
        print("")

        return featuresets

    def get_training_validation_set(self, featuresets, valid_ratio):
        if len(self.classes) == 0:
            classes = set([f[1] for f in featuresets])
            pickle.dump(
                self.classes,
                open(f"{self.dir}/training/model/classes.pickle", "wb"))

        trainingset = []
        validset = []

        for c in self.classes:
            subfeat = [f for f in featuresets if f[1] == c]
            random.shuffle(subfeat)

            trainct = int((1 - valid_ratio) * len(subfeat))
            trainingset.extend(subfeat[:trainct])
            validset.extend(subfeat[trainct:])

        return trainingset, validset

    def train(self, validation_ratio):
        """Build, evaluate and save the classifier selected by ``self.model``.

        Supported model keys are ``"NB"``, ``"MNB"``, ``"SVM"`` and ``"LR"``.
        Per-class and overall validation accuracy are printed and the trained
        classifier is pickled to ``<dir>/training/model/<model>_clf.pickle``.

        :param validation_ratio: fraction of each class held out for validation.
        :raises ValueError: if ``self.model`` is not a supported key.
        """
        # fail early and clearly instead of evaluating a missing classifier
        if self.model not in ("NB", "MNB", "SVM", "LR"):
            raise ValueError(f"Unsupported model: {self.model!r}")

        os.makedirs(os.path.dirname(f"{self.dir}/training/model/"),
                    exist_ok=True)

        documents = self.prepare_documents()
        featuresets = self.transform_into_featuresets(documents)
        trainset, validset = self.get_training_validation_set(
            featuresets, validation_ratio)

        print("Building classifier...")
        if self.model == "NB":
            self.classifier = nltk.NaiveBayesClassifier.train(trainset)
            self.classifier.show_most_informative_features(15)
        elif self.model == "MNB":
            self.classifier = SklearnClassifier(
                MultinomialNB()).train(trainset)
        elif self.model == "SVM":
            # sentiment() calls prob_classify, which needs probability
            # estimates; SVC only provides them with probability=True
            self.classifier = SklearnClassifier(
                SVC(probability=True)).train(trainset)
        elif self.model == "LR":
            self.classifier = SklearnClassifier(
                LogisticRegression()).train(trainset)

        print("Accuracy per class")
        for cls in self.classes:
            print(f"{cls} accuracy:", (nltk.classify.accuracy(
                self.classifier, [v for v in validset if v[1] == cls])) * 100)
        print("Classifier accuracy percent:",
              (nltk.classify.accuracy(self.classifier, validset)) * 100)
        # context manager so the file is flushed and closed deterministically
        with open(f"{self.dir}/training/model/{self.model}_clf.pickle",
                  "wb") as writer:
            pickle.dump(self.classifier, writer)

    def show_most_informative_features(self, n):
        self.classifier.show_most_informative_features(n)

    def sentiment(self, text):
        """Classify ``text`` and return the winning label with its probability.

        The text goes through the preprocessor's ``basic_preprocess``, periods
        are removed, and only tokens present in ``self.features`` are used.

        :param text: raw text to classify.
        :return: ``(label, probability)`` of the most probable class.
        """
        # basic preprocessing only: the feature vocabulary is already fixed
        normalized = self.preprocessor.basic_preprocess(text)
        tokens = word_tokenize(normalized.replace(".", ""))

        featureset = {}
        for token in tokens:
            if token in self.features:
                featureset[token] = True

        distribution = self.classifier.prob_classify(featureset)
        label = distribution.max()
        return label, distribution.prob(label)

    def classify(self, test_dir):
        """Classify every not-yet-processed CSV in ``<dir>/<test_dir>/data``.

        Trains a model first (validation ratio 0.2) when none is loaded. For
        each input CSV a result CSV with the same name is written to
        ``<dir>/<test_dir>/results``; ``review_star`` is ``"45"`` for a
        ``"pos"`` prediction and ``"20"`` otherwise. Finished file names are
        appended to ``<dir>/testing/classify_done.txt`` and skipped next time.

        :param test_dir: sub-folder of ``self.dir`` holding ``data/*.csv``.
        """
        print("Start classifying...")

        if self.classifier is None:
            self.train(0.2)
        elif hasattr(self.classifier, "show_most_informative_features"):
            # not every classifier offers this report (train() only calls it
            # for the "NB" model), so a reloaded model must be checked first
            self.classifier.show_most_informative_features(15)

        data_dir = f"{self.dir}/{test_dir}/data"
        results_dir = f"{self.dir}/{test_dir}/results"
        done_path = f"{self.dir}/testing/classify_done.txt"

        files = [os.path.basename(x) for x in glob.glob(f"{data_dir}/*.csv")]

        # read with the same encoding that is used when appending below
        done_files = set()
        if os.path.isfile(done_path):
            with open(done_path, "r", encoding="utf8") as reader:
                done_files = {line.strip() for line in reader}
        tbp_files = [f for f in files if f not in done_files]

        headers = [
            "review_page", "review_title", "review_content", "review_star",
            "reviewer_location", "review_date", "crawled_date"
        ]
        os.makedirs(results_dir, exist_ok=True)
        # the done-list lives under "testing" whatever test_dir is, so that
        # folder has to exist before the first append
        os.makedirs(os.path.dirname(done_path), exist_ok=True)

        for file in tbp_files:
            with open(f"{data_dir}/{file}", "r", encoding="utf8") as f:
                csvreader = csv.DictReader(f)

                with open(f"{results_dir}/{file}",
                          "w",
                          encoding="utf8",
                          newline="") as w:
                    csvwriter = csv.writer(w)
                    csvwriter.writerow(headers)

                    rowid = 0
                    # guard the progress percentage against a zero line count
                    rownum = max(
                        self.preprocessor.count_lines(f"{data_dir}/{file}"),
                        1)
                    for row in csvreader:
                        review_page = row["review_page"]
                        review_title = row["review_title"]
                        review_content = row["review_content"]

                        cat = self.sentiment(
                            f"{row['review_title']}. {row['review_content']}")
                        review_star = "45" if cat[0] == "pos" else "20"

                        reviewer_location = row["user_location"]
                        review_date = row["review_date"]
                        crawled_date = "00000000"

                        csvwriter.writerow([
                            review_page, review_title, review_content,
                            review_star, reviewer_location, review_date,
                            crawled_date
                        ])

                        # flush per row so partial results survive an abort
                        w.flush()

                        rowid += 1
                        print("\r", end='')
                        print("Classifying in progress",
                              int(rowid * 100 / rownum),
                              "% for",
                              file,
                              end='',
                              flush=True)

            with open(done_path, "a", encoding="utf8") as writer:
                writer.write(f"{file}\n")
Ejemplo n.º 2
0
	print "creating feature sets..."
	tweetlist = tweetTest.loadTwitterCSV('trainingandtestdata/testdata.csv')
	labeld_features = label_feats_from_tweets(tweetlist)
	#labeld_features = label_feats_from_corpus(movie_reviews)
	training_set, test_set = split_label_feats(labeld_features)

	# tweetlist = tweetTest.loadTwitterCSV('trainingandtestdata/training.1600000.processed.noemoticon.csv')
	# training_set = label_feats_from_tweets(tweetlist)
	# training_set, garbage = split_label_feats(training_set, 1.0)
	# test_set, garbage = split_label_feats(labeld_features, 1.0)

	print "training set length: %i  test set length: %i" % (len(training_set), len(test_set))
	print prettifyFeatureSet(test_set)
	print "training classifier..."
	#classifier = NaiveBayesClassifier.train(training_set)
	#classifier = MaxentClassifier.train(training_set, algorithm='iis', max_iter=99, min_lldelta=0.01)
	#classifier = MaxentClassifier.train(training_set)
	classifier = SklearnClassifier(LogisticRegression()).train(training_set)
	print "calculating accuracy..."
	print 'accuracy:', nltk.classify.util.accuracy(classifier, test_set)
	#classifier.show_most_informative_features(30)

	negfeat = bag_of_words(['the', 'plot', 'was', 'ludicrous'])
	print classifier.classify(negfeat)
	probdist =  classifier.prob_classify(negfeat)
	print "pos: ", probdist.prob('pos'), " neg: ", probdist.prob('neg')
	print classifier.labels()
	classify_tweet(classifier, "I love this movie!", True)
	classify_tweet(classifier, "!!!", True)

Ejemplo n.º 3
0
    print('BNB_classifier accuracy: ',nltk_accuracy(BNB_classifier,features_test))
    print('LGR_classifier accuracy: ',nltk_accuracy(LGR_classifier,features_test))
    print('SDGC_classifier accuracy: ',nltk_accuracy(SDGC_classifier,features_test))
    print('SVC_classifier accuracy: ',nltk_accuracy(SVC_classifier,features_test))
    print('LSVC_classifier accuracy: ',nltk_accuracy(LSVC_classifier,features_test))
    print('NuSVC_classifier accuracy: ',nltk_accuracy(NuSVC_classifier,features_test))
    
    # Test input movie reviews
    with open('text.txt','r',encoding='utf-8') as f1:
        input_reviews = sent_tokenize(f1.read())

    f1.close()

    f = open('result.txt','w',encoding='utf-8')
    f.write("Review\tPredicted sentiment\tProbability\n")
    for review in input_reviews:
        review = review.replace('\n',' ')

        f.write(review + '\t')

        # Compute the probabilities
        probabilities = LGR_classifier.prob_classify(extract_features(review.split()))

        # Pick the maximum value
        predicted_sentiment = probabilities.max()

        # Print outputs
        f.write(predicted_sentiment + '\t')
        f.write('{}'.format(round(probabilities.prob(predicted_sentiment), 2)) + '\n')
    
    f.close()
Ejemplo n.º 4
0
class RForests(text_classifier.TextClassifier):
    def __init__(self,trainDir,labelFile,numTrees=10,numJobs=1):
        """Store the configuration and create an untrained random forest.

        Labels are not loaded and training is not started here.

        trainDir  -- stored as ``self.trainingDir``
        labelFile -- stored as ``self.labelFile``
        numTrees  -- ``n_estimators`` of the random forest
        numJobs   -- ``n_jobs`` of the random forest
        """
        self.labelFile = labelFile
        self.trainingDir = trainDir
        self.labels = None
        self.all_words = None
        self.numTrees = numTrees
        self.numJobs = numJobs
        forest = RandomForestClassifier(n_estimators=self.numTrees,
                                        n_jobs=numJobs)
        self.classifier = SklearnClassifier(forest,sparse=False)
    
    def train(self):
        """Train the current classifier on the output of ``getFeatures``."""
        training_data = self.getFeatures()
        self.classifier.train(training_data)
        
    """ Determines training error"""
    def trainingError(self):
        """Return the classifier's accuracy on the output of ``getFeatures``."""
        samples = self.getFeatures()
        return nltk.classify.accuracy(self.classifier,samples)
        
    """ Make sure that the algorithm works on training data using a k fold 
        cross validation scheme """
    def kfoldCrossValidation(self,k):
        """Return the mean accuracy of a k-fold cross validation.

        The feature sets are cut into ``k`` contiguous folds (the last one
        absorbs the remainder). Each fold is held out once as test set while a
        fresh random forest is trained on all the other folds.

        k -- number of folds, between 2 and the number of feature sets
        Raises ValueError when ``k`` is outside that range.
        """
        feature_sets = self.getFeatures()
        total = len(feature_sets)
        if k < 2 or k > total:
            raise ValueError("k must be between 2 and the number of feature sets")

        # floor division keeps the slice bounds integral on Python 2 and 3
        fold_size = total // k
        accuracy_sum = 0.0
        for i in range(k):
            start = fold_size * i
            stop = total if i == k - 1 else start + fold_size
            test_set = feature_sets[start:stop]
            train_set = feature_sets[:start] + feature_sets[stop:]

            self.classifier = SklearnClassifier(RandomForestClassifier(
                                                n_estimators=self.numTrees,
                                                n_jobs=self.numJobs),sparse=False)
            # train on the remaining folds only, never on the held-out fold
            self.classifier.train(train_set)
            accuracy_sum += nltk.classify.accuracy(self.classifier,test_set)
        return accuracy_sum / k
    """ Make sure that the algorithm works on training data using a leave one out 
        cross validation scheme """
    def leave1OutCrossValidation(self):
        """Return the mean accuracy of a leave-one-out cross validation.

        Every feature set is held out once while a fresh random forest is
        trained on all the others and tested on the held-out one.

        Raises ValueError when fewer than two feature sets are available.
        """
        feature_sets = self.getFeatures()
        N = len(feature_sets)
        if N < 2:
            raise ValueError("leave-one-out needs at least two feature sets")

        accuracy_sum = 0.0
        for i in range(N):
            self.classifier = SklearnClassifier(RandomForestClassifier(
                                                n_estimators=self.numTrees,
                                                n_jobs=self.numJobs),sparse=False)
            train_set = feature_sets[:i] + feature_sets[i+1:]
            test_set = [feature_sets[i]]
            # train without the held-out sample, otherwise the score is inflated
            self.classifier.train(train_set)
            accuracy_sum += nltk.classify.accuracy(self.classifier,test_set)
        return accuracy_sum / N
            
    """ Construct a learning curve to see if there is overfitting"""
    def learningCurve(self,numTrials=4):
        """Return the mean test accuracy for growing training-set sizes.

        For every size ``k`` from 1 to ``len(feature_sets) - 2`` the feature
        sets are shuffled ``numTrials`` times; a fresh random forest is trained
        on the first ``k`` and tested on the rest. Each trial prints the two
        set sizes and its accuracy.

        numTrials -- number of shuffled trials averaged per size
        Returns the list of mean accuracies, one per size.
        """
        accuracies = []
        feature_sets = self.getFeatures()
        # range and a single-argument print() behave the same on Python 2 and 3
        for k in range(1,len(feature_sets)-1):
            total = 0
            for i in range(numTrials):
                self.classifier = SklearnClassifier(RandomForestClassifier(
                                                    n_estimators=self.numTrees,
                                                    n_jobs=self.numJobs),
                                                    sparse=False)
                random.shuffle(feature_sets)
                train_set,test_set = feature_sets[:k],feature_sets[k:]
                self.classifier.train(train_set)
                p = nltk.classify.accuracy(self.classifier,test_set)
                print("%d %d %s" % (len(train_set),len(test_set),p))
                total+=p
            accuracies.append(total/numTrials)
        return accuracies
    
    """ Train on only k features and return training labels and predicted labels """
    def testClassify(self,k):
        """Hold out ``k`` shuffled feature sets and classify them.

        A fresh random forest is trained on everything after the first ``k``
        shuffled feature sets; the first ``k`` are then classified.

        Returns ``(ref_labels, pred_labels)`` for the held-out feature sets.
        """
        feature_sets = self.getFeatures()
        random.shuffle(feature_sets)
        forest = RandomForestClassifier(n_estimators=self.numTrees)
        self.classifier = SklearnClassifier(forest,sparse=False)

        held_out,training = feature_sets[:k],feature_sets[k:]
        self.classifier.train(training)
        features,ref_labels = zip(*held_out)
        pred_labels = self.classifier.batch_classify(features)
        return ref_labels,pred_labels
    
    """ nltk confusion matrix """
    def confusionMatrix(self,ref,test):
        """Build a ``ConfusionMatrix`` from two collections of (id, label) pairs.

        Both inputs are ordered by their first element so that the labels line
        up. Sorted copies are used: the caller's sequences are left untouched
        and any iterable of pairs is accepted, not only lists.

        ref  -- iterable of (id, reference label) pairs
        test -- iterable of (id, predicted label) pairs
        """
        ref_sorted = sorted(ref,key=lambda x: x[0])
        test_sorted = sorted(test,key=lambda x: x[0])
        _,ref_labels = zip(*ref_sorted)
        _,test_labels = zip(*test_sorted)
        cm = ConfusionMatrix(ref_labels, test_labels)
        return cm

    def prob_classify(self,db,fastain):
        """Classify every record of a FASTA file and keep the probabilities.

        The protein id is the sixth ``|``-separated field of the record id.
        Its text is fetched with ``genbank.proteinQuery``; a record without
        text gets the label ``'na'`` and ``None`` as distribution.

        db      -- database handle passed to ``genbank.proteinQuery``
        fastain -- FASTA input passed to ``SeqIO.parse``
        Returns ``(proIDs, labels, pds)`` as three parallel lists.
        """
        proIDs,pds,labels = [],[],[]
        for seq_record in SeqIO.parse(fastain, "fasta"):
            proteinID = seq_record.id.split("|")[5]
            query_rows = genbank.proteinQuery(proteinID,db)
            # join the text column directly: zip(*rows) fails on an empty result
            text = ''.join(str(row[1]) for row in query_rows)
            if text=='':
                label = 'na'
                pd = None
            else:
                featureset = self.gene_features(word_reg.findall(text))
                # classify/prob_classify take ONE featureset; the batch
                # variants are the ones that expect a list of them
                label = self.classifier.classify(featureset)
                pd = self.classifier.prob_classify(featureset)

            proIDs.append(proteinID)
            pds.append(pd)
            labels.append(label)
        return proIDs,labels,pds

    def classifyPickle(self,pickle,fastain):
        """Classify every record of a FASTA file using a pickled GenBank table.

        The locus tag is the sixth ``|``-separated field of the record id. A
        locus without text gets the label ``'na'``.

        pickle  -- path/handle given to ``GenBankTable.load`` (the name shadows
                   the ``pickle`` module inside this method; kept for callers)
        fastain -- FASTA input passed to ``SeqIO.parse``
        Returns a list of ``(locus_tag, label)`` pairs.
        """
        gbkTable = genbank.GenBankTable()
        gbkTable.load(pickle)

        proIDs,labels = [],[]
        for seq_record in SeqIO.parse(fastain, "fasta"):
            locus_tag = seq_record.id.split("|")[5]
            text = gbkTable.getLocusText(locus_tag)
            if text=='':
                label = 'na'
            else:
                featureset = self.gene_features(word_reg.findall(text))
                label = self.classifier.classify(featureset)
            proIDs.append(locus_tag)
            labels.append(label)
        # list() so the result is a real list on Python 3 as it is on Python 2
        return list(zip(proIDs,labels))
        
    """ Classifies proteins based on its text from sqlite3 database"""
    def classifyDB(self,db,fastain):
        """Classify every record of a FASTA file using the sqlite3 database.

        The locus tag is the sixth ``|``-separated field of the record id. All
        proteins of the locus are looked up and their text is concatenated; a
        locus without any text gets the label ``'na'``.

        db      -- database handle passed to the ``genbank_sqlite3`` queries
        fastain -- FASTA input passed to ``SeqIO.parse``
        Returns a list of ``(locus_tag, label)`` pairs.
        """
        proIDs,labels = [],[]
        for seq_record in SeqIO.parse(fastain, "fasta"):
            locus_tag = seq_record.id.split("|")[5]

            protein_rows = []
            for locus,proteinID in genbank_sqlite3.locusQuery(locus_tag,db):
                protein_rows+=genbank_sqlite3.proteinQuery(proteinID,db)

            # an empty row list yields '' here, which covers both 'na' cases
            text = ''
            if len(protein_rows)!=0:
                ids,text = zip(*protein_rows)
                text = ''.join(map(str,text))

            if text=='':
                label = 'na'
            else:
                featureset = self.gene_features(word_reg.findall(text))
                label = self.classifier.classify(featureset)
            proIDs.append(locus_tag)
            labels.append(label)
        # list() so the result is a real list on Python 3 as it is on Python 2
        return list(zip(proIDs,labels))

    def classify(self,dbin,fastain,type='sqlite3'):
        """Dispatch to ``classifyDB`` for ``'sqlite3'``, else to ``classifyPickle``.

        dbin    -- forwarded as first argument to the chosen method
        fastain -- forwarded as second argument to the chosen method
        type    -- backend selector; anything but ``'sqlite3'`` means pickle
        """
        backend = self.classifyDB if type=='sqlite3' else self.classifyPickle
        return backend(dbin,fastain)
Ejemplo n.º 5
0
def main():
    """Train one classifier per test file and append its answers to disk.

    For every ``*data`` file under ``args.testset`` the WSD problems are
    extracted, training data for the file's word is loaded, stale output files
    are removed, and for each problem the best answer and the ``topfive``
    answers are appended to ``<outputdir>/<word>.es.best`` and
    ``<outputdir>/<word>.es.oof``. Processing of a file stops when the word has
    no training samples or only a single label.
    """
    args = get_argparser().parse_args()

    util.DPRINT = args.dprint
    featureset_name = os.path.basename(args.featurefn).split('.')[0]
    features.load_featurefile(args.featurefn)

    ## default is 1e-4.
    THETOL = 1e-3
    classifier_pairs = [("MFS", learn.MFSClassifier())]

    classifier = SklearnClassifier(
        LogisticRegression(C=1, penalty='l2', tol=THETOL))
    classifier_pairs.append(("maxent-l2-c1", classifier))
    stamp = util.timestamp()

    for fn in glob(args.testset + "/*data"):
        problems = semeval_testset.extract_wsd_problems(fn)

        # the word of the first problem selects the training data of the file
        w = problems[0][0]
        assert w.endswith(".n")
        w = w[:-2]
        load_training_for_word(w, args.bitextfn, args.alignfn, args.annotatedfn)

        # start from clean output files, results are appended further down
        bestoutfn = args.outputdir + "/{0}.{1}.best".format(w, "es")
        oofoutfn = args.outputdir + "/{0}.{1}.oof".format(w, "es")
        for stale in (bestoutfn, oofoutfn):
            if os.path.exists(stale):
                os.remove(stale)

        training = None

        for problem in problems:
            w = problem[0]
            assert w.endswith(".n")
            w = w[:-2]
            print(problem)

            # train lazily, once per file, on the first problem
            if training is None:
                training = trainingdata.trainingdata_for(w, nonnull=True)
                print("got {0} instances for {1}".format(len(training), w))
                labels = set(label for (feat,label) in training)
                if len(training) == 0:
                    print("no samples for", w)
                    break
                if len(labels) < 2:
                    print("there's only one sense for", w, " and it is ",
                          labels)
                    break
                classifier.train(training)

            rawtext = problem[2]
            surface, index = semeval_testset.head_surface_and_index(rawtext)
            replaced = re.sub(r"<head>(.*)</head>", " \\1 ", rawtext)
            annotated = preprocessing.preprocess(replaced, "en")
            sentence = [token.lemma for token in annotated]

            focus_index = find_head_token_index(annotated, surface, index)
            feats = features.extract_untagged(sentence, annotated, focus_index)

            bestoutfn = args.outputdir + "/{0}.{1}.best".format(w, "es")
            oofoutfn = args.outputdir + "/{0}.{1}.oof".format(w, "es")
            with open(bestoutfn, "a") as bestoutfile, \
                 open(oofoutfn, "a") as oofoutfile:

                answer = classifier.classify(feats)
                print(answer)
                oof_answers = topfive(classifier.prob_classify(feats))
                print(output_one_best(problem, "es", answer), file=bestoutfile)
                print(output_five_best(problem, "es", oof_answers),
                      file=oofoutfile)