Example #1
    def preprocess(self, corpus, language='ko'):
        pipeline = None

        if language == 'ko':
            mecab_path = 'C:\\mecab\\mecab-ko-dic'
            pipeline = pre.Pipeline(
                pre.splitter.NLTK(), pre.tokenizer.MeCab(mecab_path),
                pre.helper.POSFilter('NN*'), pre.helper.SelectWordOnly(),
                pre.ngram.NGramTokenizer(1, 2),
                pre.helper.StopwordFilter(file='../../stopwordsKor.txt'))
        elif language == 'en':
            pipeline = pre.Pipeline(
                pre.splitter.NLTK(), pre.tokenizer.WordPos(),
                pre.helper.POSFilter('NN*'), pre.helper.SelectWordOnly(),
                pre.ngram.NGramTokenizer(1, 2),
                pre.helper.StopwordFilter(file='../../stopwordsEng.txt'))
        result = pipeline.processCorpus(corpus)
        print('== Preprocessing complete ==')

        documents = []
        for doc in result:
            document = ''
            for sent in doc:
                # Join tokens within a sentence and keep a space between sentences.
                document += " ".join(sent) + ' '
            documents.append(document.strip())

        return documents
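
A minimal usage sketch for the preprocess method above, assuming it belongs to a preprocessing helper class; the class name TextProcessor and the input file path are hypothetical:

# Hypothetical class name and input path, shown only to illustrate the call.
processor = TextProcessor()
corpus = pre.CorpusFromFieldDelimitedFile('../../data/sample_input.txt', 1)
documents = processor.preprocess(corpus, language='ko')
print(len(documents), 'documents after preprocessing')
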
def process_by_date_range(startDate, endDate):
    filter_by_date_range(startDate, endDate)

    user_dict = './user_dic.txt'

    # TODO: Customize pre-processing pipeline
    pipeline = pre.Pipeline(pre.splitter.NLTK(),
                            pre.tokenizer.WordPos(),
                            pre.lemmatizer.WordNet(),
                            pre.helper.POSFilter('N*|J*|R*|V*'),
                            pre.helper.SelectWordOnly(),
                            pre.helper.StopwordFilter(file='./stopwordsEng.txt'),
                            pre.ngram.NGramTokenizer(1, 2),
                            pre.counter.WordCounter())

    filePath1 = "./data/date_from" + startDate.strftime("%Y%m%d") + "to" + endDate.strftime("%Y%m%d") + "_data.txt"

    # Check that the input file exists before building the corpus; otherwise
    # the corpus reader would fail on a missing file.
    if os.path.exists(filePath1):
        # os.remove(filePath1)
        print(filePath1, "is now being processed!\n")
    else:
        print("File does not exist!")
        return ''

    corpus = pre.CorpusFromFieldDelimitedFile(filePath1, 0)

    result = pipeline.processCorpus(corpus)

    print(result)
    print()

    doc_collection = ''
    term_counts = {}
    for doc in result:
        for sent in doc:
            for _str in sent:
                term, count = _str[0], int(_str[1])
                term_counts[term] = term_counts.get(term, 0) + count
                # Repeat each term 'count' times in the flat document collection.
                doc_collection += (' ' + term) * count

    word_freq = []
    for key, value in term_counts.items():
        word_freq.append((value, key))

    word_freq.sort(reverse=True)
    print(word_freq)

    filePath2 = "./result/date_from" + startDate.strftime("%Y%m%d") + "to" + endDate.strftime("%Y%m%d") + "_result.txt"

    with open(filePath2, "w", encoding='utf8') as f:
        for pair in word_freq:
            f.write(pair[1] + '\t' + str(pair[0]) + '\n')

    return doc_collection
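
A minimal usage sketch for process_by_date_range, assuming filter_by_date_range and the ./data and ./result directories exist in the surrounding project; the date values are illustrative:

from datetime import datetime

start_date = datetime(2020, 1, 1)
end_date = datetime(2020, 1, 31)
# Writes ./result/date_from20200101to20200131_result.txt and returns the flat term collection.
doc_collection = process_by_date_range(start_date, end_date)
print(len(doc_collection.split()), 'terms collected')
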
    def __init__(self, records_file, batch_size, image_shape):
        """
    Args:
      records_file: The TFRecords file to read data from.
      batch_size: The size of batches to read.
      image_shape: The shape of images to load. """
        if not accessible_path(records_file):
            # If we don't check this, TensorFlow gives us a really confusing and
            # hard-to-debug error later on.
            raise ValueError("File '%s' does not exist." % (records_file))
        if len(image_shape) != 3:
            raise ValueError("Image shape must be of length 3.")

        self._image_shape = image_shape
        self._records_file = records_file
        self._batch_size = batch_size

        # Create a default preprocessing pipeline.
        self.__pipeline = preprocess.Pipeline()
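
A minimal instantiation sketch for the constructor above; the class name TFRecordsLoader and the file path are hypothetical, and accessible_path is assumed to be a path-checking helper defined elsewhere in the project:

# Hypothetical class name and path, shown only to illustrate the constructor arguments.
loader = TFRecordsLoader('train.tfrecords', batch_size=32, image_shape=(224, 224, 3))
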
# -*- encoding:utf8 -*-
import preprocess as pre
import networkx as nx
from matplotlib import pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib as mpl

if __name__ == '__main__':

    corpus = pre.CorpusFromFieldDelimitedFile('../data/ALLre_date_content.txt',
                                              1)

    pipeline = pre.Pipeline(
        pre.splitter.NLTK(), pre.tokenizer.WordPos(), pre.lemmatizer.WordNet(),
        pre.helper.POSFilter('N*', 'V*', 'J*'), pre.helper.SelectWordOnly(),
        pre.helper.StopwordFilter(file='../stopwordsEng.txt'))
    #                            pre.ngram.NGramTokenizer(1, 2))

    result = pipeline.processCorpus(corpus)
    print('== Preprocessing complete ==')
    print(result)
    print()

    # processCorpus() returns nested lists, so convert to a string before writing.
    with open('ALLre_all_AVJ_pre.txt', 'w') as file:
        file.write(str(result))

    print('==  ==')

    documents = []
Example #5
        pickle.dump(dataset, open(filename, 'wb'))
        print('Saved: %s' % filename)

    def load_dataset(self, filename):
        # load the model from disk
        loaded_model = pickle.load(open(filename, 'rb'))
        return loaded_model


if __name__ == '__main__':

    _negative_docs = pre.CorpusFromDirectory('../txt_sentoken/neg', True)
    _positive_docs = pre.CorpusFromDirectory('../txt_sentoken/pos', True)

    pipeline = pre.Pipeline(pre.splitter.NLTK(),
                            pre.tokenizer.Word(),
                            pre.helper.StopwordFilter(file='../../stopwordsEng.txt'),
                            pre.stemmer.Porter())
    _neg_result = pipeline.processCorpus(_negative_docs)
    _pos_result = pipeline.processCorpus(_positive_docs)
    print('== Splitting Sentence + Tokenizing + Stopwords Removal + Stemming : Porter ==')
    print(_neg_result)
    print()

    negative_docs = list()
    for doc in _neg_result:
        new_doc = []
        for sent in doc:
            for _str in sent:
                if len(_str) > 0:
                    new_doc.append(_str)
        negative_docs.append(' '.join(new_doc))
def getSentimentScoreByFile(filePath, windowSize=1):
    if windowSize < 0:
        print("Wrong window size")
        exit(1)

    corpus = pre.CorpusFromFile(filePath)

    pipeline = pre.Pipeline(
        pre.splitter.NLTK(),
        # pre.tokenizer.WordPos(),
        pre.tokenizer.Word(),
        pre.helper.StopwordFilter(file='../stopwordsEng.txt'),
        pre.tagger.NLTK(),
        pre.lemmatizer.WordNet())

    result = pipeline.processCorpus(corpus)
    print(result)

    EnglishDictionarySentimentAnalyzer().createDictionary()

    final_grand_score = 0  # file level
    final_count = 0  # file level
    final_score_array = []  # file level
    for document in result:
        convertedDocument = document

        # merge sentences in each document by window size
        if windowSize > 1:
            sentences = []
            for sent in document:
                sentences.append(sent)

            if len(sentences) < windowSize:
                print("Window size is larger than the number of sentences")
                print(
                    "Window size will be set as 1 (default) for this document")
            else:
                newArray = []
                for a in range(0, len(sentences) - windowSize + 1):
                    tempArray = []
                    for b in range(0, windowSize):
                        for element in sentences[a + b]:
                            tempArray.append(element)
                    # print("tempArray: ", end="")
                    # print(tempArray)
                    newArray.append(tempArray)

                # print("newArray: ", end="")
                # print(newArray)
                convertedDocument = newArray

        grand_score, count = getSentimentScoreByDocument(convertedDocument)

        if count > 0:
            doc_avg_score = grand_score / count
            print("Average Sentiment Score: " + str(doc_avg_score))
            final_grand_score += doc_avg_score
            final_count += 1
            final_score_array.append(str(doc_avg_score))
        else:
            print("This document is empty")

    try:
        final_avg_score = final_grand_score / final_count
        return str(final_avg_score), final_score_array
    except ZeroDivisionError:
        return str(0), ["This file is empty"]
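
A minimal usage sketch for getSentimentScoreByFile; the input path is illustrative:

# Average sentiment over a file, merging sentences in windows of two.
avg_score, per_document_scores = getSentimentScoreByFile('./data/sample_reviews.txt', windowSize=2)
print('File-level average sentiment:', avg_score)
print('Per-document scores:', per_document_scores)
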
def getKeywordSentimentScoreByFile(filePath, windowSize=1):
    if windowSize < 0:
        print("Wrong window size")
        exit(1)

    corpus = pre.CorpusFromFile(filePath)

    pipeline = pre.Pipeline(
        pre.splitter.NLTK(),
        # pre.tokenizer.WordPos(),
        pre.tokenizer.Word(),
        pre.helper.StopwordFilter(file='../stopwordsEng.txt'),
        pre.tagger.NLTK(),
        pre.lemmatizer.WordNet())

    result = pipeline.processCorpus(corpus)
    #print(result)

    EnglishDictionarySentimentAnalyzer().createDictionary()

    final_grand_score = 0  # file level
    final_count = 0  # file level
    final_score_array = []  # file level
    for document in result:
        # Default to the whole document; it is overwritten below only when
        # sentences are merged and filtered by keyword for windowSize > 1.
        convertedDocument = document

        # merge sentences in each document by window size
        # filter sentences by keyword
        if windowSize > 1:
            sentences = []
            keyword = ["Korea", "Koreans", "Korean"]

            for sent in document:
                num = 0
                sentences1 = []
                while num < len(sent):
                    k = 0
                    while k < len(keyword):
                        if keyword[k] in sent[num]:
                            if document.index(sent) > windowSize:
                                sents = document[document.index(sent) -
                                                 windowSize:document.
                                                 index(sent) + windowSize]
                            else:
                                sents = document[0:document.index(sent) +
                                                 windowSize]
                            sentences1.extend(sentences)
                            sentences.extend(sents)
                            break
                        else:
                            k += 1
                            pass
                    if sentences1 != sentences:
                        break
                    else:
                        num += 1
                        pass

            #print("===sentences===")
            #print(sentences)
            convertedDocument = sentences

        grand_score, count = getSentimentScoreByDocument(convertedDocument)

        if count > 0:
            doc_avg_score = grand_score / count
            print("Average Sentiment Score: " + str(doc_avg_score))
            final_grand_score += doc_avg_score
            final_count += 1
            final_score_array.append(str(doc_avg_score))
        else:
            print("This document is empty")

    try:
        final_avg_score = final_grand_score / final_count
        return str(final_avg_score), final_score_array
    except ZeroDivisionError:
        return str(0), ["This file is empty"]
Example #8
#!/usr/bin/python3
# Author: Suzanna Sia

### Standard imports
import numpy as np
import pdb
import os
import sys
import json

import preprocess

INDEX = "coe"
pipe = preprocess.Pipeline()


def docs_to_json(target_doc_fol):
    jsonl = []
    for fil in os.listdir(target_doc_fol):
        dd = {}
        with open(os.path.join(target_doc_fol, fil), 'r') as f:
            text = f.readlines()
            fn = fil[:fil.find('.')]
            dd['_id'] = fn
            dd['doc_text'] = pipe.strip_clean(" ".join(text))
            dd['docid'] = fn
            dd['_index'] = "coe"  # refac
        jsonl.append(dd)
    return jsonl
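
A minimal sketch of writing the records returned by docs_to_json as JSON Lines; the directory and output file names are illustrative:

if __name__ == '__main__':
    records = docs_to_json('./target_docs')
    with open('docs.jsonl', 'w') as out:
        for record in records:
            out.write(json.dumps(record) + '\n')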

Example #9
    file_name = './data/emo_positive.txt'
    sentiAnalyzer.readPositiveEmotiDictionary(file_name)
    file_name = './data/polarity.csv'
    sentiAnalyzer.readPolarityDictionary(file_name)

    dict_list = sentiAnalyzer.getSentiDictionary()

    pipeline = None
    # corpus = pre.CorpusFromFieldDelimitedFile('../data/donald.txt', 2)
    mecab_path = 'C:\\mecab\\mecab-ko-dic'
    mode = 'korean_lemmatizer'
    if mode != 'korean_lemmatizer':
        pipeline = pre.Pipeline(
            pre.splitter.NLTK(),
            pre.tokenizer.MeCab(mecab_path),
            # pre.tokenizer.Komoran(),
            pre.helper.SelectWordOnly(),
            pre.ngram.NGramTokenizer(1, 2, concat=' '),
            pre.helper.StopwordFilter(file='../stopwordsKor.txt'))
    else:
        pipeline = pre.Pipeline(
            pre.splitter.NLTK(),
            pre.tokenizer.MeCab(mecab_path),
            # pre.tokenizer.Komoran(),
            pre.lemmatizer.SejongPOSLemmatizer(),
            pre.helper.SelectWordOnly(),
            # pre.ngram.NGramTokenizer(1, 2, concat=' '),
            pre.helper.StopwordFilter(file='../stopwordsKor.txt'))

    # documents = ['오늘은 비가와서 그런지 매우 우울하다',
    #              '시험이 끝나야 놀지 스트레스 받아ㅠㅠ',
    return _train_negative_docs, _train_positive_docs, _test_negative_docs, _test_positive_docs, labels


if language == 'en':
    _train_negative_docs, _train_positive_docs, _test_negative_docs, _test_positive_docs = \
        read_english_corpus()
elif language == 'ko':
    _train_negative_docs, _train_positive_docs, _test_negative_docs, _test_positive_docs, labels = \
        read_korean_corpus()
if language == 'ko':
    mecab_path = 'C:\\mecab\\mecab-ko-dic'
    pipeline = pre.Pipeline(pre.splitter.NLTK(),
                            pre.tokenizer.MeCab(mecab_path),
                            pre.helper.POSFilter('NN*'),
                            pre.helper.SelectWordOnly(),
                            pre.ngram.NGramTokenizer(1, 2),
                            pre.helper.StopwordFilter(file='../../stopwordsKor.txt')
                            )
elif language == 'en':
    pipeline = pre.Pipeline(pre.splitter.NLTK(),
                            pre.tokenizer.WordPos(),
                            pre.helper.POSFilter('NN*|A*|V*|J*'),
                            pre.helper.SelectWordOnly(),
                            # pre.ngram.NGramTokenizer(1, 2),
                            pre.helper.StopwordFilter(file='../../stopwordsEng.txt')
                            )


def make_documents(result):
    docs = []