def preprocess(self, corpus, language='ko'):
    pipeline = None
    if language == 'ko':
        mecab_path = 'C:\\mecab\\mecab-ko-dic'
        pipeline = pre.Pipeline(
            pre.splitter.NLTK(),
            pre.tokenizer.MeCab(mecab_path),
            pre.helper.POSFilter('NN*'),
            pre.helper.SelectWordOnly(),
            pre.ngram.NGramTokenizer(1, 2),
            pre.helper.StopwordFilter(file='../../stopwordsKor.txt'))
    elif language == 'en':
        pipeline = pre.Pipeline(
            pre.splitter.NLTK(),
            pre.tokenizer.WordPos(),
            pre.helper.POSFilter('NN*'),
            pre.helper.SelectWordOnly(),
            pre.ngram.NGramTokenizer(1, 2),
            pre.helper.StopwordFilter(file='../../stopwordsEng.txt'))

    result = pipeline.processCorpus(corpus)
    print('== Preprocessing complete ==')

    # Flatten each processed document back into a single space-joined string.
    documents = []
    for doc in result:
        document = ''
        for sent in doc:
            document += " ".join(sent) + " "
        documents.append(document.strip())
    return documents
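# --- Usage sketch (illustration, not part of the original method) ---
# The strings returned by preprocess() are plain space-joined documents, so
# they can feed a bag-of-words model directly. A minimal sketch with dummy
# strings standing in for the real return value; scikit-learn's CountVectorizer
# is the same vectorizer imported elsewhere in this repository.
from sklearn.feature_extraction.text import CountVectorizer

example_documents = ['economy growth market', 'market policy trade policy']  # stand-ins for preprocess() output
example_vectorizer = CountVectorizer()
example_matrix = example_vectorizer.fit_transform(example_documents)
print(example_matrix.shape)                        # (num documents, vocabulary size)
print(example_vectorizer.get_feature_names_out())  # the extracted vocabulary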
def process_by_date_range(startDate, endDate):
    filter_by_date_range(startDate, endDate)
    user_dict = './user_dic.txt'

    # TODO: Customize pre-processing pipeline
    pipeline = pre.Pipeline(pre.splitter.NLTK(),
                            pre.tokenizer.WordPos(),
                            pre.lemmatizer.WordNet(),
                            pre.helper.POSFilter('N*|J*|R*|V*'),
                            pre.helper.SelectWordOnly(),
                            pre.helper.StopwordFilter(file='./stopwordsEng.txt'),
                            pre.ngram.NGramTokenizer(1, 2),
                            pre.counter.WordCounter())

    filePath1 = ("./data/date_from" + startDate.strftime("%Y%m%d")
                 + "to" + endDate.strftime("%Y%m%d") + "_data.txt")
    if os.path.exists(filePath1):
        # os.remove(filePath1)
        print(filePath1, "is now being processed!\n")
    else:
        print("File does not exist!")

    corpus = pre.CorpusFromFieldDelimitedFile(filePath1, 0)
    result = pipeline.processCorpus(corpus)
    print(result)
    print()

    # Accumulate term frequencies and build a frequency-weighted pseudo-document.
    doc_collection = ''
    term_counts = {}
    for doc in result:
        for sent in doc:
            for _str in sent:
                term_counts[_str[0]] = term_counts.get(_str[0], 0) + int(_str[1])
                co = ''
                for n in range(int(_str[1])):
                    co += ' ' + _str[0]
                doc_collection += ' ' + co

    # Sort terms by descending frequency and write them to the result file.
    word_freq = []
    for key, value in term_counts.items():
        word_freq.append((value, key))
    word_freq.sort(reverse=True)
    print(word_freq)

    filePath2 = ("./result/date_from" + startDate.strftime("%Y%m%d")
                 + "to" + endDate.strftime("%Y%m%d") + "_result.txt")
    f = open(filePath2, "w", encoding='utf8')
    for pair in word_freq:
        f.write(pair[1] + '\t' + str(pair[0]) + '\n')
    f.close()

    return doc_collection
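# --- Usage sketch (illustration, not part of the original script) ---
# A hedged example of driving process_by_date_range(), assuming that
# filter_by_date_range() (defined elsewhere in this module) has produced the
# "./data/date_from...to..._data.txt" file the function reads. The date range
# below is purely illustrative.
from datetime import datetime

start_date = datetime(2020, 1, 1)
end_date = datetime(2020, 1, 31)
collection = process_by_date_range(start_date, end_date)
print(len(collection.split()), 'frequency-weighted tokens collected')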
def __init__(self, records_file, batch_size, image_shape):
    """
    Args:
        records_file: The TFRecords file to read data from.
        batch_size: The size of batches to read.
        image_shape: The shape of images to load.
    """
    if not accessible_path(records_file):
        # If we don't check this, TensorFlow gives us a really confusing and
        # hard-to-debug error later on.
        raise ValueError("File '%s' does not exist." % (records_file))
    if len(image_shape) != 3:
        raise ValueError("Image shape must be of length 3.")

    self._image_shape = image_shape
    self._records_file = records_file
    self._batch_size = batch_size

    # Create a default preprocessing pipeline.
    self.__pipeline = preprocess.Pipeline()
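# --- Helper sketch (assumption, not shown in the original) ---
# accessible_path() is called above but not defined in this snippet. A minimal
# guess at its intent, assuming it only needs to confirm the TFRecords file
# exists and is readable before TensorFlow ever opens it:
import os

def accessible_path(path):
    """Return True if `path` exists and the current user can read it."""
    return os.path.isfile(path) and os.access(path, os.R_OK)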
# -*- encoding:utf8 -*-

import preprocess as pre
import networkx as nx
from matplotlib import pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib as mpl

if __name__ == '__main__':
    corpus = pre.CorpusFromFieldDelimitedFile('../data/ALLre_date_content.txt', 1)

    pipeline = pre.Pipeline(
        pre.splitter.NLTK(),
        pre.tokenizer.WordPos(),
        pre.lemmatizer.WordNet(),
        pre.helper.POSFilter('N*', 'V*', 'J*'),
        pre.helper.SelectWordOnly(),
        pre.helper.StopwordFilter(file='../stopwordsEng.txt'))
        # pre.ngram.NGramTokenizer(1, 2))

    result = pipeline.processCorpus(corpus)
    print('== Preprocessing complete ==')
    print(result)
    print()

    # processCorpus() returns nested lists, so serialize before writing to disk.
    file = open('ALLre_all_AVJ_pre.txt', 'w', encoding='utf8')
    file.write(str(result))
    file.close()

    print('== ==')
    documents = []
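    # --- Continuation sketch (assumption, not part of the original script) ---
    # The script imports networkx, CountVectorizer and matplotlib but the
    # listing is cut off after `documents = []`. A hedged guess at the intended
    # next step: flatten the pipeline output into strings, build a term
    # co-occurrence matrix, and draw it as a graph. Names below are illustrative.
    for doc in result:
        documents.append(' '.join(' '.join(sent) for sent in doc))

    vectorizer = CountVectorizer(max_features=200)
    X = vectorizer.fit_transform(documents)    # document-term counts
    cooc = (X.T @ X).toarray()                 # term-term co-occurrence counts
    terms = vectorizer.get_feature_names_out()

    graph = nx.Graph()
    for i in range(len(terms)):
        for j in range(i + 1, len(terms)):
            if cooc[i, j] > 0:
                graph.add_edge(terms[i], terms[j], weight=int(cooc[i, j]))

    nx.draw_networkx(graph, node_size=50, font_size=8)
    plt.show()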
    dump(dataset, open(filename, 'wb'))
    print('Saved: %s' % filename)

def load_dataset(self, filename):
    # load the model from disk
    loaded_model = pickle.load(open(filename, 'rb'))
    return loaded_model

if __name__ == '__main__':
    _negative_docs = pre.CorpusFromDirectory('../txt_sentoken/neg', True)
    _positive_docs = pre.CorpusFromDirectory('../txt_sentoken/pos', True)

    pipeline = pre.Pipeline(pre.splitter.NLTK(),
                            pre.tokenizer.Word(),
                            pre.helper.StopwordFilter(file='../../stopwordsEng.txt'),
                            pre.stemmer.Porter())

    _neg_result = pipeline.processCorpus(_negative_docs)
    _pos_result = pipeline.processCorpus(_positive_docs)
    print('== Splitting Sentence + Tokenizing + Stopwords Removal + Stemming : Porter ==')
    print(_neg_result)
    print()

    negative_docs = list()
    for doc in _neg_result:
        new_doc = []
        for sent in doc:
            for _str in sent:
                if len(_str) > 0:
                    new_doc.append(_str)
        negative_docs.append(' '.join(new_doc))
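    # --- Continuation sketch (assumption, not part of the original script) ---
    # The listing stops after rebuilding the negative documents. The positive
    # side presumably mirrors it; a hedged sketch of that step:
    positive_docs = list()
    for doc in _pos_result:
        new_doc = []
        for sent in doc:
            for _str in sent:
                if len(_str) > 0:
                    new_doc.append(_str)
        positive_docs.append(' '.join(new_doc))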
def getSentimentScoreByFile(filePath, windowSize=1):
    if windowSize < 0:
        print("Wrong window size")
        exit(1)

    corpus = pre.CorpusFromFile(filePath)
    pipeline = pre.Pipeline(
        pre.splitter.NLTK(),
        # pre.tokenizer.WordPos(),
        pre.tokenizer.Word(),
        pre.helper.StopwordFilter(file='../stopwordsEng.txt'),
        pre.tagger.NLTK(),
        pre.lemmatizer.WordNet())

    result = pipeline.processCorpus(corpus)
    print(result)

    EnglishDictionarySentimentAnalyzer().createDictionary()

    final_grand_score = 0   # file level
    final_count = 0         # file level
    final_score_array = []  # file level

    for document in result:
        convertedDocument = document

        # merge sentences in each document by window size
        if windowSize > 1:
            sentences = []
            for sent in document:
                sentences.append(sent)

            if len(sentences) < windowSize:
                print("Window size is larger than the number of sentences")
                print("Window size will be set as 1 (default) for this document")
            else:
                newArray = []
                for a in range(0, len(sentences) - windowSize + 1):
                    tempArray = []
                    for b in range(0, windowSize):
                        for element in sentences[a + b]:
                            tempArray.append(element)
                    # print("tempArray: ", end="")
                    # print(tempArray)
                    newArray.append(tempArray)
                # print("newArray: ", end="")
                # print(newArray)
                convertedDocument = newArray

        grand_score, count = getSentimentScoreByDocument(convertedDocument)
        if count > 0:
            doc_avg_score = grand_score / count
            print("Average Sentiment Score: " + str(doc_avg_score))
            final_grand_score += doc_avg_score
            final_count += 1
            final_score_array.append(str(doc_avg_score))
        else:
            print("This document is empty")

    try:
        final_avg_score = final_grand_score / final_count
        return str(final_avg_score), final_score_array
    except ZeroDivisionError:
        return str(0), ["This file is empty"]
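# --- Usage sketch (illustration, not part of the original script) ---
# A hedged example of calling the file-level scorer with a sliding window of
# two sentences; the input path is purely illustrative.
avg_score, per_doc_scores = getSentimentScoreByFile('./data/sample_reviews.txt', windowSize=2)
print('File-level average sentiment:', avg_score)
print('Per-document scores:', per_doc_scores)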
def getKeywordSentimentScoreByFile(filePath, windowSize=1):
    if windowSize < 0:
        print("Wrong window size")
        exit(1)

    corpus = pre.CorpusFromFile(filePath)
    pipeline = pre.Pipeline(
        pre.splitter.NLTK(),
        # pre.tokenizer.WordPos(),
        pre.tokenizer.Word(),
        pre.helper.StopwordFilter(file='../stopwordsEng.txt'),
        pre.tagger.NLTK(),
        pre.lemmatizer.WordNet())

    result = pipeline.processCorpus(corpus)
    # print(result)

    EnglishDictionarySentimentAnalyzer().createDictionary()

    final_grand_score = 0   # file level
    final_count = 0         # file level
    final_score_array = []  # file level

    for document in result:
        convertedDocument = document

        # merge sentences in each document by window size
        # filter sentences by keyword
        if windowSize > 1:
            sentences = []
            keyword = ["Korea", "Koreans", "Korean"]
            for sent in document:
                num = 0
                sentences1 = []
                while num < len(sent):
                    k = 0
                    while k < len(keyword):
                        if keyword[k] in sent[num]:
                            # keep a window of sentences around the keyword match
                            if document.index(sent) > windowSize:
                                sents = document[document.index(sent) - windowSize:
                                                 document.index(sent) + windowSize]
                            else:
                                sents = document[0:document.index(sent) + windowSize]
                            sentences1.extend(sentences)
                            sentences.extend(sents)
                            break
                        else:
                            k += 1
                    if sentences1 != sentences:
                        # a window was added for this sentence; move to the next one
                        break
                    else:
                        num += 1

            # print("===sentences===")
            # print(sentences)
            convertedDocument = sentences

        grand_score, count = getSentimentScoreByDocument(convertedDocument)
        if count > 0:
            doc_avg_score = grand_score / count
            print("Average Sentiment Score: " + str(doc_avg_score))
            final_grand_score += doc_avg_score
            final_count += 1
            final_score_array.append(str(doc_avg_score))
        else:
            print("This document is empty")

    try:
        final_avg_score = final_grand_score / final_count
        return str(final_avg_score), final_score_array
    except ZeroDivisionError:
        return str(0), ["This file is empty"]
#!/usr/bin/python3
# Author: Suzanna Sia

### Standard imports
import numpy as np
import pdb
import os
import sys
import json

import preprocess

INDEX = "coe"
pipe = preprocess.Pipeline()

def docs_to_json(target_doc_fol):
    jsonl = []
    for fil in os.listdir(target_doc_fol):
        dd = {}
        with open(os.path.join(target_doc_fol, fil), 'r') as f:
            text = f.readlines()

        fn = fil[:fil.find('.')]
        dd['_id'] = fn
        dd['doc_text'] = pipe.strip_clean(" ".join(text))
        dd['docid'] = fn
        dd['_index'] = "coe"  # refac
        jsonl.append(dd)

    return jsonl
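# --- Usage sketch (illustration, not part of the original script) ---
# A hedged example of serializing the records to a JSON Lines file, e.g. for
# bulk loading into a search index; the folder and output paths are illustrative.
if __name__ == '__main__':
    records = docs_to_json('./target_docs')
    with open('./target_docs.jsonl', 'w') as out:
        for record in records:
            out.write(json.dumps(record) + '\n')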
file_name = './data/emo_positive.txt'
sentiAnalyzer.readPositiveEmotiDictionary(file_name)
file_name = './data/polarity.csv'
sentiAnalyzer.readPolarityDictionary(file_name)

dict_list = sentiAnalyzer.getSentiDictionary()

pipeline = None
# corpus = pre.CorpusFromFieldDelimitedFile('../data/donald.txt', 2)
mecab_path = 'C:\\mecab\\mecab-ko-dic'
mode = 'korean_lemmatizer'

if mode != 'korean_lemmatizer':
    pipeline = pre.Pipeline(
        pre.splitter.NLTK(),
        pre.tokenizer.MeCab(mecab_path),
        # pre.tokenizer.Komoran(),
        pre.helper.SelectWordOnly(),
        pre.ngram.NGramTokenizer(1, 2, concat=' '),
        pre.helper.StopwordFilter(file='../stopwordsKor.txt'))
else:
    pipeline = pre.Pipeline(
        pre.splitter.NLTK(),
        pre.tokenizer.MeCab(mecab_path),
        # pre.tokenizer.Komoran(),
        pre.lemmatizer.SejongPOSLemmatizer(),
        pre.helper.SelectWordOnly(),
        # pre.ngram.NGramTokenizer(1, 2, concat=' '),
        pre.helper.StopwordFilter(file='../stopwordsKor.txt'))

# documents = ['오늘은 비가와서 그런지 매우 우울하다',
#              '시험이 끝나야 놀지 스트레스 받아ㅠㅠ',
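# --- Usage sketch (illustration, not part of the original script) ---
# A hedged example of pushing a couple of raw Korean sentences through the
# selected pipeline, assuming processCorpus() accepts a plain list of document
# strings as the commented-out `documents` list above suggests. The sentiment
# scoring step that would consult dict_list is omitted because that API is not
# shown in this listing.
sample_documents = ['오늘은 비가와서 그런지 매우 우울하다',
                    '시험이 끝나야 놀지 스트레스 받아ㅠㅠ']
tokenized = pipeline.processCorpus(sample_documents)
print(tokenized)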
    return _train_negative_docs, _train_positive_docs, _test_negative_docs, _test_positive_docs, labels

if language == 'en':
    _train_negative_docs, _train_positive_docs, _test_negative_docs, _test_positive_docs \
        = read_english_corpus()
elif language == 'ko':
    _train_negative_docs, _train_positive_docs, _test_negative_docs, _test_positive_docs, labels \
        = read_korean_corpus()

if language == 'ko':
    mecab_path = 'C:\\mecab\\mecab-ko-dic'
    pipeline = pre.Pipeline(pre.splitter.NLTK(),
                            pre.tokenizer.MeCab(mecab_path),
                            pre.helper.POSFilter('NN*'),
                            pre.helper.SelectWordOnly(),
                            pre.ngram.NGramTokenizer(1, 2),
                            pre.helper.StopwordFilter(file='../../stopwordsKor.txt'))
elif language == 'en':
    pipeline = pre.Pipeline(pre.splitter.NLTK(),
                            pre.tokenizer.WordPos(),
                            pre.helper.POSFilter('NN*|A*|V*|J*'),
                            pre.helper.SelectWordOnly(),
                            # pre.ngram.NGramTokenizer(1, 2),
                            pre.helper.StopwordFilter(file='../../stopwordsEng.txt'))

def make_documents(result):
    docs = []
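    # --- Continuation sketch (assumption, not part of the original script) ---
    # make_documents() is cut off in the original listing. Based on the same
    # flattening pattern used elsewhere in this repository, a hedged guess at
    # its body: join the tokens of every sentence into one string per document.
    for doc in result:
        document = ''
        for sent in doc:
            document += ' '.join(sent) + ' '
        docs.append(document.strip())
    return docs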