import os

import preprocess as pre


def process_by_date_range(startDate, endDate):
    # Extract the raw records for the requested window; defined elsewhere in this module.
    filter_by_date_range(startDate, endDate)
    user_dict = './user_dic.txt'  # reserved for a custom dictionary, currently unused

    # TODO: Customize pre-processing pipeline
    pipeline = pre.Pipeline(pre.splitter.NLTK(),
                            pre.tokenizer.WordPos(),
                            pre.lemmatizer.WordNet(),
                            pre.helper.POSFilter('N*|J*|R*|V*'),
                            pre.helper.SelectWordOnly(),
                            pre.helper.StopwordFilter(file='./stopwordsEng.txt'),
                            pre.ngram.NGramTokenizer(1, 2),
                            pre.counter.WordCounter())

    filePath1 = ("./data/date_from" + startDate.strftime("%Y%m%d")
                 + "to" + endDate.strftime("%Y%m%d") + "_data.txt")

    # Check that the input file exists before trying to load it as a corpus.
    if os.path.exists(filePath1):
        # os.remove(filePath1)
        print(filePath1, "is now being processed!\n")
    else:
        print("File does not exist!")
        return ''

    corpus = pre.CorpusFromFieldDelimitedFile(filePath1, 0)
    result = pipeline.processCorpus(corpus)
    print(result)
    print()

    # WordCounter yields (term, count) pairs per sentence. Accumulate global
    # counts and rebuild a pseudo-document in which each term is repeated
    # `count` times, for downstream bag-of-words tools.
    doc_collection = ''
    term_counts = {}
    for doc in result:
        for sent in doc:
            for _str in sent:
                count = int(_str[1])
                term_counts[_str[0]] = term_counts.get(_str[0], 0) + count
                doc_collection += ' ' + ' '.join([_str[0]] * count)

    # Sort terms by descending frequency.
    word_freq = sorted(((value, key) for key, value in term_counts.items()),
                       reverse=True)
    print(word_freq)

    filePath2 = ("./result/date_from" + startDate.strftime("%Y%m%d")
                 + "to" + endDate.strftime("%Y%m%d") + "_result.txt")
    with open(filePath2, "w", encoding='utf8') as f:
        for count, term in word_freq:
            f.write(term + '\t' + str(count) + '\n')

    return doc_collection
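
# Minimal usage sketch (an assumption, not in the original source): drive the
# function with datetime.date objects for the window boundaries. The dates
# below are hypothetical and must match an existing ./data/ file produced by
# filter_by_date_range().
if __name__ == '__main__':
    from datetime import date

    collection = process_by_date_range(date(2020, 1, 1), date(2020, 3, 31))
    # `collection` is one space-separated string with each term repeated by its
    # frequency, suitable as input to e.g. a word cloud or CountVectorizer.
    print(collection[:200])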
# testCooccurrence.py modified
# -*- encoding: utf-8 -*-
import preprocess as pre
# The imports below serve the co-occurrence/graph steps of this script.
import networkx as nx
from matplotlib import pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib as mpl

if __name__ == '__main__':
    corpus = pre.CorpusFromFieldDelimitedFile('../data/ALLre_date_content.txt', 1)
    pipeline = pre.Pipeline(pre.splitter.NLTK(),
                            pre.tokenizer.WordPos(),
                            pre.lemmatizer.WordNet(),
                            pre.helper.POSFilter('N*', 'V*', 'J*'),
                            pre.helper.SelectWordOnly(),
                            pre.helper.StopwordFilter(file='../stopwordsEng.txt'))
                            # pre.ngram.NGramTokenizer(1, 2))
    result = pipeline.processCorpus(corpus)
    print('== Preprocessing complete ==')
    print(result)
    print()

    # processCorpus returns a nested list (documents -> sentences -> tokens),
    # so serialize it instead of passing the list to write(), which raises a
    # TypeError; here each sentence is written as one space-joined line.
    with open('ALLre_all_AVJ_pre.txt', 'w', encoding='utf8') as file:
        for doc in result:
            for sent in doc:
                file.write(' '.join(sent) + '\n')
    print('== ==')
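
    # The imports above (CountVectorizer, networkx, matplotlib) suggest a
    # co-occurrence analysis follows. A minimal sketch under that assumption:
    # build a binary sentence-term matrix, derive term-term co-occurrence
    # counts as X^T X, and draw the strongest pairs as a graph. The names
    # `sentences` and `min_cooc` are illustrative, not from the original
    # script, and the dense matrix is only practical for modest vocabularies.
    sentences = [' '.join(sent) for doc in result for sent in doc]

    vectorizer = CountVectorizer(binary=True)
    X = vectorizer.fit_transform(sentences)   # sentence-term matrix
    cooc = (X.T @ X).toarray()                # term-term co-occurrence counts
    np.fill_diagonal(cooc, 0)                 # drop self co-occurrence
    terms = vectorizer.get_feature_names_out()

    # Keep only pairs that co-occur in at least `min_cooc` sentences.
    min_cooc = 5
    G = nx.Graph()
    rows, cols = np.where(np.triu(cooc) >= min_cooc)
    for i, j in zip(rows, cols):
        G.add_edge(terms[i], terms[j], weight=int(cooc[i, j]))

    nx.draw_networkx(G, with_labels=True, node_size=50, font_size=8)
    plt.axis('off')
    plt.show()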