import pyTextMiner as ptm

_stopwords = []
with open("./stopwords/stopwordsKor.txt", encoding='utf-8') as file:
    for line in file:
        line = line.strip()  # or some other preprocessing
        _stopwords.append(line)  # storing everything in memory!

path = 'C:\\mecab\\mecab-ko-dic'

# pos_tagger_name - either komoran, okt, nltk
# lang - either ko or en
pipeline = ptm.Pipeline(
    ptm.keyword.TextRankExtractor(pos_tagger_name='mecab',
                                  mecab_path=path,
                                  max=5,
                                  lang='ko',
                                  stopwords=_stopwords,
                                  combined_keywords=True))

corpus = ptm.CorpusFromFile('./data/sampleKor.txt')
result = pipeline.processCorpus(corpus)

print('== Keyword Extraction by TextRank ==')
print(result)
print()

from sklearn.datasets import fetch_20newsgroups

ng20 = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
print('First 20newsgroups document: ' + str(ng20.data[0]))
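# A minimal sketch of running the same TextRank extractor over the English 20newsgroups
# text loaded above, following the pos_tagger_name/lang options listed in the comment.
# It assumes ptm.Corpus can wrap a plain list of raw strings (as it does elsewhere in
# this document) and that mecab_path is simply ignored by the English tagger; the
# English stopword file is the one used by the other examples.
eng_stopwords = []
with open("./stopwords/stopwordsEng.txt", encoding='utf-8') as file:
    for line in file:
        eng_stopwords.append(line.strip())

eng_pipeline = ptm.Pipeline(
    ptm.keyword.TextRankExtractor(pos_tagger_name='nltk',
                                  mecab_path=path,
                                  max=5,
                                  lang='en',
                                  stopwords=eng_stopwords,
                                  combined_keywords=True))

eng_corpus = ptm.Corpus(ng20.data[:10])  # keep the sketch small: first ten documents only
eng_result = eng_pipeline.processCorpus(eng_corpus)
print('== English Keyword Extraction by TextRank ==')
print(eng_result)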
def preprocessing(self, mode, path, stopword_file, files, is_directory=False, doc_index=-1, max=-1):
    util = ptm.Utility()
    # mode is either unfiltered, filtered, jamo_split_unfiltered, jamo_split_filtered, or simple
    corpus = []

    if mode == 'unfiltered':
        # path = '/usr/local/lib/mecab/dic/mecab-ko-dic'
        pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(),
                                ptm.tokenizer.MeCab(path),
                                ptm.lemmatizer.SejongPOSLemmatizer(),
                                ptm.helper.SelectWordOnly(),
                                ptm.helper.StopwordFilter(file=stopword_file))

        for a_file in files:
            if is_directory and max == -1:
                corpus += ptm.CorpusFromDirectory(a_file).docs
            elif not is_directory and doc_index != -1 and max == -1:
                corpus += ptm.CorpusFromFieldDelimitedFile(a_file, doc_index).docs
            elif not is_directory and doc_index == -1 and max == -1:
                corpus += ptm.CorpusFromFile(a_file).docs
            elif not is_directory and max > 0:
                count = 0
                docs = []
                for line in open(a_file):
                    if doc_index != -1:
                        line = line.split()[doc_index]
                    if len(line) < 1:
                        continue
                    toks = line.split()
                    if len(toks) > 10:
                        docs.append(line)
                        count += 1
                        if count % 10000 == 0:
                            print('processing... ' + str(count))
                    if max < count:
                        break
                corpus = ptm.Corpus(docs)

        if (type(corpus) != list and len(corpus.docs) > 0) or (type(corpus) == list and len(corpus) > 0):
            result = pipeline.processCorpus(corpus)
            for doc in result:
                document = []
                for sent in doc:
                    for word in sent:
                        document.append(word)
                self.documents.append(document)

    elif mode == 'filtered':
        pipeline = ptm.Pipeline(ptm.tokenizer.Word())
        # corpus = ptm.CorpusFromFile('/Data/ko_sns_comments/naver_comments15_16_filtered.txt')
        for a_file in files:
            if is_directory and max == -1:
                corpus += ptm.CorpusFromDirectory(a_file).docs
            elif not is_directory and doc_index != -1 and max == -1:
                corpus += ptm.CorpusFromFieldDelimitedFile(a_file, doc_index).docs
            elif not is_directory and doc_index == -1 and max == -1:
                corpus += ptm.CorpusFromFile(a_file).docs
            elif not is_directory and max > 0:
                count = 0
                docs = []
                for line in open(a_file):
                    if doc_index != -1:
                        line = line.split()[doc_index]
                    if len(line) < 1:
                        continue
                    toks = line.split()
                    if len(toks) > 10:
                        docs.append(line)
                        count += 1
                        if count % 10000 == 0:
                            print('processing... ' + str(count))
                    if max < count:
                        break
                corpus = ptm.Corpus(docs)

        self.documents = pipeline.processCorpus(corpus)

    elif mode == 'jamo_split_unfiltered':
        # documents = LineSentence(datapath('/Data/ko_sns_comments/naver_comments15_16_filtered.txt'))
        pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(),
                                ptm.tokenizer.MeCab(path),
                                ptm.lemmatizer.SejongPOSLemmatizer(),
                                ptm.helper.SelectWordOnly(),
                                ptm.helper.StopwordFilter(file=stopword_file))

        for a_file in files:
            if is_directory and max == -1:
                corpus += ptm.CorpusFromDirectory(a_file).docs
            elif not is_directory and doc_index != -1 and max == -1:
                corpus += ptm.CorpusFromFieldDelimitedFile(a_file, doc_index).docs
            elif not is_directory and doc_index == -1 and max == -1:
                corpus += ptm.CorpusFromFile(a_file).docs
            elif not is_directory and max > 0:
                count = 0
                docs = []
                for line in open(a_file):
                    if doc_index != -1:
                        line = line.split()[doc_index]
                    if len(line) < 1:
                        continue
                    toks = line.split()
                    if len(toks) > 10:
                        docs.append(line)
                        count += 1
                        if count % 10000 == 0:
                            print('processing... ' + str(count))
                    if max < count:
                        break
                corpus = ptm.Corpus(docs)

        if (type(corpus) != list and len(corpus.docs) > 0) or (type(corpus) == list and len(corpus) > 0):
            result = pipeline.processCorpus(corpus)
            for doc in result:
                for sent in doc:
                    _sent = ''
                    for word in sent:
                        _sent += word + ' '
                    _sent = _sent.strip()
                    _sent = util.jamo_sentence(_sent)
                    toks = _sent.split()
                    if len(toks) > 10:
                        self.documents.append(toks)

    elif mode == 'jamo_split_filtered':
        # documents = LineSentence(datapath('/Data/ko_sns_comments/naver_comments15_16_filtered.txt'))
        pipeline = ptm.Pipeline(ptm.tokenizer.Word())
        for a_file in files:
            if is_directory and max == -1:
                corpus += ptm.CorpusFromDirectory(a_file).docs
            elif not is_directory and doc_index != -1 and max == -1:
                corpus += ptm.CorpusFromFieldDelimitedFile(a_file, doc_index).docs
            elif not is_directory and doc_index == -1 and max == -1:
                corpus += ptm.CorpusFromFile(a_file).docs
            elif not is_directory and max > 0:
                count = 0
                docs = []
                for line in open(a_file):
                    if doc_index != -1:
                        line = line.split()[doc_index]
                    if len(line) < 1:
                        continue
                    toks = line.split()
                    if len(toks) > 10:
                        docs.append(line)
                        count += 1
                        if count % 10000 == 0:
                            print('processing... ' + str(count))
                    if max < count:
                        break
                corpus = ptm.Corpus(docs)

        if (type(corpus) != list and len(corpus.docs) > 0) or (type(corpus) == list and len(corpus) > 0):
            result = pipeline.processCorpus(corpus)
            for doc in result:
                _sent = ''
                for word in doc:
                    _sent += word + ' '
                _sent = _sent.strip()
                _sent = util.jamo_sentence(_sent)
                toks = _sent.split()
                if len(toks) > 10:
                    self.documents.append(toks)

    elif mode == 'simple':
        # documents = LineSentence(datapath('/Data/ko_sns_comments/naver_comments15_16_filtered.txt'))
        count = 0
        for line in open(files[0], encoding='utf-8'):
            if doc_index != -1:
                line = line.split()[doc_index]
            toks = line.split()
            if len(toks) > 10:
                self.documents.append(toks)
                count += 1
                if count % 10000 == 0:
                    print('processing... ' + str(count))

    print('Document size for the total dataset: ' + str(len(self.documents)))
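# A minimal usage sketch for the preprocessing() method above. The owning class is not
# shown in this excerpt, so WordEmbeddingTrainer below is a hypothetical name for a
# trainer-style class that initializes self.documents = [] and exposes this method; the
# file and dictionary paths are reused from the other examples in this document.
if __name__ == '__main__':
    trainer = WordEmbeddingTrainer()  # hypothetical owner of preprocessing()
    trainer.preprocessing(mode='simple',
                          path='C:\\mecab\\mecab-ko-dic',
                          stopword_file='./stopwords/stopwordsKor.txt',
                          files=['./data/134963_norm.txt'],
                          is_directory=False,
                          doc_index=-1,
                          max=-1)
    print('Preprocessed documents: ' + str(len(trainer.documents)))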
import pyTextMiner as ptm

#corpus = ptm.CorpusFromFieldDelimitedFile('./data/donald.txt', 2)
corpus = ptm.CorpusFromFile('./data/134963_norm.txt')

# import nltk
# nltk.download()

# Now that the text has been split into words, the stopwords can be removed.
# ptm.helper.StopwordFilter is used to drop the unnecessary words.
# Appending ptm.stemmer.Porter() to the pipeline would also perform stemming.
# Try editing the code to use ptm.stemmer.Lancaster() as well; comparing how the
# Lancaster stemmer differs from the Porter stemmer is an interesting exercise.
pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                        ptm.tokenizer.Komoran(),
                        ptm.helper.POSFilter('NN*'),
                        ptm.helper.SelectWordOnly(),
                        ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))

result = pipeline.processCorpus(corpus)
print(result)
print()

documents = []
for doc in result:
    document = ''
    for sent in doc:
        document += " ".join(sent) + ' '
    documents.append(document.strip())

# 2016-10-20.txt
corpus1 = ptm.CorpusFromFile('./data/2016-10-20.txt')
noun_extractor = ptm.noun_extractor.NounExtractionKorean(corpus1)
sent = '두바이월드센터시카고옵션거래소'
result = noun_extractor(sent)
print(result)
import pyTextMiner as ptm

corpus = ptm.CorpusFromFile('./data/2016-10-20.txt')
pmi = ptm.pmi.PMICalculator(corpus)

sent = '아이오아이'
result = pmi(sent)
print(result)
import os, subprocess

from sklearn.feature_extraction.text import CountVectorizer

import pyTextMiner as ptm

mecab_path = 'C:\\mecab\\mecab-ko-dic'
pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                        ptm.tokenizer.MeCab(mecab_path),
                        ptm.helper.POSFilter('NN*'),
                        ptm.helper.SelectWordOnly(),
                        ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))

corpus = ptm.CorpusFromFile('./data/134963_norm.txt')
result = pipeline.processCorpus(corpus)

# write one preprocessed sentence per line for the external co-occurrence counter
with open('processed_134963.txt', 'w', encoding='utf-8') as f_out:
    for doc in result:
        for sent in doc:
            new_sent = ''
            for word in sent:
                new_sent += word + ' '
            new_sent = new_sent.strip()
            f_out.write(new_sent + "\n")

file_path = 'D:\\python_workspace\\pyTextMiner\\processed_134963.txt'
co = 'D:\\python_workspace\\pyTextMiner\\external_programs\\ccount.exe ' \
     + "--input " + file_path + " --threshold " + str(2) + " --output " + "co_result.txt"
subprocess.run(co, shell=True)

co_results = {}
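# A minimal sketch of loading the co-occurrence output back into co_results. It assumes
# ccount.exe writes one pair per line as "word1<TAB>word2<TAB>count"; adjust the
# delimiter and field order to whatever co_result.txt actually contains.
if os.path.exists('co_result.txt'):
    with open('co_result.txt', encoding='utf-8') as co_in:
        for a_line in co_in:
            fields = a_line.strip().split('\t')
            if len(fields) == 3:
                word1, word2, freq = fields
                co_results[(word1, word2)] = int(freq)
    print('number of co-occurring pairs: ' + str(len(co_results)))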
import io

import nltk
from nltk.corpus import sentiwordnet as swn

import pyTextMiner as ptm


class EnglishDictionarySentimentAnalyzer:
    def __init__(self):
        self.name = 'EnglishDictionarySentimentAnalyzer'

    def createDictionary(self):
        nltk.download('sentiwordnet')


if __name__ == '__main__':
    corpus = ptm.CorpusFromFile('./data/sampleEng.txt')
    pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                            ptm.tokenizer.Word(),
                            ptm.helper.StopwordFilter(file='./stopwords/stopwordsEng.txt'),
                            ptm.tagger.NLTK(),
                            ptm.lemmatizer.WordNet())

    result = pipeline.processCorpus(corpus)
    EnglishDictionarySentimentAnalyzer().createDictionary()

    for doc in result:
        for sent in doc:
            for _str in sent:
                word = _str[0]   # surface form
                tag = _str[1]    # POS tag assigned by ptm.tagger.NLTK()
                pos = ''
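# A minimal sketch of how the (word, tag) pairs above could be scored with SentiWordNet.
# It assumes Penn Treebank style tags from ptm.tagger.NLTK() and averages the scores over
# all matching synsets; the tag mapping and the aggregation are illustrative choices, not
# part of pyTextMiner. (SentiWordNet lookups also need the 'wordnet' corpus downloaded.)
def sentiwordnet_score(word, treebank_tag):
    # map a Penn Treebank tag prefix to a WordNet POS letter
    if treebank_tag.startswith('NN'):
        wn_pos = 'n'
    elif treebank_tag.startswith('VB'):
        wn_pos = 'v'
    elif treebank_tag.startswith('JJ'):
        wn_pos = 'a'
    elif treebank_tag.startswith('RB'):
        wn_pos = 'r'
    else:
        return 0.0
    synsets = list(swn.senti_synsets(word, wn_pos))
    if not synsets:
        return 0.0
    # positive minus negative score, averaged over every sense of the word
    return sum(s.pos_score() - s.neg_score() for s in synsets) / len(synsets)

# example usage inside the loop above: score = sentiwordnet_score(word, tag)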
result = lsi.print_topics(5, 20)
for a_topic in result:
    print("LSI results " + str(a_topic))

# create a double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi
corpus_lsi = lsi[corpus_tfidf]

# for doc in corpus_lsi:  # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
#     print(doc)

if __name__ == '__main__':
    import pyTextMiner as ptm
    import io
    import nltk

    corpus = ptm.CorpusFromFile('../donald.txt')
    pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                            ptm.tokenizer.Komoran(),
                            ptm.helper.POSFilter('NN*'),
                            ptm.helper.SelectWordOnly(),
                            ptm.helper.StopwordFilter(file='../stopwordsKor.txt'),
                            ptm.ngram.NGramTokenizer(3))

    result = pipeline.processCorpus(corpus)

    id = 0
    text_data = []
    for doc in result:
        new_doc = []
        for sent in doc:
            for _str in sent:
                if len(_str) > 0:
                    new_doc.append(_str)
        text_data.append(new_doc)
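# A minimal gensim sketch of how the `lsi` model and `corpus_tfidf` referenced at the top
# of this excerpt could be built from text_data. The choice of five topics mirrors
# print_topics(5, 20); everything else here is an assumption for illustration.
from gensim import corpora, models

dictionary = corpora.Dictionary(text_data)                         # word <-> id mapping
bow_corpus = [dictionary.doc2bow(tokens) for tokens in text_data]  # bag-of-words vectors

tfidf = models.TfidfModel(bow_corpus)                              # bow -> tfidf
corpus_tfidf = tfidf[bow_corpus]

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=5)  # tfidf -> lsi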
import logging

import pyTextMiner as ptm
from gensim.models.doc2vec import TaggedDocument

mecab_path = 'C:\\mecab\\mecab-ko-dic'
# stopwords file path
stopwords = '../stopwords/stopwordsKor.txt'
# train documents input path
input_path = '../data/donald.txt'
# output base directory
output_base_dir = './tmp'

pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(),
                        ptm.tokenizer.MeCab(mecab_path),
                        ptm.lemmatizer.SejongPOSLemmatizer(),
                        ptm.helper.SelectWordOnly(),
                        ptm.helper.StopwordFilter(file=stopwords))

corpus = ptm.CorpusFromFile(input_path)
documents = []
result = pipeline.processCorpus(corpus)

i = 0
for doc in result:
    document = []
    for sent in doc:
        for word in sent:
            document.append(word)
    documents.append(TaggedDocument(document, [i]))
    i += 1

# --epochs 40 --vocab-min-count 10 data/stopwords_german.txt dewiki-preprocessed.txt /tmp/models/doc2vec-dewiki
doc2vec = Doc2VecTrainer()  # Doc2VecTrainer is defined elsewhere in this example (not shown here)
logging.basicConfig(format='[%(asctime)s] [%(levelname)s] %(message)s', level=logging.INFO)
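# Doc2VecTrainer itself is not shown in this excerpt. As a point of reference, the sketch
# below trains a gensim Doc2Vec model (gensim 4.x API) directly on the TaggedDocument
# list, reusing the epoch and min-count values from the command-line comment above; the
# vector size, worker count, and output file name are assumptions for illustration.
import os
from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec(vector_size=100, min_count=10, epochs=40, workers=4)
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

os.makedirs(output_base_dir, exist_ok=True)
model.save(os.path.join(output_base_dir, 'doc2vec-donald.model'))

# documents most similar to the first one, by inferred vector
print(model.dv.most_similar([model.infer_vector(documents[0].words)], topn=5))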