import pyTextMiner as ptm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


def vectorizeCaseTwo():
    corpus = ptm.CorpusFromFieldDelimitedFile('./data/donald.txt', 2)
    pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                            ptm.tokenizer.Komoran(),
                            ptm.helper.POSFilter('NN*'),
                            ptm.helper.SelectWordOnly(),
                            ptm.ngram.NGramTokenizer(2, 2),
                            ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))
    result = pipeline.processCorpus(corpus)
    print('== morphological analysis + noun-only extraction + word-only display + frequency analysis ==')
    print(result)
    print()
    print('== ==')

    # Join the tokenized sentences of each document back into one string per document.
    documents = []
    for doc in result:
        document = ''
        for sent in doc:
            document += " ".join(sent) + ' '
        documents.append(document.strip())

    # Bag-of-words term counts.
    # Note: get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() there.
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(documents)
    print(vectorizer.get_feature_names())
    print(X.shape)
    print(X.toarray())

    # Tf-idf weights over the same vocabulary.
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(documents)
    print(vectorizer.get_feature_names())
    print(len(vectorizer.get_feature_names()))
    print(X.toarray())
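# A minimal standalone sketch (not part of the original script) of what the two vectorizers
# above produce on a toy corpus: CountVectorizer yields raw term counts, while TfidfVectorizer
# down-weights terms that occur in many documents.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

toy_docs = ['경제 위기 경제', '경제 회복']   # two tiny whitespace-separated documents
print(CountVectorizer().fit_transform(toy_docs).toarray())   # integer counts per term
print(TfidfVectorizer().fit_transform(toy_docs).toarray())   # L2-normalized tf-idf weights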
import pyTextMiner as ptm

_stopwords = []
with open("./stopwords/stopwordsKor.txt", encoding='utf-8') as file:
    for line in file:
        line = line.strip()  # or some other preprocessing
        _stopwords.append(line)  # storing everything in memory!

path = 'C:\\mecab\\mecab-ko-dic'
# pos_tagger_name: komoran, okt, nltk (mecab is used below)
# lang: ko or en
pipeline = ptm.Pipeline(ptm.keyword.TextRankExtractor(pos_tagger_name='mecab',
                                                      mecab_path=path,
                                                      max=5,
                                                      lang='ko',
                                                      stopwords=_stopwords,
                                                      combined_keywords=True))
corpus = ptm.CorpusFromFile('./data/sampleKor.txt')
result = pipeline.processCorpus(corpus)
print('== TextRank Keyword Extraction ==')
print(result)
print()

from sklearn.datasets import fetch_20newsgroups

ng20 = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
print("XXXX " + str(ng20.data[0]))
import pyTextMiner as ptm

# corpus = ptm.CorpusFromFieldDelimitedFile('./data/donald.txt', 2)
corpus = ptm.CorpusFromFile('./data/134963_norm.txt')

# import nltk
# nltk.download()

# Now that the text is split into words, stopwords can be removed: ptm.helper.StopwordFilter
# drops unnecessary words. You can also append ptm.stemmer.Porter() to the pipeline to extract
# stems, then swap in ptm.stemmer.Lancaster() to compare how the two stemmers differ
# (a small comparison sketch follows below).
pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                        ptm.tokenizer.Komoran(),
                        ptm.helper.POSFilter('NN*'),
                        ptm.helper.SelectWordOnly(),
                        ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))
result = pipeline.processCorpus(corpus)
print(result)
print()

# Rebuild one string per document by concatenating its sentences.
documents = []
for doc in result:
    document = ''
    for sent in doc:
        document += " ".join(sent) + ' '
    documents.append(document.strip())

# 2016-10-20.txt
corpus1 = ptm.CorpusFromFile('./data/2016-10-20.txt')
noun_extractor = ptm.noun_extractor.NounExtractionKorean(corpus1)
sent = '두바이월드센터시카고옵션거래소'
result = noun_extractor(sent)  # equivalent to noun_extractor.__call__(sent)
print(result)
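# A small side-by-side sketch (assumes NLTK is installed) illustrating the comment above:
# the Lancaster stemmer is considerably more aggressive than the Porter stemmer, so it tends
# to produce shorter, less readable stems.
from nltk.stem import PorterStemmer, LancasterStemmer

porter, lancaster = PorterStemmer(), LancasterStemmer()
for w in ['running', 'organization', 'friendly', 'maximum']:
    print(w, '->', porter.stem(w), '/', lancaster.stem(w))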
from document_classification.ml_textclassification import documentClassifier
import pyTextMiner as ptm

if __name__ == '__main__':
    document_classifier = documentClassifier()
    mecab_path = 'C:\\mecab\\mecab-ko-dic'
    pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                            ptm.tokenizer.MeCab(mecab_path),
                            ptm.helper.POSFilter('NN*'),
                            ptm.helper.SelectWordOnly(),
                            ptm.ngram.NGramTokenizer(2, 2),
                            # ptm.tokenizer.LTokenizerKorean(),
                            ptm.helper.StopwordFilter(file='../stopwords/stopwordsKor.txt'))

    # mode is either train or predict
    mode = 'train'
    if mode == 'train':  # use ==, not 'is', for string comparison
        input_file = './data/3_class_naver_news.csv'

        # 1. text processing and representation
        corpus = ptm.CorpusFromFieldDelimitedFileForClassification(input_file,
                                                                   delimiter=',',
                                                                   doc_index=4,
                                                                   class_index=1,
                                                                   title_index=3)
        docs = corpus.docs
        tups = corpus.pair_map
        class_list = []
        for id in tups:
            # print(tups[id])
test_sample = '한국 경제가 위기에 처하다'
# doc2vec is assumed to be a trained document-embedding model created earlier in the original script.
# Convert the sample document into a list and use the infer_vector method to get a vector
# representation for it.
new_doc_words = test_sample.split()
similars = doc2vec.most_similar(test_sample)
for sim in similars:
    print(str(sim))

mecab_path = 'C:\\mecab\\mecab-ko-dic'
# stopwords file path
stopwords = '../stopwords/stopwordsKor.txt'
test_sample1 = '중국 시장은 위축되었다'
pipeline = ptm.Pipeline(ptm.tokenizer.MeCab(mecab_path),
                        ptm.lemmatizer.SejongPOSLemmatizer(),
                        ptm.helper.SelectWordOnly(),
                        ptm.helper.StopwordFilter(file=stopwords))

doc_vec1 = pipeline.processCorpus([test_sample])
doc_vec2 = pipeline.processCorpus([test_sample1])
print(doc_vec1[0])
print(doc_vec2[0])

# Compute the similarity between the two processed documents.
similarity = doc2vec.compute_similarity_vec(first_vec=doc_vec1[0], second_vec=doc_vec2[0])
print('similarity between the two documents: ')
print(str(similarity))
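# A self-contained sketch of the cosine similarity that compute_similarity_vec above presumably
# computes once each token list has been embedded; the vectors below are random and purely
# illustrative, not output of the pipeline.
import numpy as np

vec_a = np.random.rand(100)
vec_b = np.random.rand(100)
cosine = float(np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)))
print('cosine similarity:', cosine)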
    def preprocessing(self, mode, path, stopword_file, files, is_directory=False, doc_index=-1, max=-1):
        util = ptm.Utility()
        # mode is one of: unfiltered, filtered, jamo_split_unfiltered, jamo_split_filtered, simple
        corpus = []

        def load_corpus(corpus, a_file):
            # Shared per-file loading logic used by every mode below.
            if is_directory == True and max == -1:
                corpus += ptm.CorpusFromDirectory(a_file).docs
            elif is_directory == False and doc_index != -1 and max == -1:
                corpus += ptm.CorpusFromFieldDelimitedFile(a_file, doc_index).docs
            elif is_directory == False and doc_index == -1 and max == -1:
                corpus += ptm.CorpusFromFile(a_file).docs
            elif is_directory == False and max > 0:
                # Read at most `max` documents, keeping only lines with more than 10 tokens.
                count = 0
                docs = []
                for line in open(a_file):
                    if doc_index != -1:
                        line = line.split()[doc_index]
                    if len(line) < 1:
                        continue
                    toks = line.split()
                    if len(toks) > 10:
                        docs.append(line)
                        count += 1
                    if count % 10000 == 0:
                        print('processing... ' + str(count))
                    if max < count:
                        break
                corpus = ptm.Corpus(docs)
            return corpus

        def has_docs(corpus):
            return (type(corpus) != list and len(corpus.docs) > 0) or \
                   (type(corpus) == list and len(corpus) > 0)

        if mode == 'unfiltered':
            # path = '/usr/local/lib/mecab/dic/mecab-ko-dic'
            pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(),
                                    ptm.tokenizer.MeCab(path),
                                    ptm.lemmatizer.SejongPOSLemmatizer(),
                                    ptm.helper.SelectWordOnly(),
                                    ptm.helper.StopwordFilter(file=stopword_file))
            for a_file in files:
                corpus = load_corpus(corpus, a_file)

            if has_docs(corpus):
                result = pipeline.processCorpus(corpus)
                for doc in result:
                    document = []
                    for sent in doc:
                        for word in sent:
                            document.append(word)
                    self.documents.append(document)

        elif mode == 'filtered':
            pipeline = ptm.Pipeline(ptm.tokenizer.Word())
            # corpus = ptm.CorpusFromFile('/Data/ko_sns_comments/naver_comments15_16_filtered.txt')
            for a_file in files:
                corpus = load_corpus(corpus, a_file)
            self.documents = pipeline.processCorpus(corpus)

        elif mode == 'jamo_split_unfiltered':
            # documents = LineSentence(datapath('/Data/ko_sns_comments/naver_comments15_16_filtered.txt'))
            pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(),
                                    ptm.tokenizer.MeCab(path),
                                    ptm.lemmatizer.SejongPOSLemmatizer(),
                                    ptm.helper.SelectWordOnly(),
                                    ptm.helper.StopwordFilter(file=stopword_file))
            for a_file in files:
                corpus = load_corpus(corpus, a_file)

            if has_docs(corpus):
                result = pipeline.processCorpus(corpus)
                for doc in result:
                    for sent in doc:
                        _sent = ' '.join(sent).strip()
                        _sent = util.jamo_sentence(_sent)
                        toks = _sent.split()
                        if len(toks) > 10:
                            self.documents.append(toks)

        elif mode == 'jamo_split_filtered':
            # documents = LineSentence(datapath('/Data/ko_sns_comments/naver_comments15_16_filtered.txt'))
            pipeline = ptm.Pipeline(ptm.tokenizer.Word())
            for a_file in files:
                corpus = load_corpus(corpus, a_file)

            if has_docs(corpus):
                result = pipeline.processCorpus(corpus)
                for doc in result:
                    _sent = ' '.join(doc).strip()
                    _sent = util.jamo_sentence(_sent)
                    toks = _sent.split()
                    if len(toks) > 10:
                        self.documents.append(toks)

        elif mode == 'simple':
            # documents = LineSentence(datapath('/Data/ko_sns_comments/naver_comments15_16_filtered.txt'))
            count = 0
            for line in open(files[0], encoding='utf-8'):
                if doc_index != -1:
                    line = line.split()[doc_index]
                toks = line.split()
                if len(toks) > 10:
                    self.documents.append(toks)
                    count += 1
                    if count % 10000 == 0:
                        print('processing... ' + str(count))

        print('Document size for the total dataset: ' + str(len(self.documents)))
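# A minimal standalone sketch of the 'simple' mode above, written outside the class: it assumes
# a plain UTF-8 text file with one whitespace-tokenized document per line (the file name in the
# usage comment is illustrative, not from the original code).
def simple_preprocessing(file_path, doc_index=-1, min_tokens=10):
    documents = []
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            if doc_index != -1:
                line = line.split()[doc_index]
            toks = line.split()
            if len(toks) > min_tokens:  # keep only reasonably long documents
                documents.append(toks)
    return documents

# documents = simple_preprocessing('./data/naver_comments15_16_filtered.txt')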
import pyTextMiner as ptm

dictionary_path = './dict/user_dic.txt'
pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                        ptm.tokenizer.Komoran(userdic=dictionary_path),
                        ptm.helper.POSFilter('NN*'),
                        ptm.helper.SelectWordOnly(),
                        # ptm.tokenizer.MaxScoreTokenizerKorean(),
                        # ptm.tokenizer.Word(),
                        ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))
                        # ptm.ngram.NGramTokenizer(2, 3),
                        # ptm.counter.WordCounter())
corpus = ptm.CorpusFromEojiFile('./data/filtered_content.txt')
# result = pipeline.processCorpus(corpus)
# print(result)
print()

import numpy as np
print(np.__version__)

s = "회사 동료 분들과 다녀왔는데 분위기도 좋고 음식도 맛있었어요 다만, 강남 토끼정이 강남 쉑쉑버거 골목길로 쭉 올라가야 하는데 다들 쉑쉑버거의 유혹에 넘어갈 뻔 했답니다 강남역 맛집 토끼정의 외부 모습."
pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter())
corpus = [s]
result = pipeline.processCorpus(corpus)
print(result)
import gensim
import pyTextMiner as ptm
from sklearn.datasets import fetch_20newsgroups

# Pre-trained Google News vectors; run once to download them:
# !wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
model = gensim.models.KeyedVectors.load_word2vec_format(
    '../embeddings/GoogleNews-vectors-negative300.bin.gz', binary=True)

# Fetch the 20 newsgroups dataset
ng20 = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# text and ground-truth labels
texts, y = ng20.data, ng20.target

# corpus = [preprocess(text) for text in texts]
pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                        ptm.tokenizer.Word(),
                        ptm.helper.StopwordFilter(file='../stopwords/stopwordsEng.txt'),
                        ptm.stemmer.Porter())
result = pipeline.processCorpus(texts)

# Flatten each processed document into a single list of tokens.
corpus = []
for doc in result:
    document = []
    for sent in doc:
        for word in sent:
            document.append(word)
    corpus.append(document)


# ### Remove empty docs
def filter_docs(corpus, texts, labels, condition_on_doc):
    """
    Filter corpus, texts and labels given the function condition_on_doc which takes
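# The docstring above is cut off in this listing. A common implementation of such a helper
# (a sketch, assuming condition_on_doc receives a token list and returns a bool) keeps only
# the entries whose document satisfies the condition:
def filter_docs_sketch(corpus, texts, labels, condition_on_doc):
    """Keep only the (doc, text, label) triples whose doc satisfies condition_on_doc."""
    number_of_docs = len(corpus)
    if texts is not None:
        texts = [text for (text, doc) in zip(texts, corpus) if condition_on_doc(doc)]
    labels = [label for (label, doc) in zip(labels, corpus) if condition_on_doc(doc)]
    corpus = [doc for doc in corpus if condition_on_doc(doc)]
    print("{} docs removed".format(number_of_docs - len(corpus)))
    return (corpus, texts, labels)

# Example: drop documents that became empty after preprocessing.
# corpus, texts, y = filter_docs_sketch(corpus, texts, y, lambda doc: len(doc) > 0)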
import multiprocessing
from time import time

import gensim
import pyTextMiner as ptm
from gensim.models import Word2Vec

cores = multiprocessing.cpu_count()  # Count the number of cores in a computer

print('Start reading the dataset 1....')
path = '/usr/local/lib/mecab/dic/mecab-ko-dic'
pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(),
                        ptm.tokenizer.MeCab(path),
                        ptm.lemmatizer.SejongPOSLemmatizer(),
                        ptm.helper.SelectWordOnly(),
                        ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))
corpus = ptm.CorpusFromFieldDelimitedEmojiFile('/Data/ko_sns_comments/xab', 1)

result1 = pipeline.processCorpus(corpus)
print('Finish processing... ')

i = 0
file = open("naver_comments15_16_filtered.txt", "a+")
for doc in result1:
    if i % 10000 == 0:
        print('processing ' + str(i))
    i += 1
    document = ''
    for sent in doc:
        for word in sent:
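# Word2Vec is imported above, but this fragment only writes the filtered comments to a file.
# A minimal training sketch with gensim (parameter names follow gensim 4.x, where the 3.x
# `size` argument became `vector_size`); the toy sentences below are purely illustrative.
sentences = [['한국', '경제', '위기'], ['중국', '시장', '위축']]
w2v = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1,
               workers=cores, sg=1)  # sg=1 selects the skip-gram architecture
print(w2v.wv['경제'].shape)  # each vocabulary word now has a 100-dimensional vector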
import nltk
import pyTextMiner as ptm


class EnglishDictionarySentimentAnalyzer:
    def __init__(self):
        self.name = 'EnglishDictionarySentimentAnalyzer'

    def createDictionary(self):
        nltk.download('sentiwordnet')


if __name__ == '__main__':
    corpus = ptm.CorpusFromFile('./data/sampleEng.txt')
    pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                            ptm.tokenizer.Word(),
                            ptm.helper.StopwordFilter(file='./stopwords/stopwordsEng.txt'),
                            ptm.tagger.NLTK(),
                            ptm.lemmatizer.WordNet())
    result = pipeline.processCorpus(corpus)
    EnglishDictionarySentimentAnalyzer().createDictionary()

    for doc in result:
        for sent in doc:
            for _str in sent:
                word = _str[0]  # lemmatized token
                tag = _str[1]   # POS tag assigned by the NLTK tagger
                # Map the tagger's POS tag onto the single-letter POS codes used by SentiWordNet.
                pos = ''
                if str(_str[1]).startswith("N"):
                    pos = 'n'
                elif str(_str[1]).startswith("A"):
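# A short sketch of what the truncated loop above is building toward: looking a lemma up in
# SentiWordNet with its simplified POS code and reading its positive/negative/objective scores.
# Assumes nltk.download('sentiwordnet') and nltk.download('wordnet') have been run.
from nltk.corpus import sentiwordnet as swn

synsets = list(swn.senti_synsets('happy', 'a'))  # 'a' = adjective, matching the pos mapping above
if synsets:
    first = synsets[0]
    print(first.pos_score(), first.neg_score(), first.obj_score())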
file_name = './data/emo_positive.txt'
sentiAnalyzer.readPositiveEmotiDictionary(file_name)
file_name = './data/polarity.csv'
sentiAnalyzer.readPolarityDictionary(file_name)
dict_list = sentiAnalyzer.getSentiDictionary()

pipeline = None
corpus = ptm.CorpusFromFieldDelimitedFile('../data/donald.txt', 2)
mecab_path = 'C:\\mecab\\mecab-ko-dic'
mode = 'korean_lemmatizer'
if mode != 'korean_lemmatizer':  # use !=, not 'is not', for string comparison
    pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                            ptm.tokenizer.MeCab(mecab_path),
                            # ptm.tokenizer.Komoran(),
                            ptm.helper.SelectWordOnly(),
                            # ptm.ngram.NGramTokenizer(1, 2, concat=' '),
                            ptm.helper.StopwordFilter(file='../stopwords/stopwordsKor.txt'))
else:
    pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                            ptm.tokenizer.MeCab(mecab_path),
                            # ptm.tokenizer.Komoran(),
                            ptm.lemmatizer.SejongPOSLemmatizer(),
                            ptm.helper.SelectWordOnly(),
                            # ptm.ngram.NGramTokenizer(1, 2, concat=' '),
                            ptm.helper.StopwordFilter(file='../stopwords/stopwordsKor.txt'))

documents = ['오늘은 비가와서 그런지 매우 우울하다',
import csv
import re

import pandas as pd
import pyTextMiner as ptm


def preprocessing_english():
    # `brand` is assumed to be defined at module level in the original script.
    tweet_raw = pd.read_csv('./SuperbowlData/클린하고유저거름/user정리_{}_2019_09_23_to_2019_10_02.csv'.format(brand),
                            encoding='utf-8', header=0)
    # cols = ['sentiment', 'id', 'date', 'query_string', 'user', 'text']  # this is for sentiment140 preprocess
    # cols = ['date', 'time', 'user_name', 'text', 'link', 'retweet_counts', 'favorite_counts']  # this is for tweets collected via GOT3
    # tweet_raw = pd.read_csv("./data/sentiment140_training.1600000.processed.noemoticon.csv",
    #                         encoding='latin-1', header=None, names=cols)  # this is for sentiment140 data preprocess
    # tweet_raw = pd.read_csv("./SuperbowlData/클린하고유저거름/필터_clear_Kia_twitter_data_2019-01-31_to_2019-02-07.csv",
    #                         encoding='utf-8', header=0)
    print(tweet_raw['text'][:5])
    # tweet_raw = tweet_raw[0:5]  # for testing

    corpus = []
    for i in range(len(tweet_raw)):
        doc = str(tweet_raw['text'][i])
        # Repair mojibake left over from a broken encoding round-trip and strip brackets and '%'.
        doc = doc.replace("[", "").replace("]", "").replace("%", "").replace("? 셳", "'t").replace("?셳", "'t").replace("? 셲", "'s").replace("?셲", "'s")\
            .replace("? 쁥", "'h").replace("?쁥", "'h").replace("? 쁲", "'s").replace("?쁲", "'s").replace("? 셱", "'r").replace("?셱", "'r").replace("? 쁳", "'t").replace("?쁳", "'t")\
            .replace("? 셫", "'m").replace("?셫", "'m").replace("? 쁶", "'w").replace("?쁶", "'w").replace("? 쐏", "'p").replace("?쐏", "'p").replace("? 쐌", "'M").replace("?쐌", "'M")\
            .replace("? 셙", "'a").replace("?셙", "'a").replace("? 쏧", "'I").replace("?쏧", "'I").replace("훮", "ā").replace("? 셶", "'v").replace("?셶", "'v")\
            .replace("? 쏷", "'T").replace("?쏷", "'T").replace("? 쏝", "'B").replace("?쏝", "'B").replace("? 셪", "'l").replace("?셪", "'l").replace("? 쐙", "'y").replace("?쐙", "'y")\
            .replace("짙", "£").replace("?쏪", "'J").replace("챕", "é").replace("? 쏻", "'W").replace("?쏻", "'W").replace("? 쐓", "'S").replace("?쐓", "'S")\
            .replace("훮", "ā").replace("? 쐁", "'C").replace("?쐁", "'C").replace("竊쉎", ": h").replace("竊", "(").replace("? 쐌", "'m").replace("?쐌", "'m")\
            .replace("? 쒴", "'K").replace("?쒴", "'K").replace("? 쐆", "'h").replace("?쐆", "'h").replace("? 셎", "'S").replace("?셎", "'S").replace("? 쁅", "'F").replace("?쁅", "'F")\
            .replace("? 쐔", "'T").replace("?쐕", "'T").replace("?죛", " s").replace("?쒋?", "'' ").replace("? 쏞", "'C").replace("?쏞", "'C").replace("? 쏱", "'P").replace("?쏱", "'P")\
            .replace("? 셝", "'d").replace("?셝", "'d").replace("? 쏽", "'Y").replace("?쏽", "'Y").replace("? 쏫", "'K").replace("?쏫", "'K").replace("? 쏤", "'F").replace("?쏤", "'F")\
            .replace("? 쏦", "'H").replace("?쏦", "'H").replace("&", "and").replace("? 쏺", "'V").replace("?쏺", "'V")\
            .replace("? 쏛", "'A").replace("?쏛", "'A").replace("? 쏡", "'S").replace("?쏡", "'S").replace("? 쐍", "'n").replace("?쐍", "'n").replace("?㏇뇰?▧?", "").replace("? 쐊", "'k").replace("?쐊", "'k")
        doc = re.sub("@[\d|A-Z|a-z|_.]+", "", doc)  # remove user mentions
        doc = re.sub("(http|https|ftp|telnet|news|mms)://[^\"'\s()]+", "", doc)  # remove URLs
        doc = doc.replace("'ve", " have").replace("'s", " is").replace("n't", " not")\
            .replace("'m", " am").replace("'ll", " will").replace("'d", "would")
        doc = doc.lower()
        # doc = re.sub("[^a-zA-Z]", " ", doc)  # remove special characters
        doc = re.sub("[^A-Za-z.?!\s]", " ", doc)
        corpus.append("{}".format(doc))  # .split(".")

    pipeline1 = ptm.Pipeline(ptm.splitter.NLTK(),
                             ptm.tokenizer.Word(),
                             ptm.helper.StopwordFilter(file='./stopwordsEng.txt'),
                             ptm.tagger.NLTK(),
                             ptm.lemmatizer.WordNet(),
                             ptm.helper.SelectWordOnly())
                             ##, kp.ngram.NGramTokenizer())
    # Below: a different pipeline for LDA ...
    pipeline2 = ptm.Pipeline(ptm.splitter.NLTK(),
                             ptm.tokenizer.Word(),
                             ptm.helper.StopwordFilter('./data/english_stopwords.txt'),
                             ptm.tagger.NLTK(),
                             ptm.lemmatizer.WordNet(),
                             ptm.helper.POSFilter('N*', 'J*'),
                             ptm.helper.SelectWordOnly())

    result1 = pipeline1.processCorpus(corpus)
    # result2 = pipeline2.processCorpus(corpus)

    f_output = open('./전처리/preprocessed/전처리최종_앱티브_2019{}.csv'.format(brand), 'w',
                    encoding='utf-8', newline='')
    csv_writer = csv.writer(f_output)  # quoting=csv.QUOTE_ALL
    # csv_writer.writerow(['sentiment', 'id', 'date', 'query_string', 'user', 'text'])
    for i, doc in enumerate(result1):
        # doc = re.sub('[\W]', '', doc)  # remove special characters
        # Remove punctuations and numbers
        # doc = re.sub('[^a-zA-Z]', ' ', doc)
        # Single character removal
        # doc = re.sub(r"\s+[a-zA-Z]\s+", ' ', doc)
        # Drop words of length 2 or less (remove very short words)
        '''
        for w in doc:
            w = re.sub('[^a-zA-Z]', ' ', w)
            w = re.sub(r"\s+[a-zA-Z]\s+", ' ', w)
            if len(w) > 2:
                doc = doc.append(' '.join(w))
        '''
        # print(i, doc)
        sent = list(map(" ".join, doc))
        # csv_writer.writerow([tweet_raw['sentiment'][i], tweet_raw['id'][i], tweet_raw['date'][i],
        #                      tweet_raw['query_string'][i], tweet_raw['user'][i],
        #                      "{}".format(" ".join(sent))])  # not this; don't use
        csv_writer.writerow([tweet_raw['date'][i], tweet_raw['time'][i],
                             tweet_raw['user_name'][i], " ".join(sent)])
import io

import pyTextMiner as ptm

# Morphological analysis can be done with ptm.tokenizer.Komoran or ptm.tokenizer.TwitterKorean.
# After the analysis, keep only nouns whose POS tag starts with NN and print just the words.

# import nltk
# nltk.download('punkt')

# pipeline = ptm.Pipeline(ptm.splitter.NLTK(), ptm.tokenizer.Komoran(),
#                         ptm.helper.POSFilter('NN*'),
#                         ptm.helper.SelectWordOnly(),
#                         ptm.ngram.NGramTokenizer(3),
#                         ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt')
#                         )
pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                        ptm.segmentation.SegmentationKorean('./model/korean_segmentation_model.crfsuite'),
                        ptm.ngram.NGramTokenizer(3),
                        ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))

# `corpus` is assumed to be built earlier in the original script (e.g. via ptm.CorpusFromFile).
result = pipeline.processCorpus(corpus)

with io.open("demofile.csv", 'w', encoding='utf8') as f:
    for doc in result:
        for sent in doc:
            f.write('\t'.join(sent) + "\n")

print('== sentence splitting + morphological analysis + noun-only extraction + word-only display + phrase extraction ==')
print(result)
print()