Example #1
import pyTextMiner as ptm

# load the Korean stopword list, one word per line
_stopwords = []
with open("./stopwords/stopwordsKor.txt", encoding='utf-8') as file:
    for line in file:
        word = line.strip()
        if word:  # skip blank lines
            _stopwords.append(word)

path = 'C:\\mecab\\mecab-ko-dic'
# pos_tagger_name: one of mecab, komoran, okt, or nltk
# lang: 'ko' or 'en'
pipeline = ptm.Pipeline(
    ptm.keyword.TextRankExtractor(pos_tagger_name='mecab',
                                  mecab_path=path,
                                  max=5,
                                  lang='ko',
                                  stopwords=_stopwords,
                                  combined_keywords=True))

corpus = ptm.CorpusFromFile('./data/sampleKor.txt')
result = pipeline.processCorpus(corpus)
print('== Keyword Extraction ==')
print(result)
print()

from sklearn.datasets import fetch_20newsgroups
ng20 = fetch_20newsgroups(subset='all',
                          remove=('headers', 'footers', 'quotes'))

print("XXXX " + str(ng20.data[0]))
Example #2
    def preprocessing(self,
                      mode,
                      path,
                      stopword_file,
                      files,
                      is_directory=False,
                      doc_index=-1,
                      max=-1):
        util = ptm.Utility()
        # mode is one of: unfiltered, filtered, jamo_split_unfiltered, jamo_split_filtered, simple
        corpus = []
        if mode == 'unfiltered':
            # path = '/usr/local/lib/mecab/dic/mecab-ko-dic'
            pipeline = ptm.Pipeline(
                ptm.splitter.KoSentSplitter(), ptm.tokenizer.MeCab(path),
                ptm.lemmatizer.SejongPOSLemmatizer(),
                ptm.helper.SelectWordOnly(),
                ptm.helper.StopwordFilter(file=stopword_file))

            for a_file in files:
                if is_directory == True and max == -1:
                    corpus += ptm.CorpusFromDirectory(a_file).docs
                elif is_directory == False and doc_index != -1 and max == -1:
                    corpus += ptm.CorpusFromFieldDelimitedFile(
                        a_file, doc_index).docs
                elif is_directory == False and doc_index == -1 and max == -1:
                    corpus += ptm.CorpusFromFile(a_file).docs
                elif is_directory == False and max > 0:
                    count = 0
                    docs = []
                    for line in open(a_file, encoding='utf-8'):
                        if doc_index != -1:
                            line = line.split()[doc_index]
                        if len(line) < 1:
                            continue
                        toks = line.split()
                        if len(toks) > 10:
                            docs.append(line)
                            count += 1
                        if count > 0 and count % 10000 == 0:
                            print('processing... ' + str(count))
                        if max < count:
                            break

                    corpus = ptm.Corpus(docs)

            if (type(corpus) != list and len(corpus.docs) > 0) or (
                    type(corpus) == list and len(corpus) > 0):
                result = pipeline.processCorpus(corpus)
                for doc in result:
                    document = []
                    for sent in doc:
                        for word in sent:
                            document.append(word)
                    self.documents.append(document)

        elif mode == 'filtered':
            pipeline = ptm.Pipeline(ptm.tokenizer.Word())
            # corpus = ptm.CorpusFromFile('/Data/ko_sns_comments/naver_comments15_16_filtered.txt')
            for a_file in files:
                if is_directory == True and max == -1:
                    corpus += ptm.CorpusFromDirectory(a_file).docs
                elif is_directory == False and doc_index != -1 and max == -1:
                    corpus += ptm.CorpusFromFieldDelimitedFile(
                        a_file, doc_index).docs
                elif is_directory == False and doc_index == -1 and max == -1:
                    corpus += ptm.CorpusFromFile(a_file).docs
                elif is_directory == False and max > 0:
                    count = 0
                    docs = []
                    for line in open(a_file, encoding='utf-8'):
                        if doc_index != -1:
                            line = line.split()[doc_index]
                        if len(line) < 1:
                            continue
                        toks = line.split()
                        if len(toks) > 10:
                            docs.append(line)
                            count += 1
                        if count > 0 and count % 10000 == 0:
                            print('processing... ' + str(count))
                        if max < count:
                            break

                    corpus = ptm.Corpus(docs)

            self.documents = pipeline.processCorpus(corpus)

        elif mode == 'jamo_split_unfiltered':
            # documents = LineSentence(datapath('/Data/ko_sns_comments/naver_comments15_16_filtered.txt'))
            pipeline = ptm.Pipeline(
                ptm.splitter.KoSentSplitter(), ptm.tokenizer.MeCab(path),
                ptm.lemmatizer.SejongPOSLemmatizer(),
                ptm.helper.SelectWordOnly(),
                ptm.helper.StopwordFilter(file=stopword_file))

            for a_file in files:
                if is_directory == True and max == -1:
                    corpus += ptm.CorpusFromDirectory(a_file).docs
                elif is_directory == False and doc_index != -1 and max == -1:
                    corpus += ptm.CorpusFromFieldDelimitedFile(
                        a_file, doc_index).docs
                elif is_directory == False and doc_index == -1 and max == -1:
                    corpus += ptm.CorpusFromFile(a_file).docs
                elif is_directory == False and max > 0:
                    count = 0
                    docs = []
                    for line in open(a_file, encoding='utf-8'):
                        if doc_index != -1:
                            line = line.split()[doc_index]
                        if len(line) < 1:
                            continue
                        toks = line.split()
                        if len(toks) > 10:
                            docs.append(line)
                            count += 1
                        if count > 0 and count % 10000 == 0:
                            print('processing... ' + str(count))
                        if max < count:
                            break

                    corpus = ptm.Corpus(docs)

            if (type(corpus) != list and len(corpus.docs) > 0) or (
                    type(corpus) == list and len(corpus) > 0):
                result = pipeline.processCorpus(corpus)
                for doc in result:
                    for sent in doc:
                        _sent = ''
                        for word in sent:
                            _sent += word + ' '
                        _sent = _sent.strip()
                        _sent = util.jamo_sentence(_sent)
                        toks = _sent.split()
                        if len(toks) > 10:
                            self.documents.append(toks)

        elif mode == 'jamo_split_filtered':
            # documents = LineSentence(datapath('/Data/ko_sns_comments/naver_comments15_16_filtered.txt'))
            pipeline = ptm.Pipeline(ptm.tokenizer.Word())
            for a_file in files:
                if is_directory == True and max == -1:
                    corpus += ptm.CorpusFromDirectory(a_file).docs
                elif is_directory == False and doc_index != -1 and max == -1:
                    corpus += ptm.CorpusFromFieldDelimitedFile(
                        a_file, doc_index).docs
                elif is_directory == False and doc_index == -1 and max == -1:
                    corpus += ptm.CorpusFromFile(a_file).docs
                elif is_directory == False and max > 0:
                    count = 0
                    docs = []
                    for line in open(a_file, encoding='utf-8'):
                        if doc_index != -1:
                            line = line.split()[doc_index]
                        if len(line) < 1:
                            continue
                        toks = line.split()
                        if len(toks) > 10:
                            docs.append(line)
                            count += 1
                        if count > 0 and count % 10000 == 0:
                            print('processing... ' + str(count))
                        if max < count:
                            break

                    corpus = ptm.Corpus(docs)

            if (type(corpus) != list and len(corpus.docs) > 0) or (
                    type(corpus) == list and len(corpus) > 0):
                result = pipeline.processCorpus(corpus)
                for doc in result:
                    _sent = ''
                    for word in doc:
                        _sent += word + ' '
                    _sent = _sent.strip()
                    _sent = util.jamo_sentence(_sent)
                    toks = _sent.split()
                    if len(toks) > 10:
                        self.documents.append(toks)

        elif mode == 'simple':
            # documents = LineSentence(datapath('/Data/ko_sns_comments/naver_comments15_16_filtered.txt'))
            count = 0
            for line in open(files[0], encoding='utf-8'):
                if doc_index != -1:
                    line = line.split()[doc_index]
                toks = line.split()
                if len(toks) > 10:
                    self.documents.append(toks)
                    count += 1

                if count > 0 and count % 10000 == 0:
                    print('processing... ' + str(count))

        print('Document size for the total dataset: ' +
              str(len(self.documents)))
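A usage sketch for the method above, assuming it is defined on a class that initializes self.documents to an empty list; the class name CorpusBuilder below is illustrative, not part of the original:

import pyTextMiner as ptm

class CorpusBuilder:
    def __init__(self):
        self.documents = []

    # ... the preprocessing() method shown above goes here ...

builder = CorpusBuilder()
builder.preprocessing(mode='simple',
                      path='C:\\mecab\\mecab-ko-dic',
                      stopword_file='./stopwords/stopwordsKor.txt',
                      files=['./data/donald.txt'],
                      doc_index=-1)
print(len(builder.documents))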
Example #3
import pyTextMiner as ptm

#corpus = ptm.CorpusFromFieldDelimitedFile('./data/donald.txt', 2)
corpus = ptm.CorpusFromFile('./data/134963_norm.txt')
# import nltk
# nltk.download()
# Now that the text is split into words, we can remove stopwords. We use ptm.helper.StopwordFilter to drop unnecessary words.
# Then we add ptm.stemmer.Porter() at the end of the pipeline to extract word stems.
# Try modifying the code to use ptm.stemmer.Lancaster() as well; comparing how the Lancaster stemmer differs from the Porter stemmer is instructive.
pipeline = ptm.Pipeline(
    ptm.splitter.NLTK(), ptm.tokenizer.Komoran(), ptm.helper.POSFilter('NN*'),
    ptm.helper.SelectWordOnly(),
    ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))
result = pipeline.processCorpus(corpus)
print(result)
print()
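The comments above mention appending a stemmer to the pipeline; a minimal sketch on the English sample file used later in these examples, assuming ptm.stemmer.Porter() and ptm.stemmer.Lancaster() exist as the comments state and can be placed as the final pipeline step:

stem_pipeline = ptm.Pipeline(
    ptm.splitter.NLTK(), ptm.tokenizer.Word(),
    ptm.helper.StopwordFilter(file='./stopwords/stopwordsEng.txt'),
    ptm.stemmer.Porter())  # swap in ptm.stemmer.Lancaster() to compare the two stemmers
stem_result = stem_pipeline.processCorpus(ptm.CorpusFromFile('./data/sampleEng.txt'))
print(stem_result)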

documents = []
for doc in result:
    document = ''
    for sent in doc:
        document += " ".join(sent) + ' '
    documents.append(document.strip())

#2016-10-20.txt
corpus1 = ptm.CorpusFromFile('./data/2016-10-20.txt')
noun_extractor = ptm.noun_extractor.NounExtractionKorean(corpus1)
sent = '두바이월드센터시카고옵션거래소'
result = noun_extractor(sent)
print(result)
Example #4
import pyTextMiner as ptm

corpus = ptm.CorpusFromFile('./data/2016-10-20.txt')
pmi = ptm.pmi.PMICalculator(corpus)
sent = '아이오아이'
result = pmi(sent)
print(result)
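The internals of ptm.pmi.PMICalculator are not shown here. For reference, the pointwise mutual information of two terms x and y is log(p(x, y) / (p(x) * p(y))); a small illustrative helper (not part of pyTextMiner) that computes it from raw co-occurrence counts:

import math

def pmi_from_counts(count_xy, count_x, count_y, total):
    # PMI(x, y) = log( p(x, y) / (p(x) * p(y)) )
    p_xy = count_xy / total
    p_x = count_x / total
    p_y = count_y / total
    return math.log(p_xy / (p_x * p_y))

print(pmi_from_counts(10, 50, 40, 1000))  # co-occurring more often than chance -> positive PMI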
Example #5
import os, subprocess

from sklearn.feature_extraction.text import CountVectorizer

import pyTextMiner as ptm

mecab_path = 'C:\\mecab\\mecab-ko-dic'
pipeline = ptm.Pipeline(
    ptm.splitter.NLTK(), ptm.tokenizer.MeCab(mecab_path),
    ptm.helper.POSFilter('NN*'), ptm.helper.SelectWordOnly(),
    ptm.helper.StopwordFilter(file='./stopwords/stopwordsKor.txt'))

corpus = ptm.CorpusFromFile('./data/134963_norm.txt')
result = pipeline.processCorpus(corpus)

with open('processed_134963.txt', 'w', encoding='utf-8') as f_out:
    for doc in result:
        for sent in doc:
            new_sent = ' '.join(sent).strip()
            f_out.write(new_sent + "\n")

file_path = 'D:\\python_workspace\\pyTextMiner\\processed_134963.txt'
co = ('D:\\python_workspace\\pyTextMiner\\external_programs\\ccount.exe'
      ' --input ' + file_path + ' --threshold 2 --output co_result.txt')

subprocess.run(co, shell=True)
co_results = {}
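CountVectorizer is imported above but never used in this snippet. A short sketch of how the processed sentences could be turned into a term-document matrix with it (the variable names are illustrative):

sentences = []
for doc in result:
    for sent in doc:
        sentences.append(' '.join(sent))

vectorizer = CountVectorizer(max_features=5000)  # cap the vocabulary size
term_doc_matrix = vectorizer.fit_transform(sentences)
print(term_doc_matrix.shape)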
Example #6
import io
import nltk
from nltk.corpus import sentiwordnet as swn

import pyTextMiner as ptm


class EnglishDictionarySentimentAnalyzer:
    def __init__(self):
        self.name = 'EnglishDictionarySentimentAnalyzer'

    def createDictionary(self):
        nltk.download('sentiwordnet')


if __name__ == '__main__':

    corpus = ptm.CorpusFromFile('./data/sampleEng.txt')
    pipeline = ptm.Pipeline(
        ptm.splitter.NLTK(), ptm.tokenizer.Word(),
        ptm.helper.StopwordFilter(file='./stopwords/stopwordsEng.txt'),
        ptm.tagger.NLTK(), ptm.lemmatizer.WordNet())

    result = pipeline.processCorpus(corpus)

    EnglishDictionarySentimentAnalyzer().createDictionary()

    for doc in result:
        for sent in doc:
            for _str in sent:
                word = _str[0]  # token text
                tag = _str[1]   # POS tag assigned by ptm.tagger.NLTK()
                pos = ''
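The loop above is cut off before any scoring happens. One way the (token, POS tag) pairs could be looked up in SentiWordNet; the Penn-Treebank-to-WordNet tag mapping and the choice of the first synset are assumptions, not part of the original:

def to_wordnet_pos(tag):
    # map Penn Treebank tags to the single-letter codes SentiWordNet expects
    for prefix, code in (('J', 'a'), ('V', 'v'), ('R', 'r'), ('N', 'n')):
        if tag.startswith(prefix):
            return code
    return None

def sentiwordnet_score(word, tag):
    # requires nltk.download('sentiwordnet') and nltk.download('wordnet')
    pos = to_wordnet_pos(tag)
    if pos is None:
        return None
    synsets = list(swn.senti_synsets(word, pos))
    if not synsets:
        return None
    s = synsets[0]  # take the most common sense
    return s.pos_score(), s.neg_score(), s.obj_score()

print(sentiwordnet_score('good', 'JJ'))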
Example #7
        result = lsi.print_topics(5, 20)
        for a_topic in result:
            print("LSI results " + str(a_topic))

        # create a double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi
        corpus_lsi = lsi[corpus_tfidf]
        # for doc in corpus_lsi:  # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
        #     print(doc)


if __name__ == '__main__':
    import pyTextMiner as ptm
    import io
    import nltk

    corpus = ptm.CorpusFromFile('../donald.txt')
    pipeline = ptm.Pipeline(
        ptm.splitter.NLTK(), ptm.tokenizer.Komoran(),
        ptm.helper.POSFilter('NN*'), ptm.helper.SelectWordOnly(),
        ptm.helper.StopwordFilter(file='../stopwordsKor.txt'),
        ptm.ngram.NGramTokenizer(3))

    result = pipeline.processCorpus(corpus)

    id = 0
    text_data = []
    for doc in result:
        new_doc = []
        for sent in doc:
            for _str in sent:
                if len(_str) > 0:
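The fragment above uses lsi and corpus_tfidf without showing how they were built. A sketch of the usual gensim construction from text_data, assuming gensim is the library in use and that text_data ends up as a list of token lists:

from gensim import corpora, models

dictionary = corpora.Dictionary(text_data)  # token -> id mapping
bow_corpus = [dictionary.doc2bow(tokens) for tokens in text_data]
tfidf = models.TfidfModel(bow_corpus)       # bow -> tfidf
corpus_tfidf = tfidf[bow_corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=5)
print(lsi.print_topics(5, 20))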
Example #8
    mecab_path = 'C:\\mecab\\mecab-ko-dic'

    # stopwords file path
    stopwords = '../stopwords/stopwordsKor.txt'
    # train documents input path
    input_path = '../data/donald.txt'
    # output base directory
    output_base_dir = './tmp'

    pipeline = ptm.Pipeline(ptm.splitter.KoSentSplitter(),
                            ptm.tokenizer.MeCab(mecab_path),
                            ptm.lemmatizer.SejongPOSLemmatizer(),
                            ptm.helper.SelectWordOnly(),
                            ptm.helper.StopwordFilter(file=stopwords))

    corpus = ptm.CorpusFromFile(input_path)
    documents = []
    result = pipeline.processCorpus(corpus)
    i = 0
    for doc in result:
        document = []
        for sent in doc:
            for word in sent:
                document.append(word)
        documents.append(TaggedDocument(document, [i]))
        i += 1

    #--epochs 40 --vocab-min-count 10 data/stopwords_german.txt dewiki-preprocessed.txt /tmp/models/doc2vec-dewiki

    doc2vec = Doc2VecTrainer()
    logging.basicConfig(format='[%(asctime)s] [%(levelname)s] %(message)s',
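Doc2VecTrainer is not shown in this snippet. A minimal sketch of training directly with gensim on the TaggedDocument list built above, using the epoch and min-count values from the comment; the vector size and output file name are illustrative:

import os
from gensim.models.doc2vec import Doc2Vec

os.makedirs(output_base_dir, exist_ok=True)
model = Doc2Vec(documents,        # the TaggedDocument list built above
                vector_size=100,  # illustrative; not taken from the original
                min_count=10,     # --vocab-min-count 10
                epochs=40)        # --epochs 40
model.save(os.path.join(output_base_dir, 'doc2vec-donald.model'))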