def setUp(self):
    nltk.download("gutenberg")
    self.docs = []
    for fid in gutenberg.fileids():
        f = gutenberg.open(fid)
        self.docs.append(f.read())
        f.close()
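# A minimal sketch of how this fixture might be exercised; the test name and
# assertions below are assumptions, not part of the original suite.
def test_all_gutenberg_files_loaded(self):
    # setUp should have read one document per corpus file id.
    self.assertEqual(len(self.docs), len(gutenberg.fileids()))
    self.assertTrue(all(len(doc) > 0 for doc in self.docs))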
from konlpy.corpus import kolaw, kobill
from nltk.corpus import brown, gutenberg
from nltk.tokenize import sent_tokenize, word_tokenize, TweetTokenizer

c = kolaw.open(kolaw.fileids()[0]).read()  # Open the first file through its file pointer
print(len(c))               # The text contains 18,884 characters.
print(len(c.split()))       # Count word segments (eojeol); split on whitespace only, so duplicates are counted: 4,178. A full-scale corpus usually provides on the order of 1 to 10 million segments.
print(len(c.splitlines()))  # Count how many line breaks the text contains

d = kobill.open(kobill.fileids()[0]).read()
print(d.splitlines()[:2])   # Print only the first two elements

# -------------------------------------------------------------------------------------------------------------------------------------------
# ------------------------------- Trying out the NLTK corpora (brown, gutenberg) -------------------------------------------------------------
print(len(brown.fileids()))
a = brown.open(brown.fileids()[0]).read()
print(len(a), len(a.split()), len(a.splitlines()), a.splitlines()[:3])
b = gutenberg.open(gutenberg.fileids()[0]).read()
print(len(b), len(b.split()), len(b.splitlines()), b.splitlines()[:3])

# ------------------------------------------------------------------------------------------------------------------------
# ------------------------------------------- Trying out tokenization ---------------------------------------------------
s = sent_tokenize(b)  # Officially supports ten languages, but Korean, Japanese, and Chinese are not among them. Splits on punctuation.
print(len(s), len(b.splitlines()))
print(s[:3], b.splitlines()[:3])
print(sent_tokenize("Hello world, Hello world! Hello........?"))
print(sent_tokenize("집에 가고?싶다....."))  # A sentence boundary is recognized only when punctuation is followed by a space; without the space, everything is treated as one sentence.

print(word_tokenize(d))  # Tokenized into word-like units, though not strictly by whitespace
print(word_tokenize("10분만 버티자 :) "))  # ':)' is an important way to express emotion in short texts such as tweets, so it must not be split into pieces.
# TweetTokenizer is provided for this; its behavior differs from word_tokenize, and it must be instantiated before use.
print(TweetTokenizer().tokenize("10분만 버티자 :)"))  # ':)' is now kept as a single token.
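# sent_tokenize picks its pretrained Punkt model through the `language`
# parameter; a minimal sketch (the German sample sentence is an assumption
# for illustration):
print(sent_tokenize("Hallo Welt. Wie geht es dir?", language="german"))
# expected: ['Hallo Welt.', 'Wie geht es dir?']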
from nltk import sent_tokenize, word_tokenize

# Stopwords
from nltk.corpus import stopwords

# K-fold cross validation
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

# Import the SVM model
from sklearn.svm import SVC

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Import 3 Gutenberg books
from nltk.corpus import gutenberg

files_en = gutenberg.fileids()  # Get file ids
emma_en = gutenberg.open('austen-emma.txt').read()
sense_en = gutenberg.open('austen-sense.txt').read()
brown_en = gutenberg.open('chesterton-brown.txt').read()


def tokenize(data):
    tokenized_word = word_tokenize(data)
    tokenized_word = [w for w in tokenized_word if w.isalpha()]
    tokenized_word = [w.lower() for w in tokenized_word]
    stop_words = stopwords.words('english')
    tokenized_word = [
        w for w in tokenized_word if w not in stop_words and len(w) >= 3
    ]
    tokenized_word_list = []
    start = 0
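# The body of tokenize() is cut off above; assuming it eventually returns the
# filtered token list, a minimal usage sketch for the three books would be:
emma_tokens = tokenize(emma_en)
sense_tokens = tokenize(sense_en)
brown_tokens = tokenize(brown_en)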
from nltk.corpus import gutenberg
import matplotlib.pyplot as plt
# %matplotlib inline

bible = gutenberg.open('bible-kjv.txt')
bible = bible.readlines()
res = bible[:5]
print(res)

# sentence = "I love coding on python, because it gives me an enormous ability to use the Data processing!"
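# readlines() returns raw physical lines; sentence tokenization yields logical
# sentences instead. A minimal sketch of the contrast (assumes the punkt
# tokenizer data has been downloaded):
from nltk import sent_tokenize
text = gutenberg.open('bible-kjv.txt').read()
print(sent_tokenize(text)[:5])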
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


# The top of this function was cut off; the signature, the vectorizer, and the
# string-joining step are reconstructed from the call `filter_by_idf(docs, 10, 90)`
# below and from how `documents` is consumed.
def filter_by_idf(documents, lower_idf_limit, upper_idf_limit):
    # Rebuild one whitespace-joined string per tokenized document so the
    # vectorizer can consume it.
    docs = [' '.join(w for sent in doc for w in sent) for doc in documents]
    tfidf = TfidfVectorizer()
    tfidf.fit(docs)
    lower_idf = np.percentile(tfidf.idf_, lower_idf_limit)
    upper_idf = np.percentile(tfidf.idf_, upper_idf_limit)

    # Pick out the vocab to be dropped
    drop_vocab = set(
        term for term, idx in tfidf.vocabulary_.items()
        if tfidf.idf_[idx] < lower_idf or tfidf.idf_[idx] >= upper_idf)

    # Filter the documents
    new_docs = []
    for doc in documents:
        _new_doc = []
        for sent in doc:
            _new_sent = [w for w in sent if w not in drop_vocab]
            if len(_new_sent) == 0:
                continue
            _new_doc.append(_new_sent)
        new_docs.append(_new_doc)
    return new_docs


if __name__ == '__main__':
    nltk.download("gutenberg")
    from nltk.corpus import gutenberg

    docs = []
    for fid in gutenberg.fileids():
        f = gutenberg.open(fid)
        docs.append(f.read())
        f.close()

    # tokenize_document is defined elsewhere in the original file.
    docs = [tokenize_document(d) for d in docs]
    docs = filter_by_idf(docs, 10, 90)
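    # Why percentiles: np.percentile turns the limits 10 and 90 into concrete
    # IDF thresholds, dropping near-ubiquitous terms (low IDF) and extremely
    # rare ones (high IDF). A toy illustration with made-up IDF values:
    toy_idf = np.array([1.0, 1.2, 2.5, 3.0, 4.8, 7.9])
    lo, hi = np.percentile(toy_idf, 10), np.percentile(toy_idf, 90)
    print(lo, hi, [(v, lo <= v < hi) for v in toy_idf])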
# coding=UTF-8
# import nltk
# nltk.download()
import csv
import pandas as pd
from nltk.corpus import gutenberg  # Docs from project gutenberg.org
# from scrapy.item import Field

files_en = gutenberg.fileids()  # Get file ids
doc_en = gutenberg.open('C:\\Python27\\NLPK\\pg158.txt').read()

from nltk import regexp_tokenize
pattern = r'''(?x)
    (?:[A-Z]\.)+          # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*        # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
    | \.\.\.              # ellipsis
    | [][.,;"'?():-_`]    # these are separate tokens
'''
tokens_en = regexp_tokenize(doc_en, pattern)

import nltk
en = nltk.Text(tokens_en)

# CSV Field
import csv
import nltk
import os.path
import sys

### read csv
from nltk.corpus import gutenberg


def get_gutenberg():
    return gutenberg.open(gutenberg.fileids()[0]).read()
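# A minimal usage sketch; in NLTK's distribution the first Gutenberg file id
# is 'austen-emma.txt', so this prints the opening of Emma:
text = get_gutenberg()
print(text[:75])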
"""
BA_04192018 Text Mining
@author: Justin
"""
import nltk
nltk.download()
# The stopwords collection gathers the words that are usually excluded in text mining.
# NLTK lemmatization is based on WordNet.
from nltk.corpus import gutenberg

ids = gutenberg.fileids()  # These texts are old, so there are no license issues :)
text = gutenberg.open(ids[0]).read()  # Let's start the analysis with Emma.

nltk.download('punkt')
from nltk import word_tokenize

tokens = word_tokenize(text)
tokens[:100]

en = nltk.Text(tokens)
# tokens = en.tokens
# Wrapping the text in nltk.Text splits it into individual tokens.
dic = en.vocab()
en.plot(50)

lower_tokens = [x.lower() for x in tokens]  # Lower-case every token
en_lw = nltk.Text(lower_tokens)
dic_lw = en_lw.vocab()
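# en.vocab() returns an nltk FreqDist; a small sketch of inspecting it
# (the exact counts depend on the loaded text):
print(dic.most_common(10))  # ten most frequent tokens with their counts
print(dic['Emma'])          # frequency of one specific token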
from nltk.corpus import gutenberg  # Docs from project gutenberg.org
files_en = gutenberg.fileids()  # Get file ids
doc_en = gutenberg.open('austen-emma.txt').read()

from nltk import regexp_tokenize
pattern = r'''(?x)
    (?:[A-Z]\.)+          # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*        # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
    | \.\.\.              # ellipsis
    | [][.,;"'?():-_`]    # these are separate tokens
'''
tokens_en = regexp_tokenize(doc_en, pattern)

# nltk.download('gutenberg')
import nltk
en = nltk.Text(tokens_en)

print(len(en.tokens))       # returns number of tokens (document length)
print(len(set(en.tokens)))  # returns number of unique tokens
en.vocab()
# en.plot(50)

print(doc_en.count('Emma'))
print(tokens_en.count('Emma'))
print(en.count('Emma'))  # Counts occurrences

# en.dispersion_plot(['Emma', 'Frank', 'Jane'])
# en.concordance('Emma', lines=5)

# Find similar words:
# en.similar('Emma')
# en.similar('Frank')
# en.collocations()
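# A quick check of what the verbose pattern actually matches; the sample
# sentence is an assumption for illustration, not from the corpus:
print(regexp_tokenize("The U.S.A. bill cost $12.40, about 82%... state-of-the-art.", pattern))
# abbreviations, hyphenated compounds, currency, percentages, and ellipses
# each come out as single tokens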
# http://www.lucypark.kr/courses/2015-ba/text-mining.html
from nltk.corpus import gutenberg
from nltk import regexp_tokenize
import nltk

# Download corpora (just the first time!!)
nltk.download('gutenberg')
nltk.download('maxent_treebank_pos_tagger')
nltk.download("reuters")

# View
files_en = gutenberg.fileids()
doc_en = gutenberg.open('austen-emma.txt').read()

# Tokenize
pattern = r'''(?x)
    (?:[A-Z]\.)+          # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*        # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
    | \.\.\.              # ellipsis
    | [][.,;"'?():-_`]    # these are separate tokens
'''
tokens_en = regexp_tokenize(doc_en, pattern)
en = nltk.Text(tokens_en)

print(len(en.tokens))
print(len(set(en.tokens)))
en.vocab()
en.plot(50)

# Count
en.count('Emma')
import re
import nltk
import pyLDAvis
import pyLDAvis.gensim
from sklearn import mixture
from copy import deepcopy
from sklearn.metrics.cluster import adjusted_rand_score
from gensim import corpora, models
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('gutenberg')
from nltk.corpus import gutenberg

files_en = gutenberg.fileids()
selected_titles = ['3623-8.txt', '19528-8.txt', '24681-8.txt', '29444-8.txt', 'milton-paradise.txt']

# Downloading and opening 5 books
text_1 = gutenberg.open('3623-8.txt').read()
text_2 = gutenberg.open('19528-8.txt').read()
text_3 = gutenberg.open('24681-8.txt').read()
text_4 = gutenberg.open('29444-8.txt').read()
text_5 = gutenberg.open('milton-paradise.txt').read()

# Strip everything except letters (removes numbers as well as punctuation)
removeNum1 = re.sub('[^a-zA-Z]', ' ', text_1)
removeNum2 = re.sub('[^a-zA-Z]', ' ', text_2)
removeNum3 = re.sub('[^a-zA-Z]', ' ', text_3)
removeNum4 = re.sub('[^a-zA-Z]', ' ', text_4)
removeNum5 = re.sub('[^a-zA-Z]', ' ', text_5)

# Tokenizing the data
from nltk import regexp_tokenize
pattern = r'''(?x)
    (?:[A-Z]\.)+          # abbreviations, e.g. U.S.A.
    | \w+(?:[-]\w+)*      # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
    | \.\.\.              # ellipsis
    | [][.,;"'?():-_`]    # these are separate tokens
'''
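# The natural next step (an assumption; the original snippet ends at the
# pattern definition) is to tokenize each cleaned text with the pattern:
tokens_1 = regexp_tokenize(removeNum1, pattern)
tokens_2 = regexp_tokenize(removeNum2, pattern)
print(tokens_1[:10])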
from nltk.corpus import gutenberg
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# In[115]:

text_1 = gutenberg.open('10985.txt').read()  # The Infant System by Samuel Wilderspin
text_2 = gutenberg.open('42547.txt').read()  # The Art and Practice of Silver Printing by Abney and Robinson
text_3 = gutenberg.open('10773.txt').read()  # Ancient and Modern Physics by Thomas Edgar Willson
text_4 = gutenberg.open('51397.txt').read()  # People Soup by Alan Arkin
text_5 = gutenberg.open('17699.txt').read()  # The Evolution of Love by Emil Lucka
text_6 = gutenberg.open('29420.txt').read()  # American Rural Highways by T. R. Agg
text_7 = gutenberg.open('389.txt').read()    # The Great God Pan by Arthur Machen

# In[116]:

class Preprocess(