def main():  # create bits
    from nltk.corpus import webtext
    overheard_sents = webtext.sents('overheard.txt')
    # 'conversation' is a project-local module that is not shown in this snippet.
    convo1 = conversation.Conversation(overheard_sents[0:3], "white_and_asian")
    print("Participants:", convo1.participants)
import string

from nltk.corpus import brown, reuters, webtext


def create_corpus():
    """Remove punctuation from the sentences of the corpus."""
    # Combine sentences from the Reuters, webtext, and Brown corpora.
    rr_corpus = reuters.sents() + webtext.sents() + brown.sents()
    punctuations = set(string.punctuation)
    cleaned_corpus = []
    for idx in range(len(rr_corpus)):
        cleaned_corpus.append(
            [w for w in rr_corpus[idx] if w not in punctuations])
    return cleaned_corpus
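A minimal usage sketch, not part of the original snippet; the download calls reflect an assumption about the local NLTK setup.

import nltk

for pkg in ('reuters', 'webtext', 'brown'):
    nltk.download(pkg)          # assumed one-time corpus setup

cleaned = create_corpus()
print(len(cleaned), "sentences after punctuation removal")
print(cleaned[0][:10])          # peek at the first cleaned sentence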
def process_webtext():
    print('webtext')
    from nltk.corpus import webtext
    count = 0
    word = 'bank'
    sen1 = 'depository_financial_institution.n.01'
    sen2 = 'bank.n.01'
    file_name = 'data/bank_webtext_tmp.txt'
    for f in webtext.fileids():
        sents = webtext.sents(f)
        for i in range(len(sents)):
            sent = sents[i]
            if word in sent:
                # appendToFile and sentToStr are helpers defined elsewhere
                # in the original project.
                appendToFile(file_name, sentToStr(sent, '0'))
                count = count + 1
    print(count)
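appendToFile and sentToStr are project helpers that are not shown; hypothetical stand-ins consistent with how they are called above (one sentence per line, with a label column) might look like this.

def sentToStr(sent, label):
    # Hypothetical: join the token list and attach the label ('0' above).
    return ' '.join(sent) + '\t' + label + '\n'


def appendToFile(file_name, text):
    # Hypothetical: append one line of text to the output file.
    with open(file_name, 'a', encoding='utf-8') as f:
        f.write(text)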
import nltk
from nltk.corpus import brown, gutenberg, inaugural, reuters, webtext


def get_default_sentences() -> list:
    # detok_sentences joins token lists back into plain strings (see the sketch below).
    nltk.download('brown')
    brown_tokenized_sentences = brown.sents()
    brown_sentences = detok_sentences(brown_tokenized_sentences)
    nltk.download('gutenberg')
    nltk.download('punkt')
    gutenberg_tokenized_sentences = gutenberg.sents()
    gutenberg_sentences = detok_sentences(gutenberg_tokenized_sentences)
    nltk.download('reuters')
    reuters_tokenized_sentences = reuters.sents()
    reuters_sentences = detok_sentences(reuters_tokenized_sentences)
    nltk.download('webtext')
    webtext_tokenized_sentences = webtext.sents()
    webtext_sentences = detok_sentences(webtext_tokenized_sentences)
    nltk.download('inaugural')
    inaugural_tokenized_sentences = inaugural.sents()
    inaugural_sentences = detok_sentences(inaugural_tokenized_sentences)
    return (brown_sentences + gutenberg_sentences + reuters_sentences +
            webtext_sentences + inaugural_sentences)
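detok_sentences is used above but not defined in this snippet; a minimal sketch consistent with the fragment that appears later in this section (it simply joins tokens with spaces):

def detok_sentences(tokenized_sentences):
    # Naive detokenizer: join each token list back into a single string.
    sentences = []
    for tok_sent in tokenized_sentences:
        sentences.append(' '.join(tok_sent).strip())
    return sentences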
def webtext():
    from nltk.corpus import webtext as webtext
    from nltk.corpus import nps_chat

    # list comprehension version
    file_ids = [fileid for fileid in webtext.fileids()]
    chat_file_ids = [fileid for fileid in nps_chat.fileids()]
    pirates = webtext.raw('pirates.txt')
    pirates_words = len(webtext.words('pirates.txt'))
    pirates_sents = len(webtext.sents('pirates.txt'))
    uniqs = len(set([w.lower() for w in webtext.words('pirates.txt')]))
    # lexical_div and render_template (Flask) are defined/imported elsewhere
    # in the original application.
    lexical_diversity = lexical_div(uniqs, pirates_words)
    # import nltk.book as book
    # text1 = book.text1
    # pirates = webtext.raw('pirates.txt')
    return render_template('webtext.html',
                           file_ids=file_ids,
                           chat_file_ids=chat_file_ids,
                           pirates=pirates)
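lexical_div is called above but not defined in this view; a sketch matching the definition that appears in a later snippet (total tokens divided by unique tokens):

def lexical_div(un, total):
    # Lexical diversity as used later in this section: total / unique.
    return total / un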
import matplotlib.pyplot as plt

''' Train Tagger '''
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.corpus import treebank
from nltk.corpus import webtext as web  # 'web' is used below but was not imported in the original

train = treebank.tagged_sents()[:10000]
t0 = DefaultTagger('NN')
t1 = UnigramTagger(train, backoff=t0)
t2 = BigramTagger(train, backoff=t1)

''' Initialize '''
my_corp = web.sents(fileids='firefox.txt')
sent_count = 0
ques_count = 0
All_count = 1
NN_count = 0
NNS_count = 0
NNP_count = 0
VB_count = 0
VBN_count = 0
VBG_count = 0
VBD_count = 0
VBZ_count = 0
JJ_count = 0
WP_count = 0
NN_Num = 0
NNS_Num = 0
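The loop that fills these counters is not part of the excerpt; one plausible shape, building on the tagger and counters above (the question heuristic and the intermediate tag dictionary are assumptions, not the original logic):

# Hypothetical counting loop over the webtext sentences.
tag_counts = {}
for sent in my_corp:
    sent_count += 1
    if '?' in sent:                      # crude question detector (assumption)
        ques_count += 1
    for word, tag in t2.tag(sent):
        All_count += 1
        tag_counts[tag] = tag_counts.get(tag, 0) + 1

NN_count = tag_counts.get('NN', 0)
NNS_count = tag_counts.get('NNS', 0)
VB_count = tag_counts.get('VB', 0)
JJ_count = tag_counts.get('JJ', 0)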
import logging
import time

# 'interface' (and pmi_with_parallel / pmi_with_threading below) are defined
# elsewhere in the original script.


def pmi_with_cython(input_corpus):
    logging.debug(msg='With cython is True')
    start = time.time()
    scored_matrix_obj = interface.run_feature_selection(
        input_dict=input_corpus,
        method='pmi',
        n_jobs=-1,
        use_cython=True)
    elapsed_time = time.time() - start
    print("elapsed_time with cython:{} [sec]".format(elapsed_time))


from nltk.corpus import gutenberg
from nltk.corpus import webtext
from nltk.corpus import genesis
from nltk.corpus import abc

abs_corpus = abc.sents()
genesis_corpus = genesis.sents()
web_corpus = webtext.sents()
gutenberg_corpus = gutenberg.sents()
input_corpus = {
    'abs': list(abs_corpus),
    'genesis': list(genesis_corpus),
    'web': list(web_corpus),
    'gutenberg': list(gutenberg_corpus)
}
pmi_with_cython(input_corpus)
pmi_with_parallel(input_corpus)
#pmi_with_threading(input_corpus)
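pmi_with_parallel is called but not defined in this excerpt; a sketch mirroring pmi_with_cython, under the assumption that the same run_feature_selection call accepts use_cython=False for the non-Cython path:

def pmi_with_parallel(input_corpus):
    # Hypothetical counterpart to pmi_with_cython; assumes the library falls
    # back to its parallel pure-Python implementation when use_cython=False.
    logging.debug(msg='With cython is False')
    start = time.time()
    scored_matrix_obj = interface.run_feature_selection(
        input_dict=input_corpus,
        method='pmi',
        n_jobs=-1,
        use_cython=False)
    elapsed_time = time.time() - start
    print("elapsed_time with parallel:{} [sec]".format(elapsed_time))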
# Importing data
import nltk
from nltk.corpus import webtext
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import string

nltk.download('webtext')
wt_sentences = webtext.sents('firefox.txt')
wt_words = webtext.words('firefox.txt')
print(len(wt_sentences))
print(len(wt_words))
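FreqDist and stopwords are imported above but not yet used; a likely next step (a sketch, not part of the original) drops stopwords and non-alphabetic tokens and prints the most frequent remaining words:

nltk.download('stopwords')  # assumed one-time setup for stopwords.words()
stop_words = set(stopwords.words('english'))
filtered = [w.lower() for w in wt_words
            if w.isalpha() and w.lower() not in stop_words]
fdist = FreqDist(filtered)
print(fdist.most_common(10))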
import heapq

''' Train Tagger '''
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.corpus import treebank
from nltk.corpus import webtext as web  # 'web' is used below but was not imported in the original

train = treebank.tagged_sents()[:10000]
t0 = DefaultTagger('NN')
t1 = UnigramTagger(train, backoff=t0)
t2 = BigramTagger(train, backoff=t1)

''' Initialize '''
my_corp = web.sents(fileids='overheard.txt')
sent_count = 0
ques_count = 0
All_count = 1
NN_count = 0
NNS_count = 0
NNP_count = 0
VB_count = 0
VBN_count = 0
VBG_count = 0
VBD_count = 0
VBZ_count = 0
JJ_count = 0
WP_count = 0
NN_Num = 0
NNS_Num = 0
# Using a Windows terminal? Run "CHCP 65001" before this script so that
# Unicode characters can be printed to the terminal.
from nltk.corpus import webtext, stopwords
#from nltk.stem.porter import PorterStemmer
import pickle

stopWords = set(stopwords.words('english'))
corpus = webtext
#stemmer = PorterStemmer()
sentences = [[
    token.lower() for token in s
    if token.isalpha() and token not in stopWords
] for s in webtext.sents()]
#sentences = [[stemmer.stem(token.lower()) for token in s if token.isalpha()] for s in webtext.sents()]
words = set([word for sen in sentences for word in sen])
coocMap = {}


def addToMap(coocMap, w1, w2, val):
    # Nested dict of co-occurrence counts: coocMap[w1][w2] += val.
    if w1 in coocMap:
        if w2 in coocMap[w1]:
            coocMap[w1][w2] += val
        else:
            coocMap[w1][w2] = val
    else:
        coocMap[w1] = {w2: val}
    return coocMap
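The loop that actually fills coocMap is not included; one plausible sketch (the window size and the pickle filename are assumptions) counts co-occurrences within each sentence and saves the map with the pickle module imported above:

WINDOW = 5  # assumed context window

for sen in sentences:
    for i, w1 in enumerate(sen):
        for w2 in sen[i + 1:i + 1 + WINDOW]:
            coocMap = addToMap(coocMap, w1, w2, 1)
            coocMap = addToMap(coocMap, w2, w1, 1)

with open('coocMap.pkl', 'wb') as f:  # hypothetical output path
    pickle.dump(coocMap, f)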
print("Frame_net to sents") state_union_corp_sents = state_union.sents() print('state union to sents') subject_corp_sents = subjectivity.sents() print('Subjectvity to sents') brown_corp_sents = brown.sents() print("Brown corpus to sents") movie_reviews_corp_sents = movie_reviews.sents() print("Movie reviews to sents ") guttenberg_corp_sents = gutenberg.sents() print("Guttenberg to sents") treebank_corb_sents = treebank.sents() print("Freebank to sents") reuters_corp_sents = reuters.sents() print("Reuters to sents") webtext_corp_sents = webtext.sents() print("Webtext to sents") logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) print("Cleaning data ...") discard_punctuation_and_lowercased_sents_condll2007 = [[ word.lower() for word in sent if word not in punctuation ] for sent in conll2007_corp_sents] discard_punctuation_and_lowercased_sents_condll2000 = [[ word.lower() for word in sent if word not in punctuation ] for sent in conll2000_corp_sents]
# Fragment: the tail of detok_sentences() followed by the module-level driver;
# imports (nltk, codecs, and the corpus readers) appear above this excerpt.
        sentences.append(' '.join(tok_sent).strip())
    return sentences


print("Loading sentences.")
nltk.download('brown')
brown_tokenized_sentences = brown.sents()
brown_sentences = detok_sentences(brown_tokenized_sentences)
nltk.download('gutenberg')
nltk.download('punkt')
gutenberg_tokenized_sentences = gutenberg.sents()
gutenberg_sentences = detok_sentences(gutenberg_tokenized_sentences)
nltk.download('reuters')
reuters_tokenized_sentences = reuters.sents()
reuters_sentences = detok_sentences(reuters_tokenized_sentences)
nltk.download('webtext')
webtext_tokenized_sentences = webtext.sents()
webtext_sentences = detok_sentences(webtext_tokenized_sentences)
nltk.download('inaugural')
inaugural_tokenized_sentences = inaugural.sents()
inaugural_sentences = detok_sentences(inaugural_tokenized_sentences)
all_sentences = (brown_sentences + gutenberg_sentences + reuters_sentences +
                 webtext_sentences + inaugural_sentences)

outfile = codecs.open('output.txt', 'w')
for sentence in all_sentences:
    # Undo some of the damage done by naive token joining.
    cleaned_sentence = sentence.replace(" ' s ", "'s ")
    cleaned_sentence = cleaned_sentence.replace("n ' t ", "n't ")
    cleaned_sentence = cleaned_sentence.replace(" ,", ",")
    cleaned_sentence = cleaned_sentence.replace(" .", ".")
    outfile.write('{}\n'.format(cleaned_sentence))
outfile.close()
import logging
import re

from gensim.models import Word2Vec  # needed below; not imported in the original
from gensim.parsing import PorterStemmer
from nltk.corpus import stopwords, webtext  # needed below; not imported in the original

from StemmingHelper import StemmingHelper

global_stemmer = PorterStemmer()
ignored_words = stopwords.words('english')
word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words
my_corpus = []
regex = re.compile('[^a-zA-Z]+')

# Unicode sentences: transform the token lists into cleaned, stemmed strings.
for file in webtext.fileids():
    my_sentences = []
    for sentences in webtext.sents(file):
        words = [
            StemmingHelper.stem(str(regex.sub('', word))).lower()
            for word in sentences
            if StemmingHelper.stem(str(regex.sub('', word))).lower() != ''
        ]
        print(words)
        my_sentences.append(words)
    my_corpus = my_corpus + my_sentences

print(my_corpus)

min_count = 2
size = 50
window = 4
model = Word2Vec(my_corpus, min_count=min_count, size=size, window=window)
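Once trained, the model can be queried; a short sketch, not part of the original (the query word is illustrative, the vocabulary holds stemmed forms, and the size= argument implies a pre-4.0 gensim):

query = StemmingHelper.stem('girl').lower()  # illustrative query term
if query in model.wv:
    print(model.wv.most_similar(query, topn=5))
else:
    print('{} not in vocabulary'.format(query))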
start = time.time()
scored_matrix_obj = interface.run_feature_selection(
    input_dict=input_corpus,
    method='pmi',
    n_jobs=-1,
    use_cython=True
)
elapsed_time = time.time() - start
print("elapsed_time with cython:{} [sec]".format(elapsed_time))

from nltk.corpus import gutenberg
from nltk.corpus import webtext
from nltk.corpus import genesis
from nltk.corpus import abc

abs_corpus = abc.sents()
genesis_corpus = genesis.sents()
web_corpus = webtext.sents()
gutenberg_corpus = gutenberg.sents()
input_corpus = {
    'abs': list(abs_corpus),
    'genesis': list(genesis_corpus),
    'web': list(web_corpus),
    'gutenberg': list(gutenberg_corpus)
}
pmi_with_cython(input_corpus)
pmi_with_parallel(input_corpus)
#pmi_with_threading(input_corpus)
import nltk
from nltk.corpus import webtext
from nltk.corpus import nps_chat
from nltk.corpus import brown

# for fileid in webtext.fileids():
#     print fileid, webtext.raw(fileid)[:65]
# for fileId in nps_chat.fileids():
#     print fileId

pirates = webtext.raw('pirates.txt')
pirates_char = len(webtext.raw('pirates.txt'))
pirates_words = len(webtext.words('pirates.txt'))
pirates_sents = len(webtext.sents('pirates.txt'))
print('pirates_char: ', pirates_char,
      'pirates_words: ', pirates_words,
      'pirates_sents: ', pirates_sents,
      'avg char per word: ', int(pirates_char / pirates_words),
      'avg words per sentence: ', int(pirates_words / pirates_sents))

uniqs = len(set([w.lower() for w in webtext.words('pirates.txt')]))


def lexical_div(un, total):
    return total / un


print('lexical diversity: ', lexical_div(uniqs, pirates_words))

# brown_categories = brown.categories()
# for genre in brown_categories:
#     print genre

news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
# modal verbs
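The snippet breaks off at the '# modal verbs' comment; a plausible continuation in the style of the NLTK book (the modal list is an assumption) prints how often each modal appears in the news frequency distribution:

modals = ['can', 'could', 'may', 'might', 'must', 'will']  # assumed list
for m in modals:
    print(m + ':', fdist[m])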
from nltk.corpus import webtext  # needed below; not imported in the original fragment

textId = webtext.fileids()
print(textId)

# Read the movie script of "Pirates of the Caribbean".
# http://www.actorpoint.com/movie-scripts/scripts/pirates-of-the-caribbean-dead-man%27s-chest.html
text = webtext.raw('pirates.txt')
print(text[:1000])
print("Number of characters = ", len(text))

# Read the document word by word.
word = webtext.words('pirates.txt')
print(word)
print("Number of words = ", len(word))

# Read the document sentence by sentence.
sentence = webtext.sents('pirates.txt')
for i in range(5):
    print(sentence[i])
print("Number of sentences = ", len(sentence))

# Read the Firefox discussion-board text.
text = webtext.raw('firefox.txt')
print(text[:1000])
print("Number of characters = ", len(text))

# Read the document word by word.
word = webtext.words('firefox.txt')
print(word)
print("Number of words = ", len(word))

# Read the document sentence by sentence.
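The fragment ends mid-way; mirroring the pirates.txt section above, the sentence-level step for firefox.txt would presumably look like this (a sketch, not part of the original):

sentence = webtext.sents('firefox.txt')
for i in range(5):
    print(sentence[i])
print("Number of sentences = ", len(sentence))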