def load_genesis_corpus() -> np.ndarray:
    """Load the NLTK Genesis corpus as an array of detokenized sentence strings.

    Downloads the corpus if needed, keeps only sentences of 1..30 tokens that
    contain at least one alphabetic token, normalizes Penn-Treebank-style
    quote tokens (`` / '' / `) to plain ASCII quotes, and detokenizes each
    sentence back into a natural string.

    Returns:
        np.ndarray (dtype=object) with one detokenized sentence per element.
    """
    nltk.download('genesis')
    # Keep short, non-empty sentences that contain at least one real word.
    # (Rewritten from a filter/lambda chain to a comprehension for clarity.)
    sentences = [
        sent for sent in genesis.sents()
        if 1 <= len(sent) <= 30 and any(word.isalpha() for word in sent)
    ]
    mdetok = TreebankWordDetokenizer()
    # Join, normalize treebank quote tokens, then detokenize the cleaned tokens.
    detokenized = [
        mdetok.detokenize(
            ' '.join(sent)
            .replace('``', '"')
            .replace("''", '"')
            .replace('`', "'")
            .split()
        )
        for sent in sentences
    ]
    return np.array(detokenized, dtype=object)
#print("SEARCH TERM: "+thing) #print(wikipedia.page(thing)) #print(wikipedia.page(thing).content) pages += wikipedia.page(thing).content except wikipedia.DisambiguationError as e: s = random.choice(e.options) get_wiki(s) pass except: pass b = brown.sents() sents = tokenizer.tokenize(pages) sense = gutenberg.sents('austen-sense.txt') emma = gutenberg.sents('austen-emma.txt') persuasion = gutenberg.sents('austen-persuasion.txt') bible = genesis.sents('english-kjv.txt') blake = gutenberg.sents('blake-poems.txt') bryant = gutenberg.sents('bryant-stories.txt') burgess = gutenberg.sents('burgess-busterbrown.txt') carroll = gutenberg.sents('carroll-alice.txt') ch_ball = gutenberg.sents('chesterton-ball.txt') ch_brown = gutenberg.sents('chesterton-brown.txt') ch_thurs = gutenberg.sents('chesterton-thursday.txt') edge = gutenberg.sents('edgeworth-parents.txt') mel = gutenberg.sents('melville-moby_dick.txt') mil = gutenberg.sents('milton-paradise.txt') caesar = gutenberg.sents('shakespeare-caesar.txt') hamlet = gutenberg.sents('shakespeare-hamlet.txt') macbeth = gutenberg.sents('shakespeare-macbeth.txt') whit = gutenberg.sents('whitman-leaves.txt') rural = abc.sents('rural.txt')
def pmi_with_cython(input_corpus):
    """Run PMI feature selection over *input_corpus* with the cython backend.

    Prints the elapsed wall-clock time of the selection run.

    Args:
        input_corpus: dict mapping corpus label -> list of tokenized sentences.
    """
    logging.debug(msg='With cython is True')
    start = time.time()
    scored_matrix_obj = interface.run_feature_selection(
        input_dict=input_corpus,
        method='pmi',
        n_jobs=-1,
        use_cython=True)
    elapsed_time = time.time() - start
    # Fixed idiom: the original wrapped the format call in a redundant
    # second pair of parentheses — print((...)).
    print("elapsed_time with cython:{} [sec]".format(elapsed_time))


from nltk.corpus import gutenberg
from nltk.corpus import webtext
from nltk.corpus import genesis
from nltk.corpus import abc

# Materialize the lazy corpus readers into lists so the same data can be
# handed to each benchmark run below.
abs_corpus = abc.sents()
genesis_corpus = genesis.sents()
web_corpus = webtext.sents()
gutenberg_corpus = gutenberg.sents()

input_corpus = {
    'abs': list(abs_corpus),
    'genesis': list(genesis_corpus),
    'web': list(web_corpus),
    'gutenberg': list(gutenberg_corpus)
}

pmi_with_cython(input_corpus)
pmi_with_parallel(input_corpus)
#pmi_with_threading(input_corpus)
#print(raw_sentences[5])
#print(book_sentences[5])

# Load sentence lists from every corpus used downstream, printing progress
# because several of these corpora are large and slow to read.
# Fixed: several progress messages named the wrong corpus or were typo'd
# ("condll2000", "condll2007", "Subjectvity", "Guttenberg", "Freebank").
conll2000_corp_sents = conll2000.sents()
print("conll2000 to sents")
conll2002_corp_sents = conll2002.sents()
print("conll2002 to sents")
conll2007_corp_sents = conll2007.sents()
print("conll2007 to sents")
inaugural_corp_sents = inaugural.sents()
print("inaugural to sents")
abc_corp_sents = abc.sents()
print("ABC to sentences")
genesis_corp_sents = genesis.sents()
print("Genesis to sents")
frame_net_corp_sents = fn.sents()
print("Frame_net to sents")
state_union_corp_sents = state_union.sents()
print('state union to sents')
subject_corp_sents = subjectivity.sents()
print('Subjectivity to sents')
brown_corp_sents = brown.sents()
print("Brown corpus to sents")
movie_reviews_corp_sents = movie_reviews.sents()
print("Movie reviews to sents ")
guttenberg_corp_sents = gutenberg.sents()
print("Gutenberg to sents")
treebank_corb_sents = treebank.sents()
print("Treebank to sents")
# Build a flat lowercase word list from several NLTK corpora, keeping only
# tokens that look like words (letters, apostrophes, hyphens) and dropping
# leftover treebank quote tokens.
words.extend([w.lower() for w in words_list.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in abc.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in movie_reviews.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in genesis.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])

# NOTE(review): Python 2 print statement — this chunk targets Python 2.
print "Building clean sentences list"

# Re-join each corpus sentence into a single lowercase space-separated string,
# applying the same word-token filter used for the word list above.
sentences = []
for s in brown.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in treebank.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in abc.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in movie_reviews.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in genesis.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))

def singles(words):
    # Generator: yield word-like tokens one at a time; empty input yields
    # nothing. Note the filter here only excludes "''", not "'" as above.
    if len(words) < 1:
        return
    for w in words:
        if re.match("[a-zA-Z'-]+", w) and w.strip() != "''":
            yield w

def doubles(sentences):
    # NOTE(review): this definition is truncated at the chunk boundary —
    # the loop body continues past the visible source.
    for s in sentences:
        s = s.split(' ')
        if len(s) < 2:
            continue
# NOTE(review): this chunk duplicates the pmi_with_cython logic seen elsewhere
# in the file but is missing its `def` header (likely lost in extraction) —
# `input_corpus` is referenced here before the assignment further down, which
# is a NameError if run top-to-bottom as written. Line structure reconstructed
# only; tokens unchanged — confirm the intended enclosing function.
start = time.time()
scored_matrix_obj = interface.run_feature_selection(
    input_dict=input_corpus,
    method='pmi',
    n_jobs=-1,
    use_cython=True
)
elapsed_time = time.time() - start
print ("elapsed_time with cython:{} [sec]".format(elapsed_time))

from nltk.corpus import gutenberg
from nltk.corpus import webtext
from nltk.corpus import genesis
from nltk.corpus import abc

# Materialize the lazy corpus readers into lists before building the input
# dict handed to each benchmark run.
abs_corpus = abc.sents()
genesis_corpus = genesis.sents()
web_corpus = webtext.sents()
gutenberg_corpus = gutenberg.sents()

input_corpus = {
    'abs': list(abs_corpus),
    'genesis': list(genesis_corpus),
    'web': list(web_corpus),
    'gutenberg': list(gutenberg_corpus)
}

pmi_with_cython(input_corpus)
pmi_with_parallel(input_corpus)
#pmi_with_threading(input_corpus)