# Train a Word2Vec model on the Brown, Gutenberg, and Reuters corpora.
# normalize_tokens and model_path are assumed to be defined elsewhere in this module.
from nltk.corpus import brown, gutenberg, reuters
from gensim.models import Word2Vec


def train():
    paras = brown.paras() + gutenberg.paras() + reuters.paras()
    total = len(paras)
    texts = []
    for i, para in enumerate(paras):
        if i % 1000 == 0:
            print(i, total)
        # Flatten the paragraph (a list of tokenized sentences) into one string.
        content = ' '.join(map(lambda x: ' '.join(x), para))
        texts.append(normalize_tokens(content))
    # Note: gensim >= 4.0 renamed `size` to `vector_size`.
    w2v = Word2Vec(texts, size=100, window=5, min_count=5, workers=4)
    w2v.save(model_path)
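normalize_tokens is called in each train() variant but not defined in these snippets; below is a minimal sketch of such a helper, assuming it lowercases, tokenizes, stems with NLTK's PorterStemmer (the same kind of stemmer the co-occurrence variant uses via stemmer.stem), and drops non-alphabetic tokens. All of these choices are assumptions, not taken from the original code.

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()


def normalize_tokens(text):
    """Lowercase, tokenize, and stem a string; returns a list of tokens."""
    tokens = word_tokenize(text.lower())
    return [stemmer.stem(token) for token in tokens if token.isalpha()]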
# Train an LSA model: term-document counts reduced to 100 dimensions with truncated SVD.
# normalize_tokens, model_path, and vocab_path are assumed to be defined elsewhere in this module.
import pickle

from nltk.corpus import brown, gutenberg, reuters
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer


def train():
    paras = brown.paras() + gutenberg.paras() + reuters.paras()
    total = len(paras)
    texts = []
    for i, para in enumerate(paras):
        if i % 1000 == 0:
            print(i, total)
        content = ' '.join(map(lambda x: ' '.join(x), para))
        texts.append(' '.join(normalize_tokens(content)))
    transformer = CountVectorizer()
    tf = transformer.fit_transform(texts)
    svd = TruncatedSVD(n_components=100)
    # Transpose so that rows are terms; each term gets a 100-dimensional vector.
    lsa = svd.fit_transform(tf.T)
    lsa.dump(model_path)
    pickle.dump(transformer.vocabulary_, open(vocab_path, 'wb'))
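Because the SVD is fitted on tf.T, each row of the dumped matrix is the 100-dimensional vector of one vocabulary term, indexed by transformer.vocabulary_. The helper below is an illustrative sketch of how the saved model could be queried, assuming the same model_path and vocab_path and that the query words are given in their normalized (stemmed) form.

import pickle

import numpy as np


def similarity(w1, w2):
    """Cosine similarity between two terms in the saved LSA space."""
    # ndarray.dump() writes a pickle, which np.load can read with allow_pickle=True.
    lsa = np.load(model_path, allow_pickle=True)
    vocab = pickle.load(open(vocab_path, 'rb'))
    v1, v2 = lsa[vocab[w1]], lsa[vocab[w2]]
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))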
# Build a raw co-occurrence model restricted to the words that appear in the
# evaluation set (global_truth_path). normalize_tokens, stemmer, window_size,
# global_truth_path, model_path, vocab_path, and test_vocab_path are assumed
# to be defined elsewhere in this module.
import csv
import pickle

import numpy as np
from nltk.corpus import brown, gutenberg, reuters
from sklearn.feature_extraction.text import CountVectorizer


def train():
    paras = brown.paras() + gutenberg.paras() + reuters.paras()
    total = len(paras)
    texts = []
    for i, para in enumerate(paras):
        if i % 1000 == 0:
            print(i, total)
        content = ' '.join(map(lambda x: ' '.join(x), para))
        texts.append(' '.join(normalize_tokens(content)))
    transformer = CountVectorizer()
    tf = transformer.fit_transform(texts)

    # Collect the (stemmed) vocabulary of the evaluation word pairs.
    test_vocab = set()
    reader = csv.reader(open(global_truth_path))
    for line in reader:
        w1, w2, score = line
        test_vocab.add(stemmer.stem(w1))
        test_vocab.add(stemmer.stem(w2))
    test_vocab = {k: v for v, k in enumerate(test_vocab)}

    # Count how often each evaluation word co-occurs with every corpus word
    # inside a symmetric context window of +/- window_size positions.
    model = np.zeros((len(test_vocab), len(transformer.vocabulary_)))
    for text in texts:
        text = text.split()
        for i in range(len(text)):
            if text[i] not in test_vocab:
                continue
            for j in range(i - window_size, i + window_size + 1):
                if j == i or j < 0 or j >= len(text):
                    continue
                if text[j] not in transformer.vocabulary_:
                    continue
                model[test_vocab[text[i]]][transformer.vocabulary_[text[j]]] += 1

    model.dump(model_path)
    pickle.dump(transformer.vocabulary_, open(vocab_path, 'wb'))
    pickle.dump(test_vocab, open(test_vocab_path, 'wb'))
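A natural use of the saved co-occurrence model is to score it against the same ground-truth file it was built from. The sketch below is illustrative only: it assumes scipy is available, that the CSV columns are word1, word2, human score as in the training code, and that stemmer, model_path, test_vocab_path, and global_truth_path are the same objects used above.

import csv
import pickle

import numpy as np
from scipy.stats import spearmanr


def evaluate():
    """Spearman correlation between model cosine similarities and human scores."""
    model = np.load(model_path, allow_pickle=True)
    test_vocab = pickle.load(open(test_vocab_path, 'rb'))

    predicted, expected = [], []
    for w1, w2, score in csv.reader(open(global_truth_path)):
        v1 = model[test_vocab[stemmer.stem(w1)]]
        v2 = model[test_vocab[stemmer.stem(w2)]]
        # Small epsilon guards against all-zero co-occurrence rows.
        cos = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2) + 1e-12)
        predicted.append(cos)
        expected.append(float(score))

    correlation, _ = spearmanr(predicted, expected)
    return correlation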
# Logistic regression on bag-of-words features; word_counts is assumed to be a
# DataFrame with one row per sentence, word-count columns, and 'text_sentence'
# and 'text_source' columns.
import re

import numpy as np
from nltk.corpus import gutenberg
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence', 'text_source'], axis=1))
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=42)

lr = LogisticRegression()
train = lr.fit(X_train, Y_train)
print('Training set score:', lr.score(X_train, Y_train))

# ------------------------------------------------------------------------------
# tfidf
paragraphs = gutenberg.paras('austen-sense.txt')
paragraphs_clean = []
for paragraph in paragraphs:
    # Keep only the first sentence of each paragraph and strip double dashes.
    paragraph = paragraph[0]
    paragraph = [re.sub(r'--', '', word) for word in paragraph]
    paragraphs_clean.append(' '.join(paragraph))

from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test = train_test_split(paragraphs_clean, test_size=0.4, random_state=0)
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english',
                             lowercase=True, use_idf=True)
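The snippet stops right after constructing the vectorizer; a minimal continuation, assuming the same vectorizer and train/test split, would fit the tf-idf vocabulary on the training paragraphs only and reuse it on the held-out set.

# Fit the tf-idf vocabulary on the training paragraphs, then apply the same
# transformation to the held-out paragraphs.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print('Training set shape:', X_train_tfidf.shape)
print('Test set shape:', X_test_tfidf.shape)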
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk.data
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from nltk.corpus import gutenberg
nltk.download('punkt')
nltk.download('gutenberg')
import re
from sklearn.model_selection import train_test_split

# reading in the data, this time in the form of paragraphs
emma = gutenberg.paras('austen-emma.txt')

# processing
emma_paras = []
for paragraph in emma:
    para = paragraph[0]
    # removing the double-dash from all words
    para = [re.sub(r'--', '', word) for word in para]
    # Forming each paragraph into a string and adding it to the list of strings.
    emma_paras.append(' '.join(para))

print(emma_paras[0:4])

from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test = train_test_split(emma_paras, test_size=0.4, random_state=0)
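TruncatedSVD, make_pipeline, and Normalizer are imported above but not yet used at this point in the excerpt. The sketch below shows one typical way to combine them with the tf-idf features, reusing the imports from the snippet; the parameter values are illustrative assumptions, not taken from the original.

# Vectorize the training paragraphs, then reduce them to a low-dimensional
# LSA space and normalize the resulting vectors to unit length.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)

svd = TruncatedSVD(n_components=130)  # illustrative component count
lsa = make_pipeline(svd, Normalizer(copy=False))
X_train_lsa = lsa.fit_transform(X_train_tfidf)

print('Explained variance of the SVD step:', svd.explained_variance_ratio_.sum())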
# filename is assumed to be a Gutenberg corpus file id, e.g. 'austen-emma.txt'.
from nltk.corpus import gutenberg

# Corpus as a list of words (a word is a string)
emma_words = gutenberg.words(filename)
print(60 * '*', '\nCorpus as a list of words:')
print(emma_words[-30:])  # the last 30 words

# Corpus as a list of sentences (a sentence is a list of words)
emma_sents = gutenberg.sents(filename)
print(60 * '*', '\nCorpus as a list of sentences:')
print(emma_sents[-5:])  # the last 5 sentences

# Corpus as a list of paragraphs (a paragraph is a list of sentences)
emma_paras = gutenberg.paras(filename)
print(60 * '*', '\nCorpus as a list of paragraphs:')
print(emma_paras[-2:])  # the last 2 paragraphs

# How many paragraphs?
number_of_paras = len(gutenberg.paras(filename))
print("\n number of paras: ", number_of_paras, "\n")

# How many sentences?
number_of_sents = len(gutenberg.sents(filename))
print("\n number of sents: ", number_of_sents, "\n")

# How many sentences per paragraph on average?
emma_paras_sents_mean = number_of_sents / number_of_paras
print("\n number sents per para: ", emma_paras_sents_mean, "\n")
# text1 and text2 are assumed to be loaded via `from nltk.book import *`.
print(len(set(text1)))
print(text1[:10])
print(text2[:10])


# In[2]:

from nltk.corpus import gutenberg

print(gutenberg.fileids())

hamlet = gutenberg.words('shakespeare-hamlet.txt')
print(len(hamlet))

hamlet_sentences = gutenberg.sents('shakespeare-hamlet.txt')
print(len(hamlet_sentences))
print(hamlet_sentences[1024])

print(len(gutenberg.paras('shakespeare-hamlet.txt')))


# ### Get the count of a word in a document, or the context of every occurrence of a word in a document.

# In[68]:

print(text1.count('horse'))
# concordance() prints its results itself and returns None, so it is called directly.
text1.concordance('passion')
text2.concordance('passion')


# **FreqDist and most_common**
# We can use FreqDist to find the number of occurrences of each word in the text.
# By getting len(vocab) we get the number of unique words in the text (including punctuation).
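The closing comments describe FreqDist and most_common without showing them; a minimal sketch of the usage they describe, assuming text1 from nltk.book is still in scope (the word 'whale' is just an illustrative query).

from nltk import FreqDist

# Count how often each token occurs in the text.
vocab = FreqDist(text1)

# Number of unique tokens (including punctuation).
print(len(vocab))

# The ten most frequent tokens and their counts.
print(vocab.most_common(10))

# Occurrences of a single word.
print(vocab['whale'])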
"""
This script is an alternative/demo to scrapeWordHunt.py, but is not used in this folder.
"""
import csv

from nltk.corpus import gutenberg

# root, analogy_string_list, find_any_patterns, and get_analogy_sentence are
# assumed to be defined in, or imported from, elsewhere in this project.

SOURCE_NAME = "GUT"
txt_file_name = "analogy_sentences_GUT.txt"
csv_file_name = "analogy_names_GUT.csv"
output_handler = open(root + "extractions\\" + txt_file_name, "w", encoding="utf-8")

# Find the indices of all paragraphs that contain the patterns as listed in
# analogy_string_list
paras = gutenberg.paras()
para_indices = find_any_patterns(paras, analogy_string_list)
ids = {}  # save sentences' ids in hash table to prevent duplicates.

# Extract the exact sentences and write them to csv and txt files.
with open(root + "extractions\\" + csv_file_name, 'w', encoding="utf-8") as csvfile:
    fieldnames = ['name', 'text']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
                            quoting=csv.QUOTE_ALL, lineterminator='\n')
    writer.writeheader()
    for para_index in para_indices:
        sentence_pos = get_analogy_sentence(paras[para_index], analogy_string_list)
        # get_analogy_sentence returns a 2-element tuple. The first element is the analogy string,
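find_any_patterns is used above but defined elsewhere in the project; below is a minimal sketch of what such a helper could look like, assuming it simply returns the indices of paragraphs whose flattened text contains any of the listed analogy phrases.

def find_any_patterns(paras, pattern_list):
    """Return the indices of paragraphs containing any of the given patterns.

    `paras` is a list of paragraphs as returned by gutenberg.paras(): each
    paragraph is a list of sentences, and each sentence is a list of tokens.
    """
    matches = []
    for index, para in enumerate(paras):
        # Flatten the paragraph into a single lowercase string.
        text = ' '.join(word for sentence in para for word in sentence).lower()
        if any(pattern.lower() in text for pattern in pattern_list):
            matches.append(index)
    return matches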