Example #1
from nltk.corpus import brown, gutenberg, reuters
from gensim.models import Word2Vec

# normalize_tokens and model_path are defined elsewhere in the module.


def train():
    paras = brown.paras() + gutenberg.paras() + reuters.paras()
    total = len(paras)
    texts = []
    for i, para in enumerate(paras):
        if i % 1000 == 0:
            print(i, total)  # progress indicator
        # Flatten the paragraph (a list of tokenised sentences) into one string.
        content = ' '.join(' '.join(sent) for sent in para)
        texts.append(normalize_tokens(content))

    # gensim < 4.0 API; in gensim 4.x the `size` argument is `vector_size`.
    w2v = Word2Vec(texts, size=100, window=5, min_count=5, workers=4)
    w2v.save(model_path)
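
A quick sanity check of the saved model (hypothetical usage, not part of the original example; any query word that survived normalize_tokens and the min_count cutoff will do):

w2v = Word2Vec.load(model_path)
print(w2v.wv.most_similar('money', topn=5))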
Example #2
import pickle

from nltk.corpus import brown, gutenberg, reuters
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

# normalize_tokens, model_path and vocab_path are defined elsewhere in the module.


def train():
    paras = brown.paras() + gutenberg.paras() + reuters.paras()
    total = len(paras)
    texts = []
    for i, para in enumerate(paras):
        if i % 1000 == 0:
            print(i, total)  # progress indicator
        content = ' '.join(' '.join(sent) for sent in para)
        texts.append(' '.join(normalize_tokens(content)))

    # LSA: term-document counts, transposed so that terms are rows, reduced to
    # a 100-dimensional vector per vocabulary term.
    transformer = CountVectorizer()
    tf = transformer.fit_transform(texts)
    svd = TruncatedSVD(n_components=100)
    lsa = svd.fit_transform(tf.T)

    lsa.dump(model_path)  # ndarray.dump pickles the array to the given path
    pickle.dump(transformer.vocabulary_, open(vocab_path, 'wb'))
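
How the dumped artifacts might be consumed later (a hypothetical sketch, not taken from the original project; terms must be in the normalized form produced by normalize_tokens):

import pickle

import numpy as np

lsa = np.load(model_path, allow_pickle=True)   # the pickled term matrix
vocab = pickle.load(open(vocab_path, 'rb'))    # term -> row index

def similarity(w1, w2):
    u, v = lsa[vocab[w1]], lsa[vocab[w2]]
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))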
Example #3
import csv
import pickle

import numpy as np
from nltk.corpus import brown, gutenberg, reuters
from sklearn.feature_extraction.text import CountVectorizer

# normalize_tokens, stemmer, window_size and the various *_path variables are
# defined elsewhere in the module.


def train():
    paras = brown.paras() + gutenberg.paras() + reuters.paras()
    total = len(paras)
    texts = []
    for i, para in enumerate(paras):
        if i % 1000 == 0:
            print(i, total)  # progress indicator
        content = ' '.join(' '.join(sent) for sent in para)
        texts.append(' '.join(normalize_tokens(content)))

    # CountVectorizer is used here only to build the corpus vocabulary.
    transformer = CountVectorizer()
    tf = transformer.fit_transform(texts)

    # Collect the stemmed words that appear in the evaluation file and assign
    # each one a row index.
    test_vocab = set()
    reader = csv.reader(open(global_truth_path))
    for line in reader:
        w1, w2, score = line
        test_vocab.add(stemmer.stem(w1))
        test_vocab.add(stemmer.stem(w2))
    test_vocab = {k: v for v, k in enumerate(test_vocab)}

    # Co-occurrence matrix: one row per evaluation word, one column per corpus
    # vocabulary word.
    model = np.zeros((len(test_vocab), len(transformer.vocabulary_)))

    for text in texts:
        text = text.split()
        for i in range(len(text)):
            if text[i] not in test_vocab:
                continue
            # Count every neighbour within window_size positions of text[i].
            for j in range(i - window_size, i + window_size + 1):
                if j == i or j < 0 or j >= len(text):
                    continue
                if text[j] not in transformer.vocabulary_:
                    continue
                model[test_vocab[text[i]], transformer.vocabulary_[text[j]]] += 1

    model.dump(model_path)
    pickle.dump(transformer.vocabulary_, open(vocab_path, 'wb'))
    pickle.dump(test_vocab, open(test_vocab_path, 'wb'))
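
A hypothetical evaluation sketch (not part of the original project) showing what the dumped co-occurrence matrix could be used for; it reuses the module-level stemmer and path variables and compares cosine similarities against the ground-truth scores:

import csv
import pickle

import numpy as np
from scipy.stats import spearmanr

model = np.load(model_path, allow_pickle=True)
test_vocab = pickle.load(open(test_vocab_path, 'rb'))

gold, predicted = [], []
for w1, w2, score in csv.reader(open(global_truth_path)):
    u = model[test_vocab[stemmer.stem(w1)]]
    v = model[test_vocab[stemmer.stem(w2)]]
    gold.append(float(score))
    predicted.append(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-12))

print(spearmanr(gold, predicted))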
Example #5
import re

import numpy as np
from nltk.corpus import gutenberg
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# word_counts is a feature DataFrame built earlier in the original script.
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence', 'text_source'], axis=1))

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=42)

lr = LogisticRegression()
lr.fit(X_train, Y_train)
print('Training set score:', lr.score(X_train, Y_train))

# ------------------------------------------------------------------------------
# tfidf

paragraphs = gutenberg.paras('austen-sense.txt')
paragraphs_clean = []
for paragraph in paragraphs:
    # Keep only the first sentence of each paragraph.
    paragraph = paragraph[0]
    # Remove double dashes from all words.
    paragraph = [re.sub(r'--', '', word) for word in paragraph]
    paragraphs_clean.append(' '.join(paragraph))

from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test = train_test_split(paragraphs_clean, test_size=0.4, random_state=0)

vectorizer = TfidfVectorizer(max_df=0.5,
                             min_df=2,
                             stop_words='english',
                             lowercase=True,
                             use_idf=True)
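
A minimal sketch of how the fitted vectorizer would typically be applied to the split (an assumed continuation, not from the original source):

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
print(X_train_tfidf.shape)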
Example #6
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk.data

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from nltk.corpus import gutenberg
nltk.download('punkt')
nltk.download('gutenberg')
import re
from sklearn.model_selection import train_test_split

# Read in the data, this time in the form of paragraphs.
emma = gutenberg.paras('austen-emma.txt')
# Processing.
emma_paras = []
for paragraph in emma:
    para = paragraph[0]
    # Remove the double dash from all words.
    para = [re.sub(r'--', '', word) for word in para]
    # Form each paragraph into a string and add it to the list of strings.
    emma_paras.append(' '.join(para))

print(emma_paras[0:4])

from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test = train_test_split(emma_paras, test_size=0.4, random_state=0)
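
The imports above pull in TruncatedSVD, make_pipeline and Normalizer but the listing stops before using them; a minimal sketch of the tf-idf plus LSA pipeline they suggest (an assumption, not from the original notebook):

vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)

# Reduce the tf-idf space to a small number of components and re-normalize rows.
lsa = make_pipeline(TruncatedSVD(n_components=130), Normalizer(copy=False))
X_train_lsa = lsa.fit_transform(X_train_tfidf)
print(X_train_lsa.shape)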
Example #7
# filename is defined elsewhere in the script.

# Corpus as a list of words
# (a word is a string)
emma_words = gutenberg.words(filename)
print(60 * '*', '\nCorpus as a list of words:')
print(emma_words[-30:])  # the last 30 words

# Corpus as a list of sentences
# (a sentence is a list of words)
emma_sents = gutenberg.sents(filename)
print(60 * '*', '\nCorpus as a list of sentences:')
print(emma_sents[-5:])  # the last 5 sentences

# Corpus as a list of paragraphs
# (a paragraph is a list of sentences)
emma_paras = gutenberg.paras(filename)
print(60 * '*', '\nCorpus as a list of paragraphs:')
print(emma_paras[-2:])  # the last 2 paragraphs

# How many paragraphs?
number_of_paras = len(gutenberg.paras(filename))
print("\n number of paras: ", number_of_paras, "\n")

# How many sentences?
number_of_sents = len(gutenberg.sents(filename))
print("\n number of sents: ", number_of_sents, "\n")

# How many sentences per paragraph on average?
emma_paras_sents_mean = number_of_sents / number_of_paras
print("\n number sents per para: ", emma_paras_sents_mean, "\n")
Example #8
# text1 (Moby Dick) and text2 (Sense and Sensibility) are the NLTK book texts,
# typically loaded with `from nltk.book import *`.
print(len(set(text1)))
print(text1[:10])
print(text2[:10])


# In[2]:


from nltk.corpus import gutenberg
print(gutenberg.fileids())
hamlet = gutenberg.words('shakespeare-hamlet.txt')
print(len(hamlet))
hamlet_sentences = gutenberg.sents('shakespeare-hamlet.txt')
print(len(hamlet_sentences))
print(hamlet_sentences[1024])
print(len(gutenberg.paras('shakespeare-hamlet.txt')))


# ### Get the count of a word in a document, or the context of every occurrence of a word in a document.

# In[68]:


print(text1.count('horse'))
print(text1.concordance('passion'))
print(text2.concordance('passion'))


# **FreqDist and most_common**  
# We can use FreqDist to find the number of occurrences of each word in the text.  
# By getting len(vocab) we get the number of unique words in the text (including punctuation).  
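
# A minimal sketch of the FreqDist usage described above (not from the original
# notebook), reusing the `hamlet` word list loaded earlier:
from nltk import FreqDist

vocab = FreqDist(hamlet)        # maps each word to its number of occurrences
print(len(vocab))               # number of unique tokens, punctuation included
print(vocab['Hamlet'])          # count of a single word
print(vocab.most_common(10))    # the ten most frequent tokens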
Example #9
"""
This script is an alternative/demo to scrapeWordHunt.py, but is not
used in this folder.
"""
import csv

from nltk.corpus import gutenberg

# root, analogy_string_list, find_any_patterns and get_analogy_sentence are
# defined elsewhere in the project.

SOURCE_NAME = "GUT"

txt_file_name = "analogy_sentences_GUT.txt"
csv_file_name = "analogy_names_GUT.csv"
output_handler = open(root + "extractions\\" + txt_file_name,
                      "w",
                      encoding="utf-8")

# Find the indices of all paragraphs that contain the patterns as listed in
# analogy_string_list
paras = gutenberg.paras()
para_indices = find_any_patterns(paras, analogy_string_list)
ids = {}  # save sentences' ids in hash table to prevent duplicates.
# Extract the exact sentences and write them to csv and txt files.
with open(root + "extractions\\" + csv_file_name, 'w',
          encoding="utf-8") as csvfile:
    fieldnames = ['name', 'text']
    writer = csv.DictWriter(csvfile,
                            fieldnames=fieldnames,
                            quoting=csv.QUOTE_ALL,
                            lineterminator='\n')
    writer.writeheader()
    for para_index in para_indices:
        sentence_pos = get_analogy_sentence(paras[para_index],
                                            analogy_string_list)
        # get_analogy_sentence returns a 2-element tuple. The first element is the analogy string,