Code example #1
File: book.py Project: angelnew/biblioeater
    def describe(self):
        # summary stats: sentence length frequencies and lexical diversity

        self.freqs = Counter(self.words_per_sentence)

        nlp_logger.warning("Lexical diversity is {:,.3}".format(self.lexical_diversity()))

        describe(self.words_per_sentence, "{} - # of words".format(self.name))
Code example #2
File: book.py Project: angelnew/biblioeater
    def from_file(self, filename):
        # loads the serialized book from file
        with open(filename, 'rb') as infile:
            self.__dict__ = pickle.load(infile)

        nlp_logger.warning("{} doc loaded from disk".format(self.name))
        self.describe()
Code example #3
File: load_corpora.py Project: angelnew/biblioeater
import stanfordnlp
import os

from constants import *
from the_logger import nlp_logger

from book import Book

nlp_logger.warning("----")
nlp_logger.warning("Process starts")

pym = Book("Arthur Gordon Pym")
tom = Book("Tom Sawyer")
eureka = Book("Eureka")
huck = Book("Huckleberry Finn")

en_nlp = stanfordnlp.Pipeline()
nlp_logger.warning("Pipeline read")

crop_value = -1

# Read and parse texts used for training
# Parsing takes time, even with GPU!

pym.load_corpus(os.path.join(CORPORA_FOLDER, "pym.txt"))
pym.parse(en_nlp, crop_value)
pym.to_file(PYM_FILE)

tom.load_corpus(os.path.join(CORPORA_FOLDER, "tom.txt"))
tom.parse(en_nlp, crop_value)
tom.to_file(TOM_FILE)
Code example #4
tom.from_file(TOM_FILE)

# replace word with POS in sentences
pym.encode_as_pos()
tom.encode_as_pos()

# Now we have to prepare the data for training
pym_base_set = pym.get_base_training_set()
tom_base_set = tom.get_base_training_set()

# Prepare training set for sequential network
(sequential_set, writer_labels) = Book.get_seq_training_set(pym_base_set,
                                                            tom_base_set,
                                                            num_sentences=3)

nlp_logger.warning("writing {} sentences to training set file".format(
    len(sequential_set)))

with open(TRAINING_SET_FILE, "wb") as outfile:
    pickle.dump(sequential_set, outfile)

with open(LABELS_FILE, "wb") as outfile:
    pickle.dump(writer_labels, outfile)

# Prepare training set for multi input network
(multi_set, multi_writer_labels) = Book.get_multi_training_set(pym_base_set,
                                                               tom_base_set,
                                                               num_sentences=3)

nlp_logger.warning(
    "writing {} sentences to multi sentence training set file".format(
        multi_set[0].shape[0]))
Code example #5
# read the model
with open(MODEL_FILE, "rb") as infile:
    sequential_model = pickle.load(infile)

# for simplicity we ignore paragraphs that are longer than the longest one found in training
max_expected_length = sequential_model.input_shape[1]

# apply the model to pym
pym_validation = pad(
    [u for u in pym_validation if u.shape[0] <= max_expected_length],
    max_expected_length)
pym_validation = np.asarray(pym_validation)
pym_predictions = sequential_model.predict(pym_validation)
pym_accuracy = sum([probs[0] > 0.5
                    for probs in pym_predictions]) / len(pym_predictions)
nlp_logger.warning("Accuracy for Poe/pym: {:.4f}".format(pym_accuracy))

# apply the model to tom
tom_validation = pad(
    [u for u in tom_validation if u.shape[0] <= max_expected_length],
    max_expected_length)
tom_validation = np.asarray(tom_validation)
tom_predictions = sequential_model.predict(tom_validation)
tom_accuracy = sum([probs[1] > 0.5
                    for probs in tom_predictions]) / len(tom_predictions)
nlp_logger.warning("Accuracy for Twain/tom: {:.4f}".format(tom_accuracy))

# apply the model to Eureka
eureka_validation = pad(
    [u for u in eureka_validation if u.shape[0] <= max_expected_length],
    max_expected_length)
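
The pad helper imported from padder.py (see example #6 below) is not included in these excerpts. Based on the comment "pad with zero rows up to max sentence length" and the way it is called, a minimal sketch of what it might do could be the following; the implementation details are an assumption, not taken from the project:

import numpy as np

def pad(matrices, max_length):
    # appends zero rows to each 2-D sentence matrix so that all of them
    # end up with max_length rows; the column count is left unchanged
    padded = []
    for m in matrices:
        missing = max_length - m.shape[0]
        padded.append(np.vstack([m, np.zeros((missing, m.shape[1]))]))
    return padded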
Code example #6
File: train_seq.py Project: angelnew/biblioeater
import pickle
import random

from biblio_eater import BiblioEater
from constants import *
from the_logger import nlp_logger
from padder import pad

# Load training set from disk first
with open(TRAINING_SET_FILE, "rb") as infile:
    training_set = pickle.load(infile)
# pad with zero rows up to max sentence length
max_length = max([s.shape[0] for s in training_set])
training_set = pad(training_set, max_length)

with open(LABELS_FILE, "rb") as infile:
    writer_labels = pickle.load(infile)

# Sequential network

nlp_logger.warning("Shape of training set ({}, {})".format(
    training_set[0].shape[0], training_set[0].shape[1]))

# Prepare net
biblio_eater = BiblioEater()

# all sentences are padded to the same length, although Keras has a padding option that we are not using
biblio_eater.design_sequential_net(training_set[0].shape[0],
                                   training_set[0].shape[1])
biblio_eater.train_sequential_net(training_set, writer_labels)
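
BiblioEater.design_sequential_net and train_sequential_net are not shown in these excerpts. Purely as an illustration (not the project's actual architecture), a small Keras model that takes padded inputs of shape (sentence length, token width) and outputs one probability per writer, as example #5 expects, could be built like this:

from tensorflow import keras
from tensorflow.keras import layers

def design_sequential_net(rows, cols):
    # rows = padded sentence length, cols = width of each encoded token
    model = keras.Sequential([
        layers.Input(shape=(rows, cols)),
        layers.Conv1D(64, kernel_size=3, activation="relu"),
        layers.GlobalMaxPooling1D(),
        layers.Dense(32, activation="relu"),
        layers.Dense(2, activation="softmax"),  # probs[0] -> Poe, probs[1] -> Twain
    ])
    model.compile(optimizer="adam", loss="categorical_crossentropy",
                  metrics=["accuracy"])
    return model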
Code example #7
File: book.py Project: angelnew/biblioeater
    def to_file(self, filename):
        # serializes book for further processing
        with open(filename, 'wb') as outfile:
            pickle.dump(self.__dict__, outfile)

        nlp_logger.warning("{} doc serialized to disk".format(self.name))
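
to_file and from_file (example #2) simply dump and restore self.__dict__, so a parsed book can be serialized once and reloaded later without re-running the pipeline. Combining the calls already shown in load_corpora.py (example #3) with a reload looks like this:

pym = Book("Arthur Gordon Pym")
pym.load_corpus(os.path.join(CORPORA_FOLDER, "pym.txt"))
pym.parse(en_nlp, crop_value)
pym.to_file(PYM_FILE)            # writes pym.__dict__ as a pickle

restored = Book("Arthur Gordon Pym")
restored.from_file(PYM_FILE)     # replaces restored.__dict__ with the pickled state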
Code example #8
File: book.py Project: angelnew/biblioeater
    def parse(self, pipeline, crop_value):
        # runs the NLP pipeline on the (optionally cropped) corpus, then frees GPU memory
        self.doc = pipeline(self.crop(crop_value))
        nlp_logger.warning("{} doc processed with {} sentences".format(self.name, len(self.doc.sentences)))
        torch.cuda.empty_cache()
        self.compute_describe_freqs()
Code example #9
File: book.py Project: angelnew/biblioeater
    def load_corpus(self, corpus_filename):
        # reads the raw text of the book from disk
        with open(corpus_filename, "r", encoding="utf-8") as f:
            self.corpus = f.read()
        nlp_logger.warning("{} corpus loaded".format(self.name))