def describe(self):
    # frequency table of sentence lengths (collections.Counter)
    self.freqs = Counter(self.words_per_sentence)
    nlp_logger.warning("Lexical diversity is {:,.3}".format(self.lexical_diversity()))
    # note: this calls the module-level describe helper, not this method
    describe(self.words_per_sentence, "{} - # of words".format(self.name))
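# lexical_diversity is not shown in these listings. A minimal sketch of the
# common definition (type/token ratio); the self.words attribute holding the
# flat token list is an assumption, not confirmed by the project code:
def lexical_diversity(self):
    # distinct words divided by total words, a value in (0, 1]
    return len(set(self.words)) / len(self.words)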
def from_file(self, filename):
    # loads the serialized book from file
    with open(filename, 'rb') as infile:
        self.__dict__ = pickle.load(infile)
    nlp_logger.warning("{} doc loaded from disk".format(self.name))
    self.describe()
import os

import stanfordnlp

from book import Book
from constants import *
from the_logger import nlp_logger

nlp_logger.warning("----")
nlp_logger.warning("Process starts")

pym = Book("Arthur Gordon Pym")
tom = Book("Tom Sawyer")
eureka = Book("Eureka")
huck = Book("Huckleberry Finn")

en_nlp = stanfordnlp.Pipeline()
nlp_logger.warning("Pipeline read")
crop_value = -1

# Read and parse texts used for training
# Parsing takes time, even with GPU!
pym.load_corpus(os.path.join(CORPORA_FOLDER, "pym.txt"))
pym.parse(en_nlp, crop_value)
pym.to_file(PYM_FILE)

tom.load_corpus(os.path.join(CORPORA_FOLDER, "tom.txt"))
tom.parse(en_nlp, crop_value)
tom.to_file(TOM_FILE)
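# One-time setup: stanfordnlp.Pipeline() requires the English models to be
# available locally; they can be fetched with the library's download call.
import stanfordnlp

stanfordnlp.download('en')  # prompts for a download directory on first run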
# continues the script above; assumes pickle is imported and that pym
# was parsed (or loaded from PYM_FILE) in the same way as tom
tom.from_file(TOM_FILE)

# replace word with POS in sentences
pym.encode_as_pos()
tom.encode_as_pos()

# Now we have to prepare the data for training
pym_base_set = pym.get_base_training_set()
tom_base_set = tom.get_base_training_set()

# Prepare training set for sequential network
(sequential_set, writer_labels) = Book.get_seq_training_set(
    pym_base_set, tom_base_set, num_sentences=3)
nlp_logger.warning("writing {} sentences to training set file".format(
    len(sequential_set)))
with open(TRAINING_SET_FILE, "wb") as outfile:
    pickle.dump(sequential_set, outfile)
with open(LABELS_FILE, "wb") as outfile:
    pickle.dump(writer_labels, outfile)

# Prepare training set for multi-input network
(multi_set, multi_writer_labels) = Book.get_multi_training_set(
    pym_base_set, tom_base_set, num_sentences=3)
nlp_logger.warning(
    "writing {} sentences to multi sentence training set file".format(
        multi_set[0].shape[0]))
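# get_seq_training_set is part of the Book class and not shown in these
# listings. A rough sketch of the idea suggested by num_sentences=3: stack
# groups of consecutive POS-encoded sentence matrices into one example per
# group and attach an author label. The label convention (0 = Poe, 1 = Twain)
# is inferred from the probs[0]/probs[1] checks in the validation script;
# everything below is an assumption, not project code.
import numpy as np

def get_seq_training_set(pym_base_set, tom_base_set, num_sentences=3):
    examples, labels = [], []
    for base_set, label in ((pym_base_set, 0), (tom_base_set, 1)):
        for i in range(0, len(base_set) - num_sentences + 1, num_sentences):
            # one training example = num_sentences consecutive sentences
            examples.append(np.vstack(base_set[i:i + num_sentences]))
            labels.append(label)
    return examples, labels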
# read the model
with open(MODEL_FILE, "rb") as infile:
    sequential_model = pickle.load(infile)

# for simplicity we ignore paragraphs that are longer than the longest one found in training
max_expected_length = sequential_model.input_shape[1]

# apply the model to pym
# (assumes pym_validation, tom_validation and eureka_validation hold the
#  per-paragraph matrices built earlier in the script)
pym_validation = pad(
    [u for u in pym_validation if u.shape[0] <= max_expected_length],
    max_expected_length)
pym_validation = np.asarray(pym_validation)
pym_predictions = sequential_model.predict(pym_validation)
pym_accuracy = sum([probs[0] > 0.5 for probs in pym_predictions]) / len(pym_predictions)
nlp_logger.warning("Accuracy for Poe/pym: {:.4f}".format(pym_accuracy))

# apply the model to tom
tom_validation = pad(
    [u for u in tom_validation if u.shape[0] <= max_expected_length],
    max_expected_length)
tom_validation = np.asarray(tom_validation)
tom_predictions = sequential_model.predict(tom_validation)
tom_accuracy = sum([probs[1] > 0.5 for probs in tom_predictions]) / len(tom_predictions)
nlp_logger.warning("Accuracy for Twain/tom: {:.4f}".format(tom_accuracy))

# apply the model to Eureka
eureka_validation = pad(
    [u for u in eureka_validation if u.shape[0] <= max_expected_length],
    max_expected_length)
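# Equivalent accuracy computation with numpy, assuming the model outputs one
# softmax column per author (column 0 = Poe, column 1 = Twain), which is what
# the probs[0]/probs[1] thresholds above imply; a sketch, not project code:
pym_accuracy = float(np.mean(np.argmax(pym_predictions, axis=1) == 0))
tom_accuracy = float(np.mean(np.argmax(tom_predictions, axis=1) == 1))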
import pickle
import random

from biblio_eater import BiblioEater
from constants import *
from padder import pad
from the_logger import nlp_logger

# Load training set from disk first
with open(TRAINING_SET_FILE, "rb") as infile:
    training_set = pickle.load(infile)

# pad with zero rows up to max sentence length
max_length = max([s.shape[0] for s in training_set])
training_set = pad(training_set, max_length)

with open(LABELS_FILE, "rb") as infile:
    writer_labels = pickle.load(infile)

# Sequential network
nlp_logger.warning("Shape of training set ({}, {})".format(
    training_set[0].shape[0], training_set[0].shape[1]))

# Prepare net
biblio_eater = BiblioEater()
# all sentences are padded to the same length, although Keras has a padding option that we are not using
biblio_eater.design_sequential_net(training_set[0].shape[0], training_set[0].shape[1])
biblio_eater.train_sequential_net(training_set, writer_labels)
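# The pad helper imported from padder is not shown in these listings. A
# minimal sketch consistent with the comment above ("pad with zero rows"),
# assuming each training example is a 2-D numpy array of shape
# (num_words, num_features):
import numpy as np

def pad(matrices, max_length):
    # append zero rows to each matrix so they all share max_length rows
    padded = []
    for m in matrices:
        filler = np.zeros((max_length - m.shape[0], m.shape[1]))
        padded.append(np.vstack([m, filler]))
    return padded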
def to_file(self, filename):
    # serializes book for further processing
    with open(filename, 'wb') as outfile:
        pickle.dump(self.__dict__, outfile)
    nlp_logger.warning("{} doc serialized to disk".format(self.name))
def parse(self, pipeline, crop_value):
    self.doc = pipeline(self.crop(crop_value))
    nlp_logger.warning("{} doc processed with {} sentences".format(
        self.name, len(self.doc.sentences)))
    # free GPU memory held by the pipeline (requires torch imported at module level)
    torch.cuda.empty_cache()
    self.compute_describe_freqs()
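# crop is not shown in these listings. A minimal sketch consistent with the
# crop_value = -1 default used in the driver script, assuming a negative
# value means "use the whole corpus" and a positive value truncates it:
def crop(self, crop_value):
    if crop_value < 0:
        return self.corpus
    return self.corpus[:crop_value]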
def load_corpus(self, corpus_filename):
    with open(corpus_filename, "r", encoding="utf-8") as f:
        self.corpus = f.read()
    nlp_logger.warning("{} corpus loaded".format(self.name))