def onehot_char_training_data(lang='deu', n=700, data_paths=()):
    df = get_data(lang)
    n = int(len(df) * n) if n <= 1 else n
    df = df.iloc[:n]
    input_texts, target_texts = [], []  # <1>
    input_vocabulary = set()  # <3>
    output_vocabulary = set()
    start_token, stop_token = '\t\n'  # <2>
    n = len(df)

    for input_text, target_text in tqdm(zip(df.eng, df[lang]), total=n):
        target_text = start_token + target_text \
            + stop_token  # <7>
        input_texts.append(input_text)
        target_texts.append(target_text)
        for char in input_text:  # <8>
            if char not in input_vocabulary:
                input_vocabulary.add(char)
        for char in target_text:
            if char not in output_vocabulary:
                output_vocabulary.add(char)

    input_vocabulary = sorted(input_vocabulary)  # <1>
    output_vocabulary = sorted(output_vocabulary)

    input_vocab_size = len(input_vocabulary)  # <2>
    output_vocab_size = len(output_vocabulary)
    max_encoder_seq_length = max([len(txt) for txt in input_texts])  # <3>
    max_decoder_seq_length = max([len(txt) for txt in target_texts])

    input_token_index = dict([
        (char, i) for i, char in enumerate(input_vocabulary)])  # <4>
    target_token_index = dict([
        (char, i) for i, char in enumerate(output_vocabulary)])

    encoder_input_data = np.zeros(
        (n, max_encoder_seq_length, input_vocab_size), dtype='float32')  # <2>
    decoder_input_data = np.zeros(
        (n, max_decoder_seq_length, output_vocab_size), dtype='float32')
    decoder_target_data = np.zeros(
        (n, max_decoder_seq_length, output_vocab_size), dtype='float32')

    for i, (input_text, target_text) in enumerate(
            tqdm(zip(input_texts, target_texts), total=len(target_texts))):  # <3>
        for t, char in enumerate(input_text):  # <4>
            encoder_input_data[i, t, input_token_index[char]] = 1.  # <5>
        for t, char in enumerate(target_text):  # <6>
            decoder_input_data[i, t, target_token_index[char]] = 1.
            if t > 0:
                decoder_target_data[i, t - 1, target_token_index[char]] = 1

    trainset = (encoder_input_data, decoder_input_data, decoder_target_data)
    for i, p in enumerate(data_paths):
        np.save(p, trainset[i][:n], allow_pickle=False)
    return encoder_input_data, decoder_input_data, decoder_target_data
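# A minimal usage sketch for onehot_char_training_data(), assuming numpy (np), tqdm,
# and nlpia.loaders.get_data are imported at module level (as the function body
# requires) and that the 'deu' Anki flashcard table is available through get_data().
# The sample count and the printed shapes are illustrative, not values from the
# original script.
if __name__ == '__main__':
    encoder_x, decoder_x, decoder_y = onehot_char_training_data(lang='deu', n=1000)
    # Each array has shape (num_samples, max_seq_length, vocab_size); the decoder
    # target is the decoder input shifted one time step earlier (teacher forcing).
    print(encoder_x.shape, decoder_x.shape, decoder_y.shape)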
def load_data(data_dir=None):
    data_dir = data_dir or os.path.join(os.path.sep + 'midata', 'viddesc')
    descriptions = pd.read_table(
        os.path.join(data_dir, 'LSMDC16_annos_training.csv'), header=None)
    descriptions.columns = 'filename start_2s end_2s start end description'.split()
    embeddings = load_embeddings(os.path.join(data_dir, 'embeddings'))
    wv = get_data('word2vec')
    return wv, descriptions, embeddings
def load_dialog(self, name='movie_dialog'):
    if name == 'dsfaq':
        db = load_faq()
    else:
        db = get_data(name)
    log.info(f'Loaded {len(db)} {self.name} statement-reply pairs.')
    if self.limit <= len(db):
        log.info(
            f'Limiting {self.name} database to {self.limit} statement-reply pairs.')
        db = db.iloc[:self.limit]
    db = dict(zip(db[db.columns[0]], db[db.columns[1]]))
    return db
def get_word_vectors(vocab):
    """ Create a word2vec embedding matrix for all the words in the vocab """
    wv = get_data('word2vec')
    # allocate a zeroed (num_tokens x embedding_dim) matrix; np.array(len, dim) was a bug
    vectors = np.zeros((len(vocab), len(wv['the'])))
    for i, tok in enumerate(vocab):
        word = tok[0]
        variations = (word, word.lower(), word.lower()[:-1])
        for w in variations:
            if w in wv:
                vectors[i, :] = wv[w]
                break  # keep the most exact variation that word2vec knows about
        if not np.sum(np.abs(vectors[i])):
            logger.warning('Unable to find {}, {}, or {} in word2vec.'.format(*variations))
    return vectors
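# A small, hypothetical usage sketch for get_word_vectors(), assuming numpy (np) and
# a module-level `logger` as in the function above. The vocab here is a made-up list
# of (token, count) pairs; only the first element of each pair is used for the lookup.
example_vocab = [('Sacramento', 3), ('the', 41), ('NLPIA', 1)]
embedding_matrix = get_word_vectors(example_vocab)
# One 300-D Google News word2vec row per vocab entry; rows left at zero mark tokens
# that no variation (exact, lowercased, crudely stemmed) could resolve.
print(embedding_matrix.shape)  # (3, 300)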
def get_anki_phrases_english(limit=None):
    """ Return all the English phrases in the Anki translation flashcards

    >>> len(get_anki_phrases_english(limit=100)) > 700
    True
    """
    texts = set()
    for lang in ANKI_LANGUAGES:
        df = get_data(lang)
        phrases = df.eng.str.strip().values
        texts = texts.union(set(phrases))
        if limit and len(texts) >= limit:
            break
    return sorted(texts)
def get_anki_phrases(lang='english', limit=None):
    """ Retrieve as many Anki paired-statement corpora as you can for the requested language

    If more than one language is requested, return the English texts associated with those languages.

    TODO: improve modularity: define a function that takes a single language and call it recursively if necessary

    >>> get_anki_phrases('afr')[:2]
    ["'n Groen piesang is nie ryp genoeg om te eet nie.", "'n Hond het agter die kat aan gehardloop."]
    """
    lang = lang.strip().lower()[:3]
    lang = LANG2ANKI[lang[:2]] if lang not in ANKI_LANGUAGES else lang
    if lang[:2] == 'en':
        return get_anki_phrases_english(limit=limit)
    return sorted(get_data(lang).iloc[:, -1].str.strip().values)
import os
import re
import sys

import pandas as pd
import numpy as np
from tqdm import tqdm

from nlpia.loaders import get_data


if len(sys.argv) > 1:
    lang = sys.argv[1][:3].lower()
else:
    lang = 'spa'

df = get_data(lang)
if lang not in df.columns:
    # print(df.columns)
    print(f"changing language name {lang} to {list(df.columns)[-1]}")
    lang = list(df.columns)[-1]

input_texts, target_texts = [], []  # <1>
input_vocabulary = set()  # <3>
output_vocabulary = set()
start_token, stop_token = '\t\n'  # <2>
n = int(len(df) * .1)

encoder_input_path = 'encoder_input_data-{}-{}.np'.format(lang, n)
decoder_input_path = 'decoder_input_data-{}-{}.np'.format(lang, n)
decoder_target_path = 'decoder_target_data-eng-{}.np'.format(n)
@author: tonymullen
"""
import numpy as np
import shared
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Conv1D, GlobalMaxPooling1D
from nlpia.loaders import get_data

np.random.seed(1337)

word_vectors = get_data('w2v', limit=200000)

data_file_root = '/Users/tonymullen/Dropbox/Northeastern/Classes/NLP/Datasets'
# https://ai.stanford.edu/~amaas/data/sentiment/
number_of_files = 5000

dataset = shared.pre_process_data(data_file_root + '/aclimdb/train', number_of_files)
# dataset = shared.pre_process_data(data_file_root + '/miniImdb/train')
vectorized_data = shared.tokenize_and_vectorize(dataset, word_vectors)
expected = shared.collect_expected(dataset)

split_point = int(len(vectorized_data) * .8)
x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
""" Example python snippets and listing in [Chapter 6](http://bit.ly/ghnlpia)""" # import pandas as pd import numpy as np from nlpia.loaders import get_data from sklearn.decomposition import PCA wv = get_data('word2vec') """ >>> from nlpia.loaders import get_data >>> wv = get_data('word2vec') >>> naive_vector = wv['woman'] + wv['Europe'] + wv[physics'] +\ ... wv['scientist'] >>> naive_vector array([ 0.87109375, -0.08511353, 0.7817383 , 0.25634766, -0.10058594, ... 0.20800781, 0.06420898, 0.09033203, 0.8261719 , -0.2545166 ], dtype=float32) >>> wv.similar_by_vector(naive_vector) [('scientist', 0.7510349750518799), ('physicist', 0.7328184843063354), ('physics', 0.7083248496055603), ('theoretical_physicist', 0.6038039922714233), ('astrophysicist', 0.6009320020675659), ('mathematician', 0.5989038944244385), ('particle_physicist', 0.5962826013565063), ('Physicist', 0.5940043926239014), ('biochemist', 0.5833224058151245), ('physicists', 0.577854573726654)]
>>> for input_text, target_text in zip(df.statement, df.reply):
...     target_text = start_token + target_text \
...         + stop_token  # <5>
...     input_texts.append(input_text)
...     target_texts.append(target_text)
...     for char in input_text:  # <6>
...         if char not in input_vocabulary:
...             input_vocabulary.add(char)
...     for char in target_text:
...         if char not in output_vocabulary:
...             output_vocabulary.add(char)
"""
import os

from nlpia.loaders import get_data, DATA_PATH

df = get_data(os.path.join(DATA_PATH, '..', 'book', 'data', 'dialog.txt'))
df.columns = 'statement reply'.split()
df = df.fillna(' ')

input_texts, target_texts = [], []  # <1>
input_vocabulary = set()  # <2>
output_vocabulary = set()
start_token = '\t'  # <3>
stop_token = '\n'
max_training_samples = min(25000, len(df) - 1)  # <4>

for input_text, target_text in zip(df.statement, df.reply):
    target_text = start_token + target_text \
        + stop_token  # <5>
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text:  # <6>
import glob
import os
from random import shuffle

from nltk.tokenize import TreebankWordTokenizer
from nlpia.loaders import get_data

word_vectors = get_data('wv')


def pre_process_data(filepath):
    """
    Load pos and neg examples from separate dirs then shuffle them together.
    """
    positive_path = os.path.join(filepath, 'pos')
    negative_path = os.path.join(filepath, 'neg')
    pos_label = 1
    neg_label = 0
    dataset = []
    for filename in glob.glob(os.path.join(positive_path, '*.txt')):
        with open(filename, 'r') as f:
            dataset.append((pos_label, f.read()))
    for filename in glob.glob(os.path.join(negative_path, '*.txt')):
        with open(filename, 'r') as f:
            dataset.append((neg_label, f.read()))
    shuffle(dataset)
    return dataset


def tokenize_and_vectorize(dataset):
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    for sample in dataset:
>>> for reply in df.reply:
...     output_vocab.update(set(reply))
>>> input_vocab = tuple(sorted(input_vocab))
>>> output_vocab = tuple(sorted(output_vocab))
>>> input_vocabulary = tuple(sorted(input_vocab))
>>> output_vocabulary = tuple(sorted(output_vocab))
>>> max_encoder_seq_len = df.statement.str.len().max()  # <3>
>>> max_decoder_seq_len = df.target.str.len().max()
>>> max_encoder_seq_len, max_decoder_seq_len
(100, 102)
"""
import os

from nlpia.loaders import get_data

df = get_data('moviedialog')
df.columns = 'statement reply'.split()
df = df.dropna()

input_texts, target_texts = [], []  # <1>
start_token, stop_token = '\t\n'  # <3>
input_vocab = set()  # <2>
output_vocab = set(start_token + stop_token)
n_samples = min(100000, len(df))  # <4>

df['target'] = start_token + df.reply + stop_token
[input_vocab.update(set(statement)) for statement in df.statement]
[output_vocab.update(set(reply)) for reply in df.reply]

input_vocab = tuple(sorted(input_vocab))  # <6>
output_vocab = tuple(sorted(output_vocab))

max_encoder_seq_len = df.statement.str.len().max()
# script adapted from https://gist.github.com/lampts/026a4d6400b1efac9a13a3296f16e655
import gensim
import numpy as np
import tensorflow as tf
from nlpia.loaders import get_data
from tensorflow.contrib.tensorboard.plugins import projector

words = ('Sacramento', 'California', 'Oregon', 'Salem', 'Washington', 'Olympia')

# load your gensim word2vec model
# model = gensim.models.KeyedVectors.load_word2vec_format(
#     '~/Downloads/GoogleNews-vectors-negative300.bin.gz', binary=True, limit=200000)
model = get_data('w2v', limit=200000)  # <1>

# project a small part of the vocab: here just 6 words of 300 dimensions each
w2v_10K = np.zeros((6, 300))
with open("/Users/hannes/Downloads/prefix_metadata.tsv", 'w+') as file_metadata:
    # for i, word in enumerate(model.index2word[:200000]):
    #     w2v_10K[i] = model[word]
    #     file_metadata.write(word + '\n')
    for i, word in enumerate(list(words)):
        w2v_10K[i] = model[word]
        file_metadata.write(word + '\n')  # the file is opened in text mode, so write str rather than bytes

# define the model without training
sess = tf.InteractiveSession()

with tf.device("/cpu:0"):
from __future__ import print_function, unicode_literals, division, absolute_import
from builtins import (bytes, dict, int, list, object, range, str,  # noqa
                      ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
from future import standard_library
standard_library.install_aliases()  # noqa: Counter, OrderedDict,

import os
from itertools import product

import pandas as pd

from nlpia.constants import DATA_PATH
from nlpia.loaders import get_data

wisconsin = get_data('sentiment-word-ratings-sensori-motor-wisconsin.csv.gz')
warriner = get_data('sentiment-word-ratings-warriner.csv.gz')


def get_sentiment_sensori_motor():
    df = pd.read_html('http://www.neuro.mcw.edu/ratings/')[0]
    df.columns = ['word'] + [i.lower() + "_" + j.lower()
                             for i, j in product(df.iloc[0][1:6], df.iloc[1][1:3])]
    df = df.iloc[2:]
    df.to_csv(os.path.join(DATA_PATH, 'sentiment-word-ratings-sensori-motor-wisconsin.csv.gz'),
              compression='gzip')
    return df


def get_sentiment_warriner():
    senti = pd.read_csv('http://crr.ugent.be/papers/Ratings_Warriner_et_al.csv',
                        index_col='Word', header=0)
    senti.columns = [c.replace('.', '_') for c in senti.columns]
    del senti['Unnamed: 0']
def wordvector_training_data(lang='deu', n=700, data_paths=()):
    df = get_data(lang)
    n = int(len(df) * n) if n <= 1 else n
    n = min(len(df), n)
    df = df.iloc[:n]
    input_texts, target_texts = [], []  # <1>
    input_vocabulary = set()  # <3>
    output_vocabulary = set()
    start_token, stop_token = '<START>', '<STOP>'
    input_tokenizer, output_tokenizer = Tokenizer(), Tokenizer()
    wv = get_data('word2vec')
    EMBEDDING_DIM = len(wv['the'])

    for input_text, target_text in tqdm(zip(df.eng, df[lang]), total=n):
        # pad the start/stop tokens with spaces so the tokenizer sees them as separate tokens
        target_text = start_token + ' ' + target_text + ' ' + stop_token
        input_texts.append(input_text)
        target_texts.append(target_text)
        # populate the character vocabularies so the one-hot block below still works
        for char in input_text:
            input_vocabulary.add(char)
        for char in target_text:
            output_vocabulary.add(char)

    # texts = input_texts + target_texts
    # assert(len(texts) == n * 2)
    # input_texts = texts[:n]
    # target_texts = texts[n:]
    input_tokenizer.fit_on_texts(input_texts)
    output_tokenizer.fit_on_texts(target_texts)
    input_sequences = input_tokenizer.texts_to_sequences(input_texts)
    target_sequences = output_tokenizer.texts_to_sequences(target_texts)
    input_sequences = pad_sequences(input_sequences, maxlen=MAX_INPUT_SEQUENCE_LENGTH)
    target_sequences = pad_sequences(target_sequences, maxlen=MAX_TARGET_SEQUENCE_LENGTH)

    embedding_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM))
    for w, i in input_tokenizer.word_index.items():
        if w in wv.vocab:
            embedding_matrix[i] = wv.word_vec(w)
    print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix != 0, axis=1) == 0))

    # the remainder still builds character-level one-hot tensors alongside the
    # word-level sequences above
    input_vocabulary = sorted(input_vocabulary)  # <1>
    output_vocabulary = sorted(output_vocabulary)

    input_vocab_size = len(input_vocabulary)  # <2>
    output_vocab_size = len(output_vocabulary)
    max_encoder_seq_length = max([len(txt) for txt in input_texts])  # <3>
    max_decoder_seq_length = max([len(txt) for txt in target_texts])

    input_token_index = dict([
        (char, i) for i, char in enumerate(input_vocabulary)])  # <4>
    target_token_index = dict([
        (char, i) for i, char in enumerate(output_vocabulary)])

    encoder_input_data = np.zeros(
        (n, max_encoder_seq_length, input_vocab_size), dtype='float32')  # <2>
    decoder_input_data = np.zeros(
        (n, max_decoder_seq_length, output_vocab_size), dtype='float32')
    decoder_target_data = np.zeros(
        (n, max_decoder_seq_length, output_vocab_size), dtype='float32')

    for i, (input_text, target_text) in enumerate(
            tqdm(zip(input_texts, target_texts), total=len(target_texts))):  # <3>
        for t, char in enumerate(input_text):  # <4>
            encoder_input_data[i, t, input_token_index[char]] = 1.  # <5>
        for t, char in enumerate(target_text):  # <6>
            decoder_input_data[i, t, target_token_index[char]] = 1.
            if t > 0:
                decoder_target_data[i, t - 1, target_token_index[char]] = 1

    trainset = (encoder_input_data, decoder_input_data, decoder_target_data)
    for i, p in enumerate(data_paths):
        np.save(p, trainset[i][:n], allow_pickle=False)
    return encoder_input_data, decoder_input_data, decoder_target_data
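# A minimal, self-contained sketch of how a pretrained embedding matrix like the one
# built inside wordvector_training_data() could feed a Keras Embedding layer. The
# vocabulary size, embedding dimension, and sequence length below are illustrative
# assumptions, not values from the original module.
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding

num_words, embedding_dim, seq_len = 10000, 300, 50  # hypothetical sizes
pretrained_weights = np.zeros((num_words, embedding_dim))  # stand-in for embedding_matrix

model = Sequential()
model.add(Embedding(num_words, embedding_dim,
                    weights=[pretrained_weights],  # initialize from word2vec rows
                    input_length=seq_len,
                    trainable=False))              # freeze the pretrained vectors
model.compile('rmsprop', 'mse')
model.summary()  # embedding output shape should be (None, 50, 300)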