def prepare_train():
    global tokenizer
    print("prepare training data")
    with FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='r+') as f:
        texts = pickle.load(f)[:25000]
    with FileIO(os.path.join(FLAGS.buckets, "texts_unsup.pkl"), mode='r+') as f:
        texts += pickle.load(f)
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(texts)
    sequence = tokenizer.texts_to_sequences(texts)
    sum_words = sum([len(seq) for seq in sequence])
    print('there are %d words' % sum_words)
    x = np.zeros((sum_words, 1), dtype=np.int32)
    y = np.zeros((sum_words, 1), dtype=np.int32)
    index = 0
    for i, seq in enumerate(sequence):
        for s in seq:
            x[index] = i
            y[index] = s
            index += 1
    indice = np.arange(sum_words)
    np.random.shuffle(indice)
    x = x[indice]
    y = y[indice]
    return x, y, sum_words

def get_input():
    with FileIO(os.path.join(FLAGS.buckets, "20news/texts.pkl"), mode='r+') as f:
        texts = pickle.load(f)
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(texts[:num_train])
    sequences = tokenizer.texts_to_sequences(texts)
    sequences_reverse = [list(reversed(seq)) for seq in sequences]
    x = pad_sequences(sequences, maxlen=max_len)
    x_reverse = pad_sequences(sequences_reverse, maxlen=max_len)
    word_index = tokenizer.word_index
    embeddings_index = {}
    wordX = np.load(
        FileIO(os.path.join(FLAGS.buckets, "glove/embedding.300d.npy"), mode='r+'))
    allwords = pickle.load(
        FileIO(os.path.join(FLAGS.buckets, "glove/words.pkl"), mode='r+'))
    print(len(allwords))
    for i in range(len(allwords)):
        embeddings_index[allwords[i]] = wordX[i, :]
    embedding_matrix = np.zeros((num_words, 300))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None and i < num_words:
            embedding_matrix[i] = embedding_vector
    x_train_0 = x[:num_train]
    x_train_1 = x_reverse[:num_train]
    x_test_0 = x[num_train:]
    x_test_1 = x_reverse[num_train:]
    y_train = np.load(FileIO(os.path.join(FLAGS.buckets, "20news/Ytrain.npy"), mode='r+'))
    y_train = to_categorical(y_train)
    y_test = np.load(FileIO(os.path.join(FLAGS.buckets, "20news/Ytest.npy"), mode='r+'))
    y_test = to_categorical(y_test)
    return x_train_0, x_train_1, y_train, x_test_0, x_test_1, y_test, embedding_matrix

def f1():
    try:
        file_path = root_dir + 'data/github/test_codevecs_npy/use.codevecs_0.npy'
        with FileIO(file_path, mode="rb") as fio:
            code_vec = np.load(fio)
            print('read .npy succeeded', len(code_vec))
            print(code_vec[0][:10])
    except Exception as e:
        print('failed to read .npy')
        print(e)
    try:
        file_path = root_dir + 'data/github/vocab.apiseq.pkl'
        with FileIO(file_path, mode="rb") as fio:
            api_seq_vocab = pk.load(fio)
            print('read .pkl succeeded', len(api_seq_vocab))
            print(list(api_seq_vocab.keys())[:3])
    except Exception as e:
        print('failed to read .pkl')
        print(e)
    try:
        file_path = root_dir + 'data/github/use.search.txt'
        with FileIO(file_path, mode="r") as fio:
            lines = fio.readlines()
            print('read .txt succeeded')
            print(lines[0])
            print(lines[1])
            print(lines[2])
    except Exception as e:
        print('failed to read .txt')
        print(e)

def update_datasets(self, filter=None):
    if filter is None:
        filter = self._filter
    file_list = []
    log.info("Updating datasets from file list: %s", self._source_file)
    if self._source_file.startswith("gs://"):
        log.info("Using tensorflow for IO")
        from tensorflow.python.lib.io.file_io import FileIO
        input_file = FileIO(self._source_file, "r")
        log.info("Tensorflow reported size: %d", input_file.size())
    else:
        input_file = open(self._source_file)
    lines = input_file.readlines()
    for line in lines:
        fpath = line.strip()
        parts = fpath.split("/")
        file_name = parts[-1]
        directory_name = "/".join(parts[:-1])
        match = self._re.match(file_name)
        if not match:
            continue
        match_components = match.groupdict()
        dataset_path = self._prepend_path + fpath
        dataset_id = self.update_dataset(match_components=match_components,
                                         dataset_path=dataset_path)
        dataset = self.get_dataset_by_id(dataset_id)
        if not filter(dataset_id, match_components, dataset):
            self.remove_dataset_by_id(dataset_id)
    input_file.close()

def get_input():
    with FileIO(os.path.join(FLAGS.buckets, 'imdb/texts.pkl'), 'r+') as f:
        texts = pickle.load(f)
    word_index = pickle.load(
        FileIO(os.path.join(FLAGS.buckets, "word_index.pkl"), mode='r+'))
    ngram_index = pickle.load(
        FileIO(os.path.join(FLAGS.buckets, "ngram_index.pkl"), mode='r+'))
    sequence = []
    for sentence in texts:
        t_s = []
        for token in sentence.split(' '):
            if token in word_index:
                t_s.append(str(word_index[token]))
        sequence.append(t_s)
    new_sequence = []
    for seq in sequence:
        t_s = []
        for i in range(len(seq) - 2):
            s = '_'.join(seq[i:i + 3])
            if s in ngram_index and ngram_index[s] <= num_ngram:
                t_s.append(ngram_index[s])
        new_sequence.append(t_s)
    new_sequence = pad_sequences(new_sequence, maxlen=max_len)
    x_train = new_sequence[:25000]
    x_test = new_sequence[25000:]
    y_train = np.zeros((25000,), dtype=np.float32)
    y_test = np.zeros((25000,), dtype=np.float32)
    y_train[12500:25000] = np.ones((12500,), dtype=np.float32)
    y_test[12500:25000] = np.ones((12500,), dtype=np.float32)
    return x_train, y_train, x_test, y_test

def f3():
    a = np.array([5, 4, 3, 2, 1])
    file_path = root_dir + 'a.npy'
    with FileIO(file_path, mode="wb") as fio:
        np.save(fio, a)
    with FileIO(file_path, mode="rb") as fio:
        code_vec = np.load(fio)
        print('read .npy succeeded', code_vec)

def get_input():
    with FileIO(os.path.join(FLAGS.buckets, "imdb/texts.pkl"), mode='r+') as f:
        texts = pickle.load(f)
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(texts[0:25000])
    sequences = tokenizer.texts_to_sequences(texts)
    sequences_reverse = [list(reversed(seq)) for seq in sequences]
    x = pad_sequences(sequences, maxlen=max_len)
    x_reverse = pad_sequences(sequences_reverse, maxlen=max_len)
    word_index = tokenizer.word_index
    embeddings_index = {}
    wordX = np.load(
        FileIO(os.path.join(FLAGS.buckets, "glove/embedding.300d.npy"), mode='r+'))
    allwords = pickle.load(
        FileIO(os.path.join(FLAGS.buckets, "glove/words.pkl"), mode='r+'))
    print(len(allwords))
    for i in range(len(allwords)):
        embeddings_index[allwords[i]] = wordX[i, :]
    embedding_matrix = np.zeros((num_words, 300))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None and i < num_words:
            embedding_matrix[i] = embedding_vector
    x_train_0 = x[:25000]
    x_train_1 = x_reverse[:25000]
    x_test_0 = x[25000:]
    x_test_1 = x_reverse[25000:]
    y_train = np.zeros((25000,), dtype=np.float32)
    y_test = np.zeros((25000,), dtype=np.float32)
    y_train[12500:25000] = np.ones((12500,), dtype=np.float32)
    y_test[12500:25000] = np.ones((12500,), dtype=np.float32)
    indice = np.arange(25000)
    np.random.shuffle(indice)
    x_train_0 = x_train_0[indice]
    x_test_0 = x_test_0[indice]
    x_train_1 = x_train_1[indice]
    x_test_1 = x_test_1[indice]
    y_train = y_train[indice]
    y_test = y_test[indice]
    result = []
    result.append(x_train_0)
    result.append(x_train_1)
    result.append(x_test_0)
    result.append(x_test_1)
    result.append(y_train)
    result.append(y_test)
    result.append(embedding_matrix)
    return result

def is_database_created(username):
    filename = "{}.csv".format(username)
    file_exists = exists_in_gcp(filename)
    if file_exists:
        with FileIO(os.path.join("gs://", BUCKET_NAME, filename), 'r') as f:
            DATABASES[username] = pd.read_csv(f)
    else:
        DATABASES[username] = pd.DataFrame(
            columns=["username", "date", "cause", "spent"])
        with FileIO(os.path.join("gs://", BUCKET_NAME, filename), 'w') as f:
            DATABASES[username].to_csv(f)
    return not file_exists

def get_input():
    f = FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='r+')
    texts = pickle.load(f)
    f.close()
    tokenizer = Tokenizer(nb_words=num_words)
    tokenizer.fit_on_texts(texts[0:25000])
    # sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    sequences = []
    for i in range(50000):
        t = []
        tokens = texts[i].lower().split(' ')
        for j in range(len(tokens)):
            index = word_index.get(tokens[j], 0)
            if index < num_words:
                t.append(index)
            else:
                t.append(0)
        sequences.append(t)
    print('Found %s unique tokens.' % len(word_index))
    data1 = pad_sequences(sequences[0:25000], maxlen=max_len)
    data2 = pad_sequences(sequences[25000:50000], maxlen=max_len)
    Ytrain = np.zeros((25000,), dtype=np.float32)
    Ytest = np.zeros((25000,), dtype=np.float32)
    Ytrain[12500:25000] = np.ones((12500,), dtype=np.float32)
    Ytest[12500:25000] = np.ones((12500,), dtype=np.float32)
    # 4-gram encoding: each window of 4 tokens becomes 4 ids, with the k-th
    # position offset by k * num_words so each position uses a disjoint id range
    Xtrain = np.zeros((25000, (max_len - 3) * 4), dtype=np.int)
    Xtest = np.zeros((25000, (max_len - 3) * 4), dtype=np.int)
    for i in range(25000):
        for j in range(max_len - 3):
            Xtrain[i, j * 4] = data1[i, j]
            Xtrain[i, j * 4 + 1] = data1[i][j + 1] + num_words
            Xtrain[i, j * 4 + 2] = data1[i][j + 2] + num_words * 2
            Xtrain[i, j * 4 + 3] = data1[i][j + 3] + num_words * 3
    for i in range(25000):
        for j in range(max_len - 3):
            Xtest[i, j * 4] = data2[i, j]
            Xtest[i, j * 4 + 1] = data2[i][j + 1] + num_words
            Xtest[i, j * 4 + 2] = data2[i][j + 2] + num_words * 2
            Xtest[i, j * 4 + 3] = data2[i][j + 3] + num_words * 3
    indice = np.arange(25000)
    np.random.shuffle(indice)
    Xtrain = Xtrain[indice]
    Ytrain = Ytrain[indice]
    Xtest = Xtest[indice]
    Ytest = Ytest[indice]
    return Xtrain, Ytrain, Xtest, Ytest

def prepare_train():
    print("prepare training data")
    f = FileIO(os.path.join(FLAGS.buckets, 'texts.pkl'), 'rb')
    text1 = pickle.load(f)
    text1 = text1[:25000]
    f.close()
    f = FileIO(os.path.join(FLAGS.buckets, 'texts_unsup.pkl'), 'rb')
    text2 = pickle.load(f)
    f.close()
    texts = text1 + text2
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.filters = ''
    tokenizer.fit_on_texts(texts)
    sequence = tokenizer.texts_to_sequences(texts)
    sequence_pad = pad_sequences(sequence,
                                 maxlen=MAX_DOCUMENT_LENGTH + 1,
                                 dtype=np.int32,
                                 padding='post',
                                 truncating='post')
    seq_len = []
    for i in range(len(sequence)):
        r = len(sequence[i])
        if r < MAX_DOCUMENT_LENGTH:
            seq_len.append(r)
        else:
            seq_len.append(MAX_DOCUMENT_LENGTH)
    x_1 = sequence_pad[:, :-1]
    y_ = sequence_pad[:, 1:]
    return x_1, seq_len, y_

def write_tfrecord(fname, dataset, log_every=100, pre_fn=None):
    """Helper function to convert a dataset object into a tfrecord file.

    fname must end with .yml or .yaml. The data will be written in a .tfr
    file with the same suffix.

    Args:
        fname (str): filename of the dataset to be saved.
        dataset (Dataset): input dataset.
        log_every (int): log progress every this many samples.
        pre_fn (callable): optional preprocessing applied to each sample.
    """
    def _bytes_feature(value):
        """Returns a bytes_list from a string / byte."""
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

    # Preparation
    tfr = '.'.join(fname.split('.')[:-1] + ['tfr'])
    writer = tf.python_io.TFRecordWriter(tfr)
    tensors = dataset.make_one_shot_iterator().get_next()
    if pre_fn:
        tensors = pre_fn(tensors)
        dataset = dataset.map(pre_fn)
    types = dataset.output_types
    shapes = dataset.output_shapes
    # Sanity check
    assert (type(types) == dict and
            all(type(v) != dict for v in types.values())), \
        "Only dataset of non-nested dictionary is supported."
    assert fname.endswith('.yml'), "Filename must end with .yml."
    serialized = {k: tf.serialize_tensor(v) for k, v in tensors.items()}
    sess = tf.Session()
    # Writing Loop
    n_parsed = 0
    try:
        while True:
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    key: _bytes_feature(val)
                    for key, val in sess.run(serialized).items()
                }))
            writer.write(example.SerializeToString())
            n_parsed += 1
            if n_parsed % log_every == 0:
                sys.stdout.write('\r {} samples written to {} ...'.format(
                    n_parsed, tfr))
                sys.stdout.flush()
    except tf.errors.OutOfRangeError:
        print('\r {} samples written to {}, done.'.format(n_parsed, tfr))
    sess.close()
    writer.close()
    # Write metadata
    format_dict = {
        k: {
            'dtype': types[k].name,
            'shape': shapes[k].as_list()
        }
        for k in types.keys()
    }
    info_dict = {'n_sample': n_parsed}
    with FileIO(fname, 'w') as f:
        yaml.safe_dump({'format': format_dict, 'info': info_dict}, f)

def load_embeddings(vocab, dim, filename):
    """
    Load a subset of embedding vectors from file corresponding to the vocabulary provided.

    Args:
        vocab: string->int map from words to their ids (the id corresponds to the
            vector's row in the resulting embedding matrix). All ids > 0.
        dim: embedding vector dimension
        filename: file where each line is a word followed by `dim` floats,
            all space-separated

    Returns:
        MxN = (len(vocab)+1) x dim numpy embedding matrix. The +1 for M is
        because the 0th vector is a zero vector for padding.
    """
    em = np.zeros((len(vocab) + 1, dim), dtype="float32")
    # with FileIO(filename, "r", encoding="utf-8") as f:
    with FileIO(filename, "r") as f:
        for linenum, line in enumerate(f):
            line = unidecode(line)
            idx = line.find(' ')
            if idx < 0:
                print("malformed line, no space found: line", linenum)
                continue
            word = line[:idx]
            if word not in vocab:
                continue
            i = vocab[word]
            em[i, :] = np.array(line.strip().split()[1:], dtype="float32")
    return em

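# Hedged usage sketch for load_embeddings above (not part of the original
# source); the vocabulary and the GloVe-style file path are hypothetical.
def _example_load_embeddings():
    vocab = {"the": 1, "cat": 2, "sat": 3}  # word -> id, ids start at 1
    em = load_embeddings(vocab, dim=300,
                         filename="gs://my-bucket/glove.6B.300d.txt")
    print(em.shape)  # (len(vocab) + 1, 300); row 0 stays zero for padding
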
def load_tfrecord(fname):
    """Load a tfrecord dataset written by write_tfrecord.

    Args:
        fname (str): filename of the .yml metadata file to be loaded.
    """
    # dataset
    with FileIO(fname, 'r') as f:
        format_dict = yaml.safe_load(f)['format']
    dtypes = {k: format_dict[k]['dtype'] for k in format_dict.keys()}
    shapes = {k: format_dict[k]['shape'] for k in format_dict.keys()}
    feature_dict = {k: tf.FixedLenFeature([], tf.string) for k in dtypes}

    def parser(example):
        return tf.parse_single_example(example, feature_dict)

    def converter(tensors):
        tensors = {
            k: tf.parse_tensor(v, dtypes[k])
            for k, v in tensors.items()
        }
        [v.set_shape(shapes[k]) for k, v in tensors.items()]
        return tensors

    tfr = '.'.join(fname.split('.')[:-1] + ['tfr'])
    dataset = tf.data.TFRecordDataset(tfr).map(parser).map(converter)
    return dataset

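# Hedged round-trip sketch for write_tfrecord / load_tfrecord above, using the
# same TF 1.x-era APIs as the originals; the toy dataset and the 'sample.yml'
# filename are hypothetical.
def _example_tfrecord_roundtrip():
    ds = tf.data.Dataset.from_tensor_slices({
        'coord': tf.random_uniform([8, 5, 3]),
        'energy': tf.random_uniform([8]),
    })
    write_tfrecord('sample.yml', ds)      # writes sample.tfr plus sample.yml metadata
    return load_tfrecord('sample.yml')    # rebuilds a tf.data.Dataset from the metadata
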
def fnn_model():
    # single-layer softmax classifier over precomputed document embeddings
    with FileIO(os.path.join(FLAGS.buckets, "docembed.npy"), 'r+') as f:
        x = np.load(f)
    x_train = x[:25000]
    x_test = x[25000:]
    y_train = np.zeros((25000,), dtype=np.float32)
    y_test = np.zeros((25000,), dtype=np.float32)
    y_train[12500:25000] = np.ones((12500,), dtype=np.float32)
    y_test[12500:25000] = np.ones((12500,), dtype=np.float32)
    indice = np.arange(25000)
    np.random.shuffle(indice)
    x_train = x_train[indice]
    x_test = x_test[indice]
    y_train = y_train[indice]
    y_test = y_test[indice]
    x_place = tf.placeholder(dtype=tf.float32, shape=(None, HIDDEN_SIZE))
    y_place = tf.placeholder(dtype=tf.int64, shape=(None,))
    # out1 = tf.layers.dense(x_place, 100, activation=None)
    # out2 = tf.nn.relu(out1)
    out3 = tf.layers.dense(x_place, 2, activation=None)
    output = tf.nn.softmax(out3)
    predicted_classes = tf.argmax(output, 1)
    a = tf.cast(tf.equal(y_place, predicted_classes), tf.float32)
    accuracy = tf.reduce_mean(a)
    onehot_labels = tf.one_hot(y_place, 2, 1, 0)
    loss = tf.losses.mean_squared_error(onehot_labels, output)
    train_op = tf.train.AdamOptimizer().minimize(loss)
    # train_op = tf.train.GradientDescentOptimizer(learning_rate=0.5).minimize(loss)
    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)
    for i in range(10000):
        x, y = get_input(x_train, y_train)
        _loss, _acc, _ = sess.run([loss, accuracy, train_op],
                                  feed_dict={x_place: x, y_place: y})
        if i % 100 == 0:
            print("iter: %d loss: %f accuracy: %f" % (i, _loss, _acc))
        if i % 500 == 0:
            sum_acc = 0
            sum_loss = 0
            for j in range(25):
                _val_loss, _val_acc = sess.run(
                    [loss, accuracy],
                    feed_dict={
                        x_place: x_test[1000 * j:1000 * (j + 1)],
                        y_place: y_test[1000 * j:1000 * (j + 1)]
                    })
                sum_acc += _val_acc
                sum_loss += _val_loss
            print('val acc:', sum_acc / 25, 'val loss:', sum_loss / 25)
    sess.close()

def get_input():
    f = FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='r+')
    texts = pickle.load(f)
    f.close()
    tokenizer = Tokenizer(nb_words=num_words)
    tokenizer.fit_on_texts(texts[0:25000])
    sequences = tokenizer.texts_to_sequences(texts)
    # word_index = tokenizer.word_index
    # sequences = []
    # for i in range(50000):
    #     t = []
    #     tokens = texts[i].lower().split(' ')
    #     for j in range(len(tokens)):
    #         index = word_index.get(tokens[j], 0)
    #         if index < num_words:
    #             t.append(index)
    #         else:
    #             t.append(0)
    #     sequences.append(t)
    data1 = pad_sequences(sequences[0:25000], maxlen=max_len)
    data2 = pad_sequences(sequences[25000:50000], maxlen=max_len)
    Ytrain = np.zeros((25000,), dtype=np.float32)
    Ytest = np.zeros((25000,), dtype=np.float32)
    Ytrain[12500:25000] = np.ones((12500,), dtype=np.float32)
    Ytest[12500:25000] = np.ones((12500,), dtype=np.float32)
    # bigram-style encoding: each adjacent token pair becomes 2 ids, the
    # second token offset by num_words
    Xtrain = np.zeros((25000, (max_len - 1) * 2), dtype=np.int)
    Xtest = np.zeros((25000, (max_len - 1) * 2), dtype=np.int)
    for i in range(25000):
        for j in range(max_len - 1):
            Xtrain[i, j * 2] = data1[i, j]
            Xtrain[i, j * 2 + 1] = data1[i][j + 1] + num_words
    for i in range(25000):
        for j in range(max_len - 1):
            Xtest[i, j * 2] = data2[i, j]
            Xtest[i, j * 2 + 1] = data2[i][j + 1] + num_words
    indice = np.arange(25000)
    np.random.shuffle(indice)
    Xtrain = Xtrain[indice]
    Ytrain = Ytrain[indice]
    Xtest = Xtest[indice]
    Ytest = Ytest[indice]
    return Xtrain, Ytrain, Xtest, Ytest

def save_model(model, file):
    """
    Save model to the given file (potentially Google storage).

    :param model: model
    :param file: output file
    """
    print('Saving model to file {}.'.format(file))
    temp_file = 'temp_model_{}.h5'.format(randint(0, 100000000))
    model.save(temp_file)
    try:
        # copy model to google storage
        with FileIO(temp_file, mode='rb') as input_f:
            with FileIO(file, mode='wb') as output_f:
                output_f.write(input_f.read())
    finally:
        remove(temp_file)

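# Hedged usage sketch for save_model above (not part of the original source);
# assumes tf.keras is available, and the gs:// destination is hypothetical.
def _example_save_model():
    model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
    model.compile(optimizer='adam', loss='mse')
    save_model(model, 'gs://my-bucket/models/tiny_model.h5')
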
def potential_model(params, **kwargs):
    """Shortcut for generating a potential model from parameters.

    When creating the model, a params.yml is automatically created in
    model_dir containing network_params and model_params.

    The potential model can also be initiated with the model_dir; in that
    case, params.yml must be located in model_dir, from which all parameters
    are loaded.

    Args:
        params (str or dict): parameter dictionary or the model_dir
        **kwargs: additional options for the estimator, e.g. config
    """
    import os
    import yaml
    from tensorflow.python.lib.io.file_io import FileIO
    from datetime import datetime

    if isinstance(params, str):
        model_dir = params
        assert tf.gfile.Exists('{}/params.yml'.format(model_dir)), \
            "Parameters files not found."
        with FileIO(os.path.join(model_dir, 'params.yml'), 'r') as f:
            params = yaml.load(f, Loader=yaml.Loader)
    else:
        model_dir = params['model_dir']
        yaml.Dumper.ignore_aliases = lambda *args: True
        to_write = yaml.dump(params)
        params_path = os.path.join(model_dir, 'params.yml')
        if not tf.gfile.IsDirectory(model_dir):
            tf.gfile.MakeDirs(model_dir)
        if tf.gfile.Exists(params_path):
            original = FileIO(params_path, 'r').read()
            if original != to_write:
                tf.gfile.Rename(
                    params_path,
                    params_path + '.' + datetime.now().strftime('%y%m%d%H%M'))
        FileIO(params_path, 'w').write(to_write)
    model = tf.estimator.Estimator(model_fn=_potential_model_fn,
                                   params=params,
                                   model_dir=model_dir,
                                   **kwargs)
    return model

def __init__(self, filename):
    self.i2t = {}
    with FileIO(filename, mode="r") as fio:
        lines = fio.readlines()
        for line in lines:
            line = line.strip(' \r\n\t')
            datas = line.split('\t')
            self.i2t[int(datas[0])] = datas[1]
    print('load idx ', len(self.i2t))

def get_input():
    f = FileIO(os.path.join(FLAGS.buckets, "imdb/texts.pkl"), mode='r+')
    texts = pickle.load(f)
    f.close()
    tokenizer = Tokenizer(nb_words=num_words)
    tokenizer.fit_on_texts(texts[0:25000])
    sequences = tokenizer.texts_to_sequences(texts)
    sequences_reverse = [list(reversed(seq)) for seq in sequences]
    x = pad_sequences(sequences, maxlen=max_len)
    x_reverse = pad_sequences(sequences_reverse, maxlen=max_len)
    word_index = tokenizer.word_index
    embeddings_index = {}
    wordX = np.load(
        FileIO(os.path.join(FLAGS.buckets, "glove/embedding.300d.npy"), mode='r+'))
    allwords = pickle.load(
        FileIO(os.path.join(FLAGS.buckets, "glove/words.pkl"), mode='r+'))
    for i in range(len(allwords)):
        embeddings_index[allwords[i]] = wordX[i, :]
    embedding_matrix = np.zeros((num_words, 300))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None and i < num_words:
            embedding_matrix[i] = embedding_vector
    y_train = np.zeros((25000,), dtype=np.float32)
    y_test = np.zeros((25000,), dtype=np.float32)
    y_train[12500:25000] = np.ones((12500,), dtype=np.float32)
    y_test[12500:25000] = np.ones((12500,), dtype=np.float32)
    # 4-gram encoding: each window of 4 tokens becomes 4 ids, with the k-th
    # position offset by k * num_words so each position uses a disjoint id range
    x_seq = np.zeros((50000, (max_len - 3) * 4), dtype=np.int)
    for i in range(50000):
        for j in range(max_len - 3):
            x_seq[i, j * 4] = x[i, j]
            x_seq[i, j * 4 + 1] = x[i][j + 1] + num_words
            x_seq[i, j * 4 + 2] = x[i][j + 2] + num_words * 2
            x_seq[i, j * 4 + 3] = x[i][j + 3] + num_words * 3
    x_train_0 = x[:25000]
    x_train_1 = x_reverse[:25000]
    x_train_2 = x_seq[:25000]
    x_test_0 = x[25000:]
    x_test_1 = x_reverse[25000:]
    x_test_2 = x_seq[25000:]
    result = []
    indice = np.arange(25000)
    np.random.shuffle(indice)
    result.append(x_train_0[indice])
    result.append(x_train_1[indice])
    result.append(x_train_2[indice])
    result.append(x_test_0[indice])
    result.append(x_test_1[indice])
    result.append(x_test_2[indice])
    result.append(y_train[indice])
    result.append(y_test[indice])
    result.append(embedding_matrix)
    return result

def update_datasets(self, filter=None):
    if filter is None:
        filter = self._filter
    close_file = True
    log.info("Updating datasets from file list: %s", self._input_source)
    if hasattr(self._input_source, 'read'):
        input_file = self._input_source
        close_file = False
    elif isinstance(self._input_source, str) and self._input_source.startswith("gs://"):
        log.info("Using tensorflow for IO")
        from tensorflow.python.lib.io.file_io import FileIO
        input_file = FileIO(self._input_source, "r")
        log.info("Tensorflow reported size: %d", input_file.size())
    else:
        input_file = open(self._input_source)
    lines = input_file.readlines()
    for line in lines:
        fpath = line.strip()
        parts = fpath.split("/")
        file_name = parts[-1]
        match = self._re.match(file_name)
        if not match:
            continue
        match_components = match.groupdict()
        dataset_path = self._prepend_path + fpath
        dataset_id = self.update_dataset(match_components=match_components,
                                         dataset_path=dataset_path)
        dataset = self.get_dataset_by_id(dataset_id)
        if not filter(dataset_id, match_components, dataset):
            self.remove_dataset_by_id(dataset_id)
    if close_file:
        input_file.close()

def save(self):
    model_json = self.model.to_json()
    with FileIO("{}/{}.json".format(self.output_path, self.name), "w") as json_file:
        json_file.write(model_json)
    fp = "{}.h5".format(self.name)
    if self.output_path.startswith('gs://'):
        self.model.save_weights(fp)
        copy_file_to_gcs(self.output_path, fp)
    else:
        self.model.save_weights("{}/{}.h5".format(self.output_path, self.name))

def write_words(word_model, output_file):
    """Writes the words from a .vec file to an output file of strings.

    Parameters
    ----------
    word_model : str
        path to word model file
    output_file : str
        path to output file

    Returns
    -------
    None
    """
    from tensorflow.python.lib.io.file_io import FileIO
    with FileIO(word_model, 'r') as input_vectors, FileIO(output_file, 'w') as output:
        for line in input_vectors:
            split = line.split()
            if len(split) > 2:
                word = split[0]
                output.write(word)
                output.write("\n")

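# Hedged usage sketch for write_words above (not part of the original source);
# the fastText-style .vec path and the output path are hypothetical.
def _example_write_words():
    write_words('gs://my-bucket/wiki.en.vec', 'gs://my-bucket/wiki.en.words.txt')
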
def load(self, name=""):
    output_name = self.name if name == "" else name
    with FileIO("{}/{}.json".format(self.output_path, output_name), "r") as json_file:
        loaded_model_json = json_file.read()
    self.model = model_from_json(loaded_model_json)
    fp = "{}.h5".format(output_name)
    if self.output_path.startswith('gs://'):
        copy_file_from_gcs(self.output_path, fp)
        self.model.load_weights(fp)
    else:
        self.model.load_weights("{}/{}.h5".format(self.output_path, output_name))

def copy(source, dest):
    """
    Copy from source to dest, creating all necessary dirs.

    :param source: source file
    :param dest: dest file
    """
    with FileIO(source, mode='rb') as input_f:
        if '/' in dest and not isdir(dirname(dest)):
            makedirs(dirname(dest))
        with open(dest, mode='wb') as output_f:
            while 1:
                buf = input_f.read(1024 * 1024)
                if not buf:
                    break
                output_f.write(buf)

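# Hedged usage sketch for copy above (not part of the original source): it
# streams a remote object to a local path in 1 MiB chunks; both paths are
# hypothetical.
def _example_copy():
    copy('gs://my-bucket/checkpoints/model.h5', '/tmp/checkpoints/model.h5')
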
def write_predictions(output_file, tuple_predictions, keys, ground_truth):
    """
    Write predictions to a TSV file.

    :param output_file: output file
    :param tuple_predictions: predictions stored in a tuple per sample
    :param keys: iterable of sample keys (UUIDs)
    :param ground_truth: ground-truth object that knows the index->label conversion
    """
    if exists(output_file):
        print('WARNING: Overwriting {}'.format(output_file))
    with FileIO(output_file, mode='w') as f:
        # convert indices to label names using index_to_label
        for key, indices in zip(keys, tuple_predictions):
            line = key + ground_truth.to_labels(indices)
            f.write(line + '\n')

def get_model():
    ngram_embed = np.load(
        FileIO(os.path.join(FLAGS.buckets, "ngram_embedding.npy"), mode='r+'))
    ngram_embedding = np.random.randn(num_ngram + 1, word_dimension)
    ngram_embedding[1:] = ngram_embed
    input_1 = Input(shape=(max_len,))
    embedding_1 = Embedding(input_dim=num_ngram + 1,
                            output_dim=word_dimension,
                            weights=[ngram_embedding],
                            trainable=True)(input_1)
    x = GRU(word_dimension)(embedding_1)
    # x = Bidirectional(GRU(word_dimension), merge_mode='concat')(embedding_1)
    output_1 = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=[input_1], outputs=[output_1])
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

def load_data(path, vocab, pad=32, numfiles=0, lowercase=False):
    X, Xu = [], []
    t2s = Text2Seq(vocab, vocab_is_lowercase=lowercase)
    files = recursively_list_files(path)
    for i, fname in enumerate(tqdm(files, ascii=True, mininterval=0.5)):
        if 0 < numfiles < (i + 1):
            break  # Process at most `numfiles` files
        # with FileIO(fname, "r", encoding="utf-8") as f:
        with FileIO(fname, "r") as f:
            text = f.read()
        seq, aux = t2s.toseq(text)
        X.extend(seq)
        Xu.extend(aux)
        X.extend([0] * pad)
        Xu.extend([[0, 0]] * pad)
    X = np.array(X, dtype="int32")
    Xu = np.array(Xu, dtype="float32")
    return X, Xu

def load_data_sequences(path, vocab, seqlen, stride, numfiles=0):
    XX, YY, XXu, YYu = [], [], [], []
    t2s = Text2Seq(vocab)
    files = recursively_list_files(path)
    for i, fname in enumerate(tqdm(files, ascii=True)):
        if 0 < numfiles < (i + 1):
            break  # Process at most `numfiles` files
        with FileIO(fname, "r") as f:
            seq, unk = t2s.toseq(f.read())
        Xi, Yi = seqwindows(seq, seqlen, stride)
        Xui, Yui = seqwindows(unk, seqlen, stride, dtype="float32")
        XX.append(Xi)
        YY.append(Yi)
        XXu.append(Xui)
        YYu.append(Yui)
    X = np.concatenate(XX)
    Y = np.concatenate(YY)
    Xu = np.concatenate(XXu)
    Yu = np.concatenate(YYu)
    return X, Y, Xu, Yu

def train():
    x_place = tf.placeholder(dtype=tf.int64, shape=(batch_size, 1))
    y_place = tf.placeholder(dtype=tf.int64, shape=(batch_size, 1))
    with tf.device("/cpu:0"):
        embedding_doc = tf.Variable(
            tf.random_uniform([num_ngram, 300], -0.5, 0.5))
        nce_weights = tf.get_variable('nce_weights_words', [num_words, 300],
                                      trainable=True)
        nce_biases = tf.Variable(tf.zeros([num_words]), trainable=True)
        input_1 = tf.nn.embedding_lookup(embedding_doc, x_place)
        input_2 = tf.reshape(input_1, [-1, 300])
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=y_place,
                       inputs=input_2,
                       num_sampled=num_sampled,
                       num_classes=num_words))
    optimizer = tf.train.AdamOptimizer().minimize(loss)
    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)
    x, y, embedding_metrix = prepare_data()
    init_nce = tf.assign(nce_weights, embedding_metrix)
    sess.run(init_nce)
    start = 0
    for i in range(1000000):
        x_1, _y, start = get_input(x, y, start)
        # _loss, _ = sess.run([loss, optimizer], feed_dict={x1_place: x_1, x2_place: x_2, y_place: _y})
        _loss, _ = sess.run([loss, optimizer],
                            feed_dict={x_place: x_1, y_place: _y})
        if i % 300 == 0:
            print(i, " loss ", _loss)
    np.save(
        FileIO(os.path.join(FLAGS.buckets, "ngram_embedding.npy"), mode='w+'),
        embedding_doc.eval(sess))

def load_vocab(filename, maxwords=0):
    """
    Load newline-separated words from file to a dict mapping them to unique ids.

    :param maxwords: Max number of words to load. Load all by default.

    Returns (list of words, word->id map)
    """
    pad = "·"  # "<#PAD#>"
    vocab = dict()
    words = []
    # start off with 1 so that the embedding matrix's first vector is zero
    # and the second is for unknown words
    counter = 1
    words.append(pad)
    with FileIO(filename, "r") as f:
        for i, line in enumerate(f):
            if 0 < maxwords < i + 1:
                break
            word = line.strip()
            words.append(word)
            vocab[word] = counter
            counter += 1
    return words, vocab
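
# Hedged usage sketch tying load_vocab to load_embeddings defined earlier in
# this file (not part of the original source); both file paths are hypothetical.
def _example_vocab_and_embeddings():
    words, vocab = load_vocab('gs://my-bucket/vocab.txt', maxwords=50000)
    em = load_embeddings(vocab, dim=300,
                         filename='gs://my-bucket/glove.300d.txt')
    print(len(words), em.shape)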