Example 1
def prepare_train():
    global tokenizer
    print("prepare training data")
    with FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='r+') as f:
        texts = pickle.load(f)[:25000]
    with FileIO(os.path.join(FLAGS.buckets, "texts_unsup.pkl"), mode='r+') as f:
        texts += pickle.load(f)

    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(texts)
    sequence = tokenizer.texts_to_sequences(texts)
    sum_words = sum([len(seq) for seq in sequence])
    print('there are %d words' % (sum_words))
    x = np.zeros((sum_words, 1), dtype=np.int32)
    y = np.zeros((sum_words, 1), dtype=np.int32)
    index = 0
    for i, seq in enumerate(sequence):
        for s in seq:
            x[index] = i
            y[index] = s
            index += 1
    indice = np.arange(sum_words)
    np.random.shuffle(indice)
    x = x[indice]
    y = y[indice]
    return x, y, sum_words
Example 2
def get_input():
    with FileIO(os.path.join(FLAGS.buckets, "20news/texts.pkl"), mode='r+') as f:
        texts = pickle.load(f)
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(texts[:num_train])
    sequences = tokenizer.texts_to_sequences(texts)
    sequences_reverse = [list(reversed(seq)) for seq in sequences]

    x = pad_sequences(sequences, maxlen=max_len)
    x_reverse = pad_sequences(sequences_reverse, maxlen=max_len)

    word_index = tokenizer.word_index
    embeddings_index = {}
    wordX = np.load(FileIO(os.path.join(FLAGS.buckets, "glove/embedding.300d.npy"),mode='r+'))
    allwords = pickle.load(FileIO(os.path.join(FLAGS.buckets, "glove/words.pkl"),mode='r+'))
    print(len(allwords))
    for i in range(len(allwords)):
        embeddings_index[allwords[i]] = wordX[i, :]
    embedding_matrix = np.zeros((num_words, 300))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None and i < num_words:
            embedding_matrix[i] = embedding_vector

    x_train_0 = x[:num_train]
    x_train_1 = x_reverse[:num_train]
    x_test_0 = x[num_train:]
    x_test_1 = x_reverse[num_train:]
    y_train = np.load(FileIO(os.path.join(FLAGS.buckets, "20news/Ytrain.npy"), mode='r+'))
    y_train = to_categorical(y_train)
    y_test = np.load(FileIO(os.path.join(FLAGS.buckets, "20news/Ytest.npy"), mode='r+'))
    y_test = to_categorical(y_test)

    return x_train_0, x_train_1, y_train, x_test_0, x_test_1, y_test, embedding_matrix
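The embedding_matrix returned here is shaped (num_words, 300) and indexed by the tokenizer's word ids, so it can seed a frozen Keras Embedding layer. A minimal sketch, assuming the same num_words and max_len globals and standard Keras imports; the pooling head and the class count (20 for 20 Newsgroups) are illustrative only:

from keras.layers import Input, Embedding, GlobalAveragePooling1D, Dense
from keras.models import Model

def build_classifier(embedding_matrix, num_words, max_len, num_classes=20):
    # Each input is a padded sequence of word ids, as produced by get_input().
    tokens = Input(shape=(max_len,))
    embedded = Embedding(input_dim=num_words,
                         output_dim=300,
                         weights=[embedding_matrix],
                         trainable=False)(tokens)  # keep the GloVe vectors frozen
    pooled = GlobalAveragePooling1D()(embedded)
    probs = Dense(num_classes, activation='softmax')(pooled)
    model = Model(inputs=tokens, outputs=probs)
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model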
Example 3
def f1():
    try:
        file_path = root_dir + 'data/github/test_codevecs_npy/use.codevecs_0.npy'
        with FileIO(file_path, mode="rb") as fio:
            code_vec = np.load(fio)
            print('loaded .npy successfully', len(code_vec))
            print(code_vec[0][:10])
    except Exception as e:
        print('failed to load .npy')
        print(e)

    try:
        file_path = root_dir + 'data/github/vocab.apiseq.pkl'
        with FileIO(file_path, mode="rb") as fio:
            api_seq_vocab = pk.load(fio)
            print('loaded .pkl successfully', len(api_seq_vocab))
            print(list(api_seq_vocab.keys())[:3])
    except Exception as e:
        print('failed to load .pkl')
        print(e)

    try:
        file_path = root_dir + 'data/github/use.search.txt'
        with FileIO(file_path, mode="r") as fio:
            lines = fio.readlines()
            print('loaded .txt successfully')
            print(lines[0])
            print(lines[1])
            print(lines[2])
    except Exception as e:
        print('failed to load .txt')
        print(e)
Example 4
    def update_datasets(self, filter=None):
        if filter is None:
            filter = self._filter

        file_list = []
        log.info("Updateing datasets from file list: %s", self._source_file)
        if self._source_file.startswith("gs://"):
            log.info("Using tensorflow for IO")
            from tensorflow.python.lib.io.file_io import FileIO
            input_file = FileIO(self._source_file, "r")
            log.info("Tensorflow reported size: %d", input_file.size())
        else:
            input_file = open(self._source_file)

        lines = input_file.readlines()
        for line in lines:
            fpath = line.strip()
            parts = fpath.split("/")
            file_name = parts[-1]
            directory_name = "/".join(parts[:-1])
            match = self._re.match(file_name)
            if not match:
                continue
            match_components = match.groupdict()
            dataset_path = self._prepend_path + fpath
            dataset_id = self.update_dataset(match_components=match_components, dataset_path=dataset_path)
            dataset = self.get_dataset_by_id(dataset_id)
            if not filter(dataset_id, match_components, dataset):
                self.remove_dataset_by_id(dataset_id)
        input_file.close()
Example 5
def get_input():
    with FileIO(os.path.join(FLAGS.buckets, 'imdb/texts.pkl'), 'r+') as f:
        texts = pickle.load(f)
    word_index = pickle.load(
        FileIO(os.path.join(FLAGS.buckets, "word_index.pkl"), mode='r+'))
    ngram_index = pickle.load(
        FileIO(os.path.join(FLAGS.buckets, "ngram_index.pkl"), mode='r+'))

    sequence = []
    for sentence in texts:
        t_s = []
        for token in sentence.split(' '):
            if token in word_index:
                t_s.append(str(word_index[token]))
        sequence.append(t_s)

    new_sequence = []
    for seq in sequence:
        t_s = []
        for i in range(len(seq) - 2):
            s = '_'.join(seq[i:i + 3])
            if s in ngram_index and ngram_index[s] <= num_ngram:
                t_s.append(ngram_index[s])
        new_sequence.append(t_s)
    new_sequence = pad_sequences(new_sequence, maxlen=max_len)

    x_train = new_sequence[:25000]
    x_test = new_sequence[25000:]
    y_train = np.zeros((25000, ), dtype=np.float32)
    y_test = np.zeros((25000, ), dtype=np.float32)
    y_train[12500:25000] = np.ones((12500, ), dtype=np.float32)
    y_test[12500:25000] = np.ones((12500, ), dtype=np.float32)

    return x_train, y_train, x_test, y_test
Example 6
def f3():
    a = np.array([5, 4, 3, 2, 1])
    file_path = root_dir + 'a.npy'
    with FileIO(file_path, mode="wb") as fio:
        np.save(fio, a)
    with FileIO(file_path, mode="rb") as fio:
        code_vec = np.load(fio)
    print('loaded .npy successfully', code_vec)
Example 7
def get_input():
    with FileIO(os.path.join(FLAGS.buckets, "imdb/texts.pkl"), mode='r+') as f:
        texts = pickle.load(f)
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(texts[0:25000])
    sequences = tokenizer.texts_to_sequences(texts)
    sequences_reverse = [list(reversed(seq)) for seq in sequences]

    x = pad_sequences(sequences, maxlen=max_len)
    x_reverse = pad_sequences(sequences_reverse, maxlen=max_len)

    word_index = tokenizer.word_index
    embeddings_index = {}
    wordX = np.load(
        FileIO(os.path.join(FLAGS.buckets, "glove/embedding.300d.npy"),
               mode='r+'))
    allwords = pickle.load(
        FileIO(os.path.join(FLAGS.buckets, "glove/words.pkl"), mode='r+'))
    print(len(allwords))
    for i in range(len(allwords)):
        embeddings_index[allwords[i]] = wordX[i, :]
    embedding_matrix = np.zeros((num_words, 300))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None and i < num_words:
            embedding_matrix[i] = embedding_vector

    x_train_0 = x[:25000]
    x_train_1 = x_reverse[:25000]
    x_test_0 = x[25000:]
    x_test_1 = x_reverse[25000:]
    y_train = np.zeros((25000, ), dtype=np.float32)
    y_test = np.zeros((25000, ), dtype=np.float32)
    y_train[12500:25000] = np.ones((12500, ), dtype=np.float32)
    y_test[12500:25000] = np.ones((12500, ), dtype=np.float32)

    indice = np.arange(25000)
    np.random.shuffle(indice)
    x_train_0 = x_train_0[indice]
    x_test_0 = x_test_0[indice]
    x_train_1 = x_train_1[indice]
    x_test_1 = x_test_1[indice]
    y_train = y_train[indice]
    y_test = y_test[indice]

    result = []
    result.append(x_train_0)
    result.append(x_train_1)
    result.append(x_test_0)
    result.append(x_test_1)
    result.append(y_train)
    result.append(y_test)
    result.append(embedding_matrix)
    return result
Example 8
def is_database_created(username):
    filename = "{}.csv".format(username)
    file_exists = exists_in_gcp(filename)
    if file_exists:
        with FileIO(os.path.join("gs://", BUCKET_NAME, filename), 'r') as f:
            DATABASES[username] = pd.read_csv(f)
    else:
        DATABASES[username] = pd.DataFrame(
            columns=["username", "date", "cause", "spent"])
        with FileIO(os.path.join("gs://", BUCKET_NAME, filename), 'w') as f:
            DATABASES[username].to_csv(f)
    return not file_exists
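Once the per-user table exists, new rows can be appended in memory and written back through the same FileIO path. A minimal sketch that reuses the DATABASES and BUCKET_NAME globals from the snippet above; the column values are placeholders:

import os
import pandas as pd
from tensorflow.python.lib.io.file_io import FileIO

def add_expense(username, date, cause, spent):
    # Append one record to the user's in-memory table.
    row = pd.DataFrame([{"username": username, "date": date,
                         "cause": cause, "spent": spent}])
    DATABASES[username] = pd.concat([DATABASES[username], row],
                                    ignore_index=True)
    # Persist the updated table back to the bucket, mirroring the write above.
    with FileIO(os.path.join("gs://", BUCKET_NAME, "{}.csv".format(username)), 'w') as f:
        DATABASES[username].to_csv(f)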
Example 9
def get_input():
    f = FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='r+')
    texts = pickle.load(f)
    f.close()

    tokenizer = Tokenizer(nb_words=num_words)  # nb_words was renamed to num_words in Keras 2
    tokenizer.fit_on_texts(texts[0:25000])
    # sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    sequences = []
    for i in range(50000):
        t = []
        tokens = texts[i].lower().split(' ')
        for j in range(len(tokens)):
            index = word_index.get(tokens[j], 0)
            if index < num_words:
                t.append(index)
            else:
                t.append(0)
        sequences.append(t)

    print('Found %s unique tokens.' % len(word_index))

    data1 = pad_sequences(sequences[0:25000], maxlen=max_len)
    data2 = pad_sequences(sequences[25000:50000], maxlen=max_len)
    Ytrain = np.zeros((25000,), dtype=np.float32)
    Ytest = np.zeros((25000,), dtype=np.float32)
    Ytrain[12500:25000] = np.ones((12500,), dtype=np.float32)
    Ytest[12500:25000] = np.ones((12500,), dtype=np.float32)

    Xtrain = np.zeros((25000, (max_len - 3) * 4), dtype=np.int64)  # np.int was removed in NumPy 1.24
    Xtest = np.zeros((25000, (max_len - 3) * 4), dtype=np.int64)
    for i in range(25000):
        for j in range(max_len - 3):
            Xtrain[i, j * 4] = data1[i, j]
            Xtrain[i, j * 4 + 1] = data1[i][j + 1] + num_words
            Xtrain[i, j * 4 + 2] = data1[i][j + 2] + num_words * 2
            Xtrain[i, j * 4 + 3] = data1[i][j + 3] + num_words * 3
    for i in range(25000):
        for j in range(max_len - 3):
            Xtest[i, j * 4] = data2[i, j]
            Xtest[i, j * 4 + 1] = data2[i][j + 1] + num_words
            Xtest[i, j * 4 + 2] = data2[i][j + 2] + num_words * 2
            Xtest[i, j * 4 + 3] = data2[i][j + 3] + num_words * 3

    indice = np.arange(25000)
    np.random.shuffle(indice)
    Xtrain = Xtrain[indice]
    Ytrain = Ytrain[indice]
    Xtest = Xtest[indice]
    Ytest = Ytest[indice]
    return Xtrain, Ytrain, Xtest, Ytest
Example 10
def prepare_train():
    print("prepare training data")
    f = FileIO(os.path.join(FLAGS.buckets, 'texts.pkl'), 'rb')
    text1 = pickle.load(f)
    text1 = text1[:25000]
    f.close()
    f = FileIO(os.path.join(FLAGS.buckets, 'texts_unsup.pkl'), 'rb')
    text2 = pickle.load(f)
    f.close()
    texts = text1 + text2

    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.filters = ''
    tokenizer.fit_on_texts(texts)
    sequence = tokenizer.texts_to_sequences(texts)
    sequence_pad = pad_sequences(sequence,
                                 maxlen=MAX_DOCUMENT_LENGTH + 1,
                                 dtype=np.int32,
                                 padding='post',
                                 truncating='post')
    seq_len = []
    for i in range(len(sequence)):
        r = len(sequence[i])
        if r < MAX_DOCUMENT_LENGTH:
            seq_len.append(r)
        else:
            seq_len.append(MAX_DOCUMENT_LENGTH)
    x_1 = sequence_pad[:, :-1]

    y_ = sequence_pad[:, 1:]
    return x_1, seq_len, y_
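The last two lines implement the standard next-token shift for language-model training: x_1 holds positions 0..L-1 of each padded row and y_ holds positions 1..L, so each input token is trained to predict its right-hand neighbour. A toy illustration (values made up, MAX_DOCUMENT_LENGTH assumed to be 5):

import numpy as np

# one text tokenized to ids [4, 9, 7], padded 'post' to MAX_DOCUMENT_LENGTH + 1 = 6
sequence_pad = np.array([[4, 9, 7, 0, 0, 0]], dtype=np.int32)
x_1 = sequence_pad[:, :-1]   # [[4, 9, 7, 0, 0]]  inputs
y_ = sequence_pad[:, 1:]     # [[9, 7, 0, 0, 0]]  next-token targets
seq_len = [3]                # true length, used to mask the padded steps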
Example 11
def write_tfrecord(fname, dataset, log_every=100, pre_fn=None):
    """Helper function to convert dataset object into tfrecord file.

    fname must end with .yml or .yaml.
    The data will be written in a .tfr file with the same suffix.

    Args:
        dataset (Dataset): input dataset.
        fname (str): filename of the dataset to be saved.
    """
    def _bytes_feature(value):
        """Returns a bytes_list from a string / byte."""
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

    # Preparation
    tfr = '.'.join(fname.split('.')[:-1] + ['tfr'])
    writer = tf.python_io.TFRecordWriter(tfr)
    tensors = dataset.make_one_shot_iterator().get_next()
    if pre_fn:
        tensors = pre_fn(tensors)
        dataset = dataset.map(pre_fn)
    types = dataset.output_types
    shapes = dataset.output_shapes
    # Sanity check
    assert (type(types) == dict and all(type(v) != dict for v in types.values())),\
        "Only dataset of non-nested dictionary is supported."
    assert fname.endswith('.yml'), "Filename must end with .yml."
    serialized = {k: tf.serialize_tensor(v) for k, v in tensors.items()}
    sess = tf.Session()
    # Writing Loop
    n_parsed = 0
    try:
        while True:
            features = {}
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    key: _bytes_feature(val)
                    for key, val in sess.run(serialized).items()
                }))
            writer.write(example.SerializeToString())
            n_parsed += 1
            if n_parsed % log_every == 0:
                sys.stdout.write('\r {} samples written to {} ...'.format(
                    n_parsed, tfr))
                sys.stdout.flush()
    except tf.errors.OutOfRangeError:
        print('\r {} samples written to {}, done.'.format(n_parsed, tfr))
        sess.close()
        writer.close()
    # Write metadata
    format_dict = {
        k: {
            'dtype': types[k].name,
            'shape': shapes[k].as_list()
        }
        for k in types.keys()
    }
    info_dict = {'n_sample': n_parsed}
    with FileIO(fname, 'w') as f:
        yaml.safe_dump({'format': format_dict, 'info': info_dict}, f)
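A usage sketch for the helper above, under the same TF 1.x APIs it relies on (output_types, make_one_shot_iterator); the feature names and file name are placeholders:

import numpy as np
import tensorflow as tf

# Toy dataset: each sample is a flat dict of tensors, as the helper requires.
data = {'x': np.random.rand(10, 3).astype(np.float32),
        'label': np.arange(10, dtype=np.int64)}
toy_dataset = tf.data.Dataset.from_tensor_slices(data)

# Writes sample.tfr plus the sample.yml metadata file.
write_tfrecord('sample.yml', toy_dataset)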
Example 12
def load_embeddings(vocab, dim, filename):
    """
    Load a subset of embedding vectors from file corresponding to vocabulary provided.
    Args:
        vocab: string->int map from words to their ids (id corresponds to vector's row in the resulting embedding
             matrix). All ids > 0.
        dim: embedding vector dimension
        filename: file where each line is a word followed by `dim` floats, all space-separated

    Returns:
        MxN = (len(vocab)+1) x dim numpy embedding matrix.
        The +1 for M is because 0th vector is a zero vector for padding.
    """
    em = np.zeros((len(vocab) + 1, dim), dtype="float32")

    # with FileIO(filename, "r", encoding="utf-8") as f:
    with FileIO(filename, "r") as f:
        for linenum, line in enumerate(f):
            line = unidecode(line)
            idx = line.find(' ')
            if idx < 0:
                print("malformed line, no space found: line", linenum)
                continue
            word = line[:idx]
            if word not in vocab:
                continue
            i = vocab[word]

            em[i, :] = np.array(line.strip().split()[1:], dtype="float32")

    return em
Example 13
def load_tfrecord(fname):
    """Load tfrecord dataset.

    Args:
       fname (str): filename of the .yml metadata file to be loaded.
       dtypes (dict): dtype of dataset.
    """
    # dataset
    with FileIO(fname, 'r') as f:
        format_dict = (yaml.safe_load(f)['format'])
    dtypes = {k: format_dict[k]['dtype'] for k in format_dict.keys()}
    shapes = {k: format_dict[k]['shape'] for k in format_dict.keys()}

    feature_dict = {k: tf.FixedLenFeature([], tf.string) for k in dtypes}

    def parser(example):
        return tf.parse_single_example(example, feature_dict)

    def converter(tensors):
        tensors = {
            k: tf.parse_tensor(v, dtypes[k])
            for k, v in tensors.items()
        }
        [v.set_shape(shapes[k]) for k, v in tensors.items()]
        return tensors

    tfr = '.'.join(fname.split('.')[:-1] + ['tfr'])
    dataset = tf.data.TFRecordDataset(tfr).map(parser).map(converter)
    return dataset
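Reading the records back mirrors the writer; a short TF 1.x sketch assuming sample.yml was produced by write_tfrecord above:

import tensorflow as tf

dataset = load_tfrecord('sample.yml')
next_sample = dataset.make_one_shot_iterator().get_next()

with tf.Session() as sess:
    try:
        while True:
            sample = sess.run(next_sample)   # dict of numpy arrays, e.g. {'x': ..., 'label': ...}
    except tf.errors.OutOfRangeError:
        pass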
Example 14
def fnn_model():
    with FileIO(os.path.join(FLAGS.buckets, "docembed.npy"), 'r+') as f:
        x = np.load(f)
    x_train = x[:25000]
    x_test = x[25000:]
    y_train = np.zeros((25000, ), dtype=np.float32)
    y_test = np.zeros((25000, ), dtype=np.float32)
    y_train[12500:25000] = np.ones((12500, ), dtype=np.float32)
    y_test[12500:25000] = np.ones((12500, ), dtype=np.float32)

    indice = np.arange(25000)
    np.random.shuffle(indice)
    x_train = x_train[indice]
    x_test = x_test[indice]
    y_train = y_train[indice]
    y_test = y_test[indice]

    x_place = tf.placeholder(dtype=tf.float32, shape=(None, HIDDEN_SIZE))
    y_place = tf.placeholder(dtype=tf.int64, shape=(None, ))
    # out1 = tf.layers.dense(x_place, 100, activation=None)
    # out2=tf.nn.relu(out1)
    out3 = tf.layers.dense(x_place, 2, activation=None)
    output = tf.nn.softmax(out3)
    predicted_classes = tf.argmax(output, 1)

    a = tf.cast(tf.equal(y_place, predicted_classes), tf.float32)
    accuracy = tf.reduce_mean(a)
    onehot_labels = tf.one_hot(y_place, 2, 1, 0)
    loss = tf.losses.mean_squared_error(onehot_labels, output)

    train_op = tf.train.AdamOptimizer().minimize(loss)
    # train_op = tf.train.GradientDescentOptimizer(learning_rate=0.5).minimize(loss)
    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)

    for i in range(10000):
        x, y = get_input(x_train, y_train)
        _loss, _acc, _ = sess.run([loss, accuracy, train_op],
                                  feed_dict={
                                      x_place: x,
                                      y_place: y
                                  })
        if i % 100 == 0:
            print("iter: %d loss: %f accuracy: %f" % (i, _loss, _acc), )
        if i % 500 == 0:
            sum_acc = 0
            sum_loss = 0
            for j in range(25):
                _val_loss, _val_acc = sess.run(
                    [loss, accuracy],
                    feed_dict={
                        x_place: x_test[1000 * j:1000 * (j + 1)],
                        y_place: y_test[1000 * j:1000 * (j + 1)]
                    })
                sum_acc += _val_acc
                sum_loss += _val_loss
            print('val acc:', sum_acc / 25, 'val loss:', sum_loss / 25)
    sess.close()
Example 15
def get_input():
    f = FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='r+')
    texts = pickle.load(f)
    f.close()

    tokenizer = Tokenizer(nb_words=num_words)  # nb_words was renamed to num_words in Keras 2
    tokenizer.fit_on_texts(texts[0:25000])
    sequences = tokenizer.texts_to_sequences(texts)
    # word_index = tokenizer.word_index
    # sequences = []
    # for i in range(50000):
    #     t = []
    #     tokens = texts[i].lower().split(' ')
    #     for j in range(len(tokens)):
    #         index = word_index.get(tokens[j], 0)
    #         if index < num_words:
    #             t.append(index)
    #         else:
    #             t.append(0)
    #     sequences.append(t)

    data1 = pad_sequences(sequences[0:25000], maxlen=max_len)
    data2 = pad_sequences(sequences[25000:50000], maxlen=max_len)
    Ytrain = np.zeros((25000,), dtype=np.float32)
    Ytest = np.zeros((25000,), dtype=np.float32)
    Ytrain[12500:25000] = np.ones((12500,), dtype=np.float32)
    Ytest[12500:25000] = np.ones((12500,), dtype=np.float32)

    Xtrain = np.zeros((25000, (max_len - 1) * 2), dtype=np.int64)  # np.int was removed in NumPy 1.24
    Xtest = np.zeros((25000, (max_len - 1) * 2), dtype=np.int64)
    for i in range(25000):
        for j in range(max_len - 1):
            Xtrain[i, j * 2] = data1[i, j]
            Xtrain[i, j * 2 + 1] = data1[i][j + 1] + num_words
    for i in range(25000):
        for j in range(max_len - 1):
            Xtest[i, j * 2] = data2[i, j]
            Xtest[i, j * 2 + 1] = data2[i][j + 1] + num_words

    indice = np.arange(25000)
    np.random.shuffle(indice)
    Xtrain = Xtrain[indice]
    Ytrain = Ytrain[indice]
    Xtest = Xtest[indice]
    Ytest = Ytest[indice]
    return Xtrain, Ytrain, Xtest, Ytest
Example 16
def save_model(model, file):
    """
    Save model to the given file (potentially Google storage).

    :param model: model
    :param file: output file
    """
    print('Saving model to file {}.'.format(file))
    temp_file = 'temp_model_{}.h5'.format(randint(0, 100000000))
    model.save(temp_file)
    try:
        # copy model to google storage
        with FileIO(temp_file, mode='rb') as input_f:
            with FileIO(file, mode='wb') as output_f:
                output_f.write(input_f.read())
    finally:
        remove(temp_file)
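A short usage sketch; the bucket path is a placeholder and the model can be any Keras model:

from keras.models import Sequential
from keras.layers import Dense

model = Sequential([Dense(1, activation='sigmoid', input_shape=(10,))])
model.compile(loss='binary_crossentropy', optimizer='adam')

# Saves to a temporary local .h5 file, then streams the bytes to the bucket.
save_model(model, 'gs://my-bucket/models/model.h5')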
Example 17
def potential_model(params, **kwargs):
    """Shortcut for generating potential model from paramters

    When creating the model, a params.yml is automatically created 
    in model_dir containing network_params and model_params.

    The potential model can also be initiated with the model_dir, 
    in that case, params.yml must locate in model_dir from which
    all parameters are loaded

    Args:
        params(str or dict): parameter dictionary or the model_dir
        **kwargs: additional options for the estimator, e.g. config
    """
    import os
    import yaml
    from tensorflow.python.lib.io.file_io import FileIO
    from datetime import datetime

    if isinstance(params, str):
        model_dir = params
        assert tf.gfile.Exists('{}/params.yml'.format(model_dir)),\
            "Parameters files not found."
        with FileIO(os.path.join(model_dir, 'params.yml'), 'r') as f:
            params = yaml.load(f, Loader=yaml.Loader)
    else:
        model_dir = params['model_dir']
        yaml.Dumper.ignore_aliases = lambda *args: True
        to_write = yaml.dump(params)
        params_path = os.path.join(model_dir, 'params.yml')
        if not tf.gfile.IsDirectory(model_dir):
            tf.gfile.MakeDirs(model_dir)
        if tf.gfile.Exists(params_path):
            original = FileIO(params_path, 'r').read()
            if original != to_write:
                tf.gfile.Rename(
                    params_path,
                    params_path + '.' + datetime.now().strftime('%y%m%d%H%M'))
        FileIO(params_path, 'w').write(to_write)

    model = tf.estimator.Estimator(model_fn=_potential_model_fn,
                                   params=params,
                                   model_dir=model_dir,
                                   **kwargs)
    return model
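A hedged call sketch. The docstring only says that params should contain model_dir, network_params and model_params; the concrete keys inside those dictionaries depend on the surrounding project, so the values below are placeholders:

params = {
    'model_dir': '/tmp/potential_model',
    'network_params': {},   # placeholder: network architecture options
    'model_params': {},     # placeholder: training/loss options
}
estimator = potential_model(params)

# Re-loading later only needs the directory; params.yml must already exist there.
estimator = potential_model('/tmp/potential_model')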
Example 18
    def __init__(self, filename):
        self.i2t = {}
        with FileIO(filename, mode="r") as fio:
            lines = fio.readlines()
            for line in lines:
                line = line.strip(' \r\n\t')
                datas = line.split('\t')
                self.i2t[int(datas[0])] = datas[1]
        print('load idx ', len(self.i2t))
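The loader above expects one tab-separated "index<TAB>token" pair per line. A tiny usage sketch; the enclosing class name (IndexToToken) and the file contents are hypothetical:

# idx2token.tsv (hypothetical contents, tab-separated):
# 0	<pad>
# 1	the
# 2	code
mapping = IndexToToken('idx2token.tsv')   # hypothetical class wrapping the __init__ above
print(mapping.i2t[1])                     # -> 'the'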
Example 19
def get_input():
    f = FileIO(os.path.join(FLAGS.buckets, "imdb/texts.pkl"), mode='r+')
    texts = pickle.load(f)
    f.close()

    tokenizer = Tokenizer(nb_words=num_words)  # nb_words was renamed to num_words in Keras 2
    tokenizer.fit_on_texts(texts[0:25000])
    sequences = tokenizer.texts_to_sequences(texts)
    sequences_reverse = [list(reversed(seq)) for seq in sequences]

    x = pad_sequences(sequences, maxlen=max_len)
    x_reverse = pad_sequences(sequences_reverse, maxlen=max_len)

    word_index = tokenizer.word_index
    embeddings_index = {}
    wordX = np.load(FileIO(os.path.join(FLAGS.buckets, "glove/embedding.300d.npy"), mode='r+'))
    allwords = pickle.load(FileIO(os.path.join(FLAGS.buckets, "glove/words.pkl"), mode='r+'))
    for i in range(len(allwords)):
        embeddings_index[allwords[i]] = wordX[i, :]
    embedding_matrix = np.zeros((num_words, 300))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None and i < num_words:
            embedding_matrix[i] = embedding_vector

    y_train = np.zeros((25000,), dtype=np.float32)
    y_test = np.zeros((25000,), dtype=np.float32)
    y_train[12500:25000] = np.ones((12500,), dtype=np.float32)
    y_test[12500:25000] = np.ones((12500,), dtype=np.float32)

    x_seq = np.zeros((50000, (max_len - 3) * 4), dtype=np.int64)  # np.int was removed in NumPy 1.24
    for i in range(50000):
        for j in range(max_len - 3):
            x_seq[i, j * 4] = x[i, j]
            x_seq[i, j * 4 + 1] = x[i][j + 1] + num_words
            x_seq[i, j * 4 + 2] = x[i][j + 2] + num_words * 2
            x_seq[i, j * 4 + 3] = x[i][j + 3] + num_words * 3

    x_train_0 = x[:25000]
    x_train_1 = x_reverse[:25000]
    x_train_2 = x_seq[:25000]
    x_test_0 = x[25000:]
    x_test_1 = x_reverse[25000:]
    x_test_2 = x_seq[25000:]

    result = []

    indice = np.arange(25000)
    np.random.shuffle(indice)
    result.append(x_train_0[indice])
    result.append(x_train_1[indice])
    result.append(x_train_2[indice])
    result.append(x_test_0[indice])
    result.append(x_test_1[indice])
    result.append(x_test_2[indice])
    result.append(y_train[indice])
    result.append(y_test[indice])
    
    result.append(embedding_matrix)
    return result
Example 20
    def update_datasets(self, filter=None):
        if filter is None:
            filter = self._filter

        close_file = True
        log.info("Updateing datasets from file list: %s", self._input_source)
        if hasattr(self._input_source, 'read'):
            input_file = self._input_source
            close_file = False
        elif isinstance(self._input_source,
                        str) and self._input_source.startswith("gs://"):
            log.info("Using tensorflow for IO")
            from tensorflow.python.lib.io.file_io import FileIO
            input_file = FileIO(self._input_source, "r")
            log.info("Tensorflow reported size: %d", input_file.size())
        else:
            input_file = open(self._input_source)

        lines = input_file.readlines()
        for line in lines:
            fpath = line.strip()
            parts = fpath.split("/")
            file_name = parts[-1]
            match = self._re.match(file_name)
            if not match:
                continue
            match_components = match.groupdict()
            dataset_path = self._prepend_path + fpath
            dataset_id = self.update_dataset(match_components=match_components,
                                             dataset_path=dataset_path)
            dataset = self.get_dataset_by_id(dataset_id)
            if not filter(dataset_id, match_components, dataset):
                self.remove_dataset_by_id(dataset_id)
        if close_file:
            input_file.close()
Example 21
    def update_datasets(self, filter=None):
        if filter is None:
            filter = self._filter

        file_list = []
        log.info("Updateing datasets from file list: %s", self._source_file)
        if self._source_file.startswith("gs://"):
            log.info("Using tensorflow for IO")
            from tensorflow.python.lib.io.file_io import FileIO
            input_file = FileIO(self._source_file, "r")
            log.info("Tensorflow reported size: %d", input_file.size())
        else:
            input_file = open(self._source_file)

        lines = input_file.readlines()
        for line in lines:
            fpath = line.strip()
            parts = fpath.split("/")
            file_name = parts[-1]
            directory_name = "/".join(parts[:-1])
            match = self._re.match(file_name)
            if not match:
                continue
            match_components = match.groupdict()
            dataset_path = self._prepend_path + fpath
            dataset_id = self.update_dataset(match_components=match_components,
                                             dataset_path=dataset_path)
            dataset = self.get_dataset_by_id(dataset_id)
            if not filter(dataset_id, match_components, dataset):
                self.remove_dataset_by_id(dataset_id)
        input_file.close()
Example 22
    def save(self):
        model_json = self.model.to_json()
        with FileIO("{}/{}.json".format(self.output_path, self.name),
                    "w") as json_file:
            json_file.write(model_json)

        fp = "{}.h5".format(self.name)

        if self.output_path.startswith('gs://'):
            self.model.save_weights(fp)
            copy_file_to_gcs(self.output_path, fp)
        else:
            self.model.save_weights("{}/{}.h5".format(self.output_path,
                                                      self.name))
Example 23
def write_words(word_model, output_file):
    """Writes the words from a .vec file to an output file of strings.

    Parameters
    ----------
    word_model : str
        path to word model file
    output_file : str
        path to output file

    Returns
    -------
    None
    """
    from tensorflow.python.lib.io.file_io import FileIO
    with FileIO(word_model, 'r') as input_vectors, FileIO(output_file,
                                                          'w') as output:
        for line in input_vectors:
            split = line.split()
            if len(split) > 2:
                word = split[0]
                output.write(word)
                output.write("\n")
Example 24
    def load(self, name=""):
        output_name = self.name if name == "" else name
        with FileIO("{}/{}.json".format(self.output_path, output_name),
                    "r") as json_file:
            loaded_model_json = json_file.read()
        self.model = model_from_json(loaded_model_json)

        fp = "{}.h5".format(output_name)

        if self.output_path.startswith('gs://'):
            copy_file_from_gcs(self.output_path, fp)
            self.model.load_weights(fp)
        else:
            self.model.load_weights("{}/{}.h5".format(self.output_path,
                                                      output_name))
Example 25
def copy(source, dest):
    """
    Copy from source to dest, create all necessary dirs.

    :param source: source file
    :param dest: dest file
    """
    with FileIO(source, mode='rb') as input_f:
        if '/' in dest and not isdir(dirname(dest)):
            makedirs(dirname(dest))
        with open(dest, mode='wb') as output_f:
            while 1:
                buf = input_f.read(1024 * 1024)
                if not buf:
                    break
                output_f.write(buf)
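A usage sketch; the bucket path is a placeholder:

# Pull a file from Cloud Storage into a local directory,
# creating local/models/ on the fly if it does not exist yet.
copy('gs://my-bucket/models/weights.h5', 'local/models/weights.h5')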
Example 26
def write_predictions(output_file, tuple_predictions, keys, ground_truth):
    """
    Write predictions to a TSV file.

    :param output_file: output file
    :param tuple_predictions: predictions stored in a tuple per sample
    :param keys: iterable of sample keys (UUIDs)
    :param ground_truth: ground-truth object that knows the index->label conversion
    """
    if exists(output_file):
        print('WARNING: Overwriting {}'.format(output_file))

    with FileIO(output_file, mode='w') as f:
        # convert indices to label names using index_to_label
        for key, indices in zip(keys, tuple_predictions):
            line = key + ground_truth.to_labels(indices)
            f.write(line + '\n')
Example 27
def get_model():
    ngram_embed = np.load(
        FileIO(os.path.join(FLAGS.buckets, "ngram_embedding.npy"), mode='r+'))
    ngram_embedding = np.random.randn(num_ngram + 1, word_dimension)
    ngram_embedding[1:] = ngram_embed

    input_1 = Input(shape=(max_len, ))
    embedding_1 = Embedding(input_dim=num_ngram + 1,
                            output_dim=word_dimension,
                            weights=[ngram_embedding],
                            trainable=True)(input_1)
    x = GRU(word_dimension)(embedding_1)
    # x=Bidirectional(GRU(word_dimension),merge_mode='concat')(embedding_1)
    output_1 = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=[input_1], outputs=[output_1])
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model
Example 28
def load_data(path, vocab, pad=32, numfiles=0, lowercase=False):
    X, Xu = [], []
    t2s = Text2Seq(vocab, vocab_is_lowercase=lowercase)
    files = recursively_list_files(path)
    for i, fname in enumerate(tqdm(files, ascii=True, mininterval=0.5)):
        if 0 < numfiles < (i + 1):
            break  # Process at most `numfiles` files
        # with FileIO(fname, "r", encoding="utf-8") as f:
        with FileIO(fname, "r") as f:
            text = f.read()
            seq, aux = t2s.toseq(text)
            X.extend(seq)
            Xu.extend(aux)
            X.extend([0] * pad)
            Xu.extend([[0, 0]] * pad)

    X = np.array(X, dtype="int32")
    Xu = np.array(Xu, dtype="float32")
    return X, Xu
Example 29
def load_data_sequences(path, vocab, seqlen, stride, numfiles=0):
    XX, YY, XXu, YYu = [], [], [], []
    t2s = Text2Seq(vocab)
    files = recursively_list_files(path)
    for i, fname in enumerate(tqdm(files, ascii=True)):
        if 0 < numfiles < (i + 1):
            break  # Process at most `numfiles` files
        with FileIO(fname, "r") as f:
            seq, unk = t2s.toseq(f.read())
            Xi, Yi = seqwindows(seq, seqlen, stride)
            Xui, Yui = seqwindows(unk, seqlen, stride, dtype="float32")
            XX.append(Xi)
            YY.append(Yi)
            XXu.append(Xui)
            YYu.append(Yui)
    X = np.concatenate(XX)
    Y = np.concatenate(YY)
    Xu = np.concatenate(XXu)
    Yu = np.concatenate(YYu)
    return X, Y, Xu, Yu
Example 30
def train():
    x_place = tf.placeholder(dtype=tf.int64, shape=(batch_size, 1))
    y_place = tf.placeholder(dtype=tf.int64, shape=(batch_size, 1))
    with tf.device("/cpu:0"):
        embedding_doc = tf.Variable(
            tf.random_uniform([num_ngram, 300], -0.5, 0.5))
        nce_weights = tf.get_variable('nce_weights_words', [num_words, 300],
                                      trainable=True)
        nce_biases = tf.Variable(tf.zeros([num_words]), trainable=True)

        input_1 = tf.nn.embedding_lookup(embedding_doc, x_place)
    input_2 = tf.reshape(input_1, [-1, 300])
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=y_place,
                       inputs=input_2,
                       num_sampled=num_sampled,
                       num_classes=num_words))
    optimizer = tf.train.AdamOptimizer().minimize(loss)
    init = tf.global_variables_initializer()

    sess = tf.Session()
    sess.run(init)
    x, y, embedding_metrix = prepare_data()
    init_nce = tf.assign(nce_weights, embedding_metrix)
    sess.run(init_nce)
    start = 0
    for i in range(1000000):
        x_1, _y, start = get_input(x, y, start)
        # _loss, _ = sess.run([loss, optimizer], feed_dict={x1_place: x_1, x2_place: x_2, y_place: _y})
        _loss, _ = sess.run([loss, optimizer],
                            feed_dict={
                                x_place: x_1,
                                y_place: _y
                            })
        if i % 300 == 0:
            print(i, " loss ", _loss)
    np.save(
        FileIO(os.path.join(FLAGS.buckets, "ngram_embedding.npy"), mode='w+'),
        embedding_doc.eval(sess))
Example 31
def load_vocab(filename, maxwords=0):
    """
    Load newline-separated words from file to dict mapping them to unique ids.
    :param maxwords: Max number of words to load. Load all by default.
    Returns (list of words, word->id map)
    """
    pad = "·"  # "<#PAD#>"
    vocab = dict()
    words = []
    counter = 1  # start off with 1 so that embedding matrix's first vector is zero and second is for unknown
    words.append(pad)
    with FileIO(filename, "r") as f:
        for i, line in enumerate(f):
            if 0 < maxwords < i + 1:
                break
            word = line.strip()
            words.append(word)
            vocab[word] = counter
            counter += 1

    return words, vocab
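load_vocab pairs naturally with load_embeddings from Example 12: the word->id map it returns is the vocab argument there, and row 0 of the resulting matrix stays zero for padding. A hedged sketch with placeholder file names:

words, vocab = load_vocab('vocab.txt', maxwords=50000)
em = load_embeddings(vocab, dim=300, filename='glove.300d.txt')
assert em.shape == (len(vocab) + 1, 300)
print(em[vocab[words[1]]][:5])   # first few components of the first real word's vector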