Code Example #1
def init_test():
    mode_train, mode_test = 'TR', 'TE'

    dict_obj = set_dict.Dictionary()

    # train object
    params_train = set_params.ParamsClass(mode=mode_train)
    dir_train = set_dir.Directory(mode_train)
    params_train.num_classes = len(dict_obj.label_dict)

    # test object
    params_test = set_params.ParamsClass(mode=mode_test)
    dir_test = set_dir.Directory(mode_test)
    params_test.num_instances, params_test.indices = get_length(dir_test.data_filename)
    params_test.batch_size = 1
    params_test.num_classes = len(dict_obj.label_dict)

    word_emb_path = dir_train.word_embedding
    word_emb_matrix = np.float32(np.genfromtxt(word_emb_path, delimiter=' '))
    params_train.vocab_size = params_test.vocab_size = len(word_emb_matrix)

    print('***** INITIALIZING TF GRAPH *****')

    session = tf.Session()

    with tf.variable_scope("classifier", reuse=None):
        test_obj = model.DeepAttentionClassifier(params_test, dir_test)

    model_saver = tf.train.Saver()
    print('Loading model ...')
    model_saver.restore(session, set_dir.Directory('TE').test_model)

    print('**** MODEL LOADED ****\n')

    return session, test_obj
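For orientation, here is a minimal usage sketch (not part of the source project) showing how the session and model object returned by init_test could drive a prediction pass. It assumes test_obj exposes the same word_input, seq_length, label and prediction tensors, and the same params/dir_obj attributes, that appear in the run_epoch examples below; reader and set_dict are assumed to be imported as in those examples.

# Hedged usage sketch: run the restored test graph on one batch.
session, test_obj = init_test()
params, dir_obj = test_obj.params, test_obj.dir_obj
dict_obj = set_dict.Dictionary()
for input_seq_arr, length_arr, label_arr in reader.data_iterator(
        params, dir_obj.data_filename, dir_obj.label_filename,
        params.indices, dict_obj):
    prediction = session.run(test_obj.prediction,
                             feed_dict={test_obj.word_input: input_seq_arr,
                                        test_obj.seq_length: length_arr,
                                        test_obj.label: label_arr})
    print('Predicted label id:', prediction)
    break  # one batch is enough for a smoke test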
Code Example #2
def generate_indexed_labels():
    label_hash = {}
    input_file = open(set_dir.Directory('TR').label_filename).readlines()
    curr_count = 0
    for each_label in input_file:
        curr_label = each_label.strip()
        if curr_label not in label_hash:
            label_hash[curr_label] = curr_count
            curr_count += 1

    label_map_file = open(set_dir.Directory('TR').label_map_dict, 'wb')
    pickle.dump(label_hash, label_map_file, protocol=pickle.HIGHEST_PROTOCOL)

    print('Total classes %d' % (len(label_hash)))
Code Example #3
File: predict_img.py  Project: krayush07/tf-vgg
def main():
    set_dir_obj = set_dir.Directory()
    class_arr = get_imagenet_class()

    with tf.Session() as sess:
        img_vgg_obj = ImagePredictVGG(set_dir_obj.weights_dir)

        test_folder = set_dir_obj.sample_test_dir
        gold_label, images = [], []

        for filename in os.listdir(test_folder):
            img_arr = resize_image(read_image(test_folder + '/' + filename),
                                   img_size=(224, 224))
            images.append(img_arr)
            gold_label.append(filename)

        prob = sess.run(img_vgg_obj.prob,
                        feed_dict={img_vgg_obj.img_batch: np.asarray(images)})
        for idx, each_prob in enumerate(prob):
            max_idx = np.argmax(each_prob)  # avoid shadowing the built-in max
            plt.text(0.5,
                     -4.5,
                     'Predicted: ' + class_arr[max_idx].strip() + ', Prob.: ' +
                     str(each_prob[max_idx]),
                     fontsize=12)
            plot_image(images[idx])
            print(gold_label[idx], max_idx, class_arr[max_idx].strip(),
                  each_prob[max_idx])
Code Example #4
def run_epoch(session, eval_op, model_obj, dict_obj, verbose=False):
    print('\nrun epoch')

    output_file = open(set_dir.Directory('TE').log_emb_path + '/word_embedding.csv', 'w')

    params = model_obj.params
    dir_obj = model_obj.dir_obj
    data_filename = dir_obj.data_filename
    label_filename = dir_obj.label_filename

    for step, (input_seq_arr, length_arr, label_arr) \
            in enumerate(reader.data_iterator(params, data_filename, label_filename, model_obj.params.indices, dict_obj)):

        feed_dict = {model_obj.word_input: input_seq_arr,
                     model_obj.seq_length: length_arr,
                     model_obj.label: label_arr}

        emb_matrix, logits, _ = session.run([model_obj.word_emb_matrix,
                                             model_obj.logits,
                                             eval_op],
                                            feed_dict=feed_dict)

        for each_emb in emb_matrix:
            output_file.write(' '.join(str(x) for x in each_emb).strip() + '\n')

        break

    print('Embedding file written ...')
Code Example #5
 def __init__(self, mode='TR'):
     """
     :param mode: 'TR' for train, 'TE' for test, 'VA' for valid
     """
     self.mode = mode
     self.rel_dir = set_dir.Directory(mode)
     # gloveDict = rel_dir.glove_path
     self.word_dict = pickle.load(open(self.rel_dir.glove_present_training_word_vocab, 'rb'))
     self.word_emb = self.rel_dir.word_embedding
     self.glove_present_word_csv = np.float32(np.genfromtxt(self.word_emb, delimiter=' '))
     self.label_dict = pickle.load(open(self.rel_dir.label_map_dict, 'rb'))
Code Example #6
    def generate_vocab(self, training_file):
        word_dict = {}
        word_counter = 2
        max_sequence_length = 0

        training_file_pointer = open(training_file, 'r')

        print('\nReading Training File .... ')

        tokenized_training_file = open(self.dataDir + '/tokenized_training',
                                       'w')

        for line in training_file_pointer:
            line = line.rstrip()
            string = re.split(r'\t', line)
            size = len(string)
            tokenized_training_string = ''
            for j in range(size):
                string[j] = re.sub(r'#[0-9]+', r'', string[j].strip())
                # tokenized_sent = nltk.word_tokenize((string[j]).decode('utf-8'))
                # tokenized_sent.append('<eos>')
                # tokenized_sent = string[j].split(" ")
                tokenized_sent = string[j].split(" ")
                tokenized_string = ' '.join(tokenized_sent)
                tokenized_training_string += tokenized_string + '\t'

                for token in tokenized_sent:
                    if token not in word_dict:
                        word_dict[token] = word_counter
                        word_counter += 1

            # tokenized_training_file.write(tokenized_training_string.encode('utf-8').rstrip('\t'))
            tokenized_training_file.write(
                tokenized_training_string.rstrip('\t'))
            tokenized_training_file.write('\n')
            curr_seq_length = len(tokenized_training_string.split())
            if (curr_seq_length > max_sequence_length):
                max_sequence_length = curr_seq_length

        word_vocab = open(set_dir.Directory('TR').word_vocab_dict, 'wb')

        pickle.dump(word_dict, word_vocab, protocol=cPickle.HIGHEST_PROTOCOL)

        word_vocab.close()
        training_file_pointer.close()
        tokenized_training_file.close()

        print(
            'Reading Completed \n ========================== \n Unique tokens: excluding padding and unknown words %d \n Max. sequence length: %d\n ==========================\n'
            % (word_counter - 2, max_sequence_length))

        # print(word_dict)
        return word_dict
Code Example #7
def generate_indexed_labels():
    label_hash = {}
    # input_file = open(set_dir.Directory('TR').label_filename).readlines()
    # curr_count = 0
    # for each_label in input_file:
    #     curr_label = each_label.strip()
    #     if not label_hash.has_key(curr_label):
    #         label_hash[curr_label] = curr_count
    #         curr_count += 1

    label_hash["joy"] = 0
    label_hash["sadness"] = 1
    label_hash["disgust"] = 2
    label_hash["anger"] = 3
    label_hash["fear"] = 4
    label_hash["surprise"] = 5
    label_hash["neutral"] = 6

    label_map_file = open(set_dir.Directory('TR').label_map_dict, 'wb')
    pickle.dump(label_hash, label_map_file, protocol=pickle.HIGHEST_PROTOCOL)

    print('Total classes %d' % len(label_hash))
Code Example #8
def run_epoch(session, eval_op, model_obj, dict_obj, verbose=False):
    epoch_combined_loss = 0.0
    total_correct = 0.0
    total_instances = 0.0
    print('\nrun epoch')

    output_file = open(set_dir.Directory('TE').test_cost_path, 'w')

    params = model_obj.params
    dir_obj = model_obj.dir_obj
    data_filename = dir_obj.data_filename
    label_filename = dir_obj.label_filename

    for step, (input_seq_arr, length_arr, label_arr) \
            in enumerate(reader.data_iterator(params, data_filename, label_filename, model_obj.params.indices, dict_obj)):

        feed_dict = {
            model_obj.word_input: input_seq_arr,
            model_obj.seq_length: length_arr,
            model_obj.label: label_arr
        }

        loss, prediction, probabilities, _ = session.run([
            model_obj.loss, model_obj.prediction, model_obj.probabilities,
            eval_op
        ],
                                                         feed_dict=feed_dict)

        total_correct += np.sum(prediction == label_arr)
        total_instances += params.batch_size
        epoch_combined_loss += loss

        for each_pred in prediction:
            output_file.write(str(each_pred + 1) + '\n')

    print('CE loss: %.4f, Total instances: %d, Correct: %d, Accuracy: %.4f'
          % (epoch_combined_loss, total_instances, total_correct,
             (total_correct / total_instances) * 100))

    return epoch_combined_loss
Code Example #9
import os
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
from global_module.settings_module import set_dir
import numpy as np

#TODO: clean code

LOG_DIR = set_dir.Directory('TE').log_emb_path
metadata = LOG_DIR + '/word_metadata.tsv'
wordemb = LOG_DIR + '/word_embedding.csv'
emb = tf.Variable(np.genfromtxt(wordemb), name='word_emb')

with tf.Session() as sess:
    saver = tf.train.Saver([emb])

    sess.run(emb.initializer)
    saver.save(sess, os.path.join(LOG_DIR, 'emb.ckpt'))

    config = projector.ProjectorConfig()
    # One can add multiple embeddings.
    embedding = config.embeddings.add()
    embedding.tensor_name = emb.name
    # Link this tensor to its metadata file (e.g. labels).
    embedding.metadata_path = metadata
    # Saves a config file that TensorBoard will read during startup.
    projector.visualize_embeddings(tf.summary.FileWriter(LOG_DIR), config)
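As a quick follow-up (a sketch, not part of the original script), the saved embedding rows can be paired with the word metadata for inspection, assuming both files already exist at the metadata and wordemb paths defined above:

# Sketch: print the first few metadata words next to their embedding rows.
with open(metadata) as meta_file:
    words = [line.rstrip('\n') for line in meta_file]
vectors = np.genfromtxt(wordemb)
for word, vector in zip(words[:5], vectors[:5]):
    print(word, vector[:3])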
Code Example #10
 def __init__(self):
     self.dataDir = set_dir.Directory('TR').data_path
     self.vocabDir = set_dir.Directory('TR').vocab_path
     self.gloveDict = set_dir.Directory('TR').glove_path
     self.config = set_params.ParamsClass('TR')
Code Example #11
 def util(self):
     training_file = set_dir.Directory('TR').data_filename
     self.generate_vocab(training_file)
     vocab_size = self.extract_glove_vectors(
         set_dir.Directory('TR').word_vocab_dict, self.gloveDict)
     return vocab_size
Code Example #12
import sys
from global_module.settings_module import set_dir

# root_folder =

dir_obj = set_dir.Directory('TE')
cost_file = open(dir_obj.test_cost_path, 'r')
output = open(dir_obj.test_pred_path, 'w')
output_seq = open(dir_obj.test_seq_op_path, 'w')

test_seq_file = open(dir_obj.data_filename, 'r')

count_iter = 1
min_pos = -1
step_val = 6
pred_ans = ''
pred_goal = ''
pred_slot = ''
multiplier = -1.0  # if probability, 1.0 if cost

min_cost = sys.float_info.max
for costLine, pred_line in zip(cost_file, test_seq_file):
    costLine = costLine.rstrip()
    ans = pred_line.rstrip()

    cost = multiplier * float(costLine)
    if (count_iter < step_val):
        if (min_cost > cost):
            min_cost = cost
            min_pos = count_iter
            pred_ans = ans
Code Example #13
File: predict_img.py  Project: krayush07/tf-vgg
def get_imagenet_class():
    file = open(set_dir.Directory().imagenet_class_dir, 'r')
    class_arr = file.readlines()
    return class_arr
Code Example #14
def run_train(dict_obj):
    mode_train, mode_valid, mode_all = 'TR', 'VA', 'ALL'

    # train object

    params_train = set_params.ParamsClass(mode=mode_train)
    dir_train = set_dir.Directory(mode_train)
    params_train.num_instances, params_train.indices = get_length(
        dir_train.data_filename)

    # valid object

    params_valid = set_params.ParamsClass(mode=mode_valid)
    dir_valid = set_dir.Directory(mode_valid)
    params_valid.num_instances, params_valid.indices = get_length(
        dir_valid.data_filename)

    params_train.num_classes = params_valid.num_classes = len(
        dict_obj.label_dict)

    if params_train.enable_shuffle:
        random.shuffle(params_train.indices)
        random.shuffle(params_valid.indices)

    min_loss = sys.float_info.max

    word_emb_path = dir_train.word_embedding
    word_emb_matrix = np.float32(np.genfromtxt(word_emb_path, delimiter=' '))
    params_train.vocab_size = params_valid.vocab_size = len(word_emb_matrix)

    print('***** INITIALIZING TF GRAPH *****')

    timestamp = str(int(time.time()))
    train_out_dir = os.path.abspath(
        os.path.join(dir_train.log_path, "train", timestamp))
    valid_out_dir = os.path.abspath(
        os.path.join(dir_train.log_path, "valid", timestamp))
    print("Writing to {}\n".format(train_out_dir))

    with tf.Graph().as_default(), tf.Session() as session:

        # random_normal_initializer = tf.random_normal_initializer()
        # random_uniform_initializer = tf.random_uniform_initializer(-params_train.init_scale, params_train.init_scale)
        xavier_initializer = tf.contrib.layers.xavier_initializer(
            uniform=True, seed=None, dtype=tf.float32)

        with tf.variable_scope("classifier",
                               reuse=None,
                               initializer=xavier_initializer):
            train_obj = model.DeepAttentionClassifier(params_train, dir_train)

        train_writer = tf.summary.FileWriter(train_out_dir, session.graph)
        valid_writer = tf.summary.FileWriter(valid_out_dir)

        if not params_train.enable_checkpoint:
            session.run(tf.global_variables_initializer())

        if params_train.enable_checkpoint:
            ckpt = tf.train.get_checkpoint_state(dir_train.model_path)
            if ckpt and ckpt.model_checkpoint_path:
                print("Loading model from: %s" % ckpt.model_checkpoint_path)
                tf.train.Saver().restore(session, ckpt.model_checkpoint_path)
        elif not params_train.use_random_initializer:
            session.run(
                tf.assign(train_obj.word_emb_matrix,
                          word_emb_matrix,
                          name="word_embedding_matrix"))

        with tf.variable_scope("classifier",
                               reuse=True,
                               initializer=xavier_initializer):
            valid_obj = model.DeepAttentionClassifier(params_valid, dir_valid)

        print('**** TF GRAPH INITIALIZED ****')

        start_time = time.time()
        for i in range(params_train.max_max_epoch):
            lr_decay = params_train.lr_decay**max(i - params_train.max_epoch,
                                                  0.0)
            train_obj.assign_lr(session, params_train.learning_rate * lr_decay)

            # print(params_train.learning_rate * lr_decay)

            print('\n++++++++=========+++++++\n')

            print("Epoch: %d Learning rate: %.5f" %
                  (i + 1, session.run(train_obj.lr)))
            train_loss, _ = run_epoch(session,
                                      train_writer,
                                      train_obj.train_op,
                                      min_loss,
                                      train_obj,
                                      dict_obj,
                                      i,
                                      verbose=True)
            print("Epoch: %d Train loss: %.3f" % (i + 1, train_loss))

            valid_loss, curr_loss = run_epoch(session, valid_writer,
                                              tf.no_op(), min_loss, valid_obj,
                                              dict_obj, i)
            if curr_loss < min_loss:
                min_loss = curr_loss

            print("Epoch: %d Valid loss: %.3f" % (i + 1, valid_loss))

            curr_time = time.time()
            print('1 epoch run takes ' + str(((curr_time - start_time) /
                                              (i + 1)) / 60) + ' minutes.')

        train_writer.close()
        valid_writer.close()
Code Example #15
def util():
    raw_training_file = set_dir.Directory('TR').raw_train_path
    training_file = set_dir.Directory('TR').data_filename
    sample_train_file(raw_training_file, training_file,
                      set_params.ParamsClass().sampling_threshold)
Code Example #16
def extract_glove_vectors(word_vocab_file, glove_file):
    glove_vocab_dict = pickle.load(open(glove_file, 'rb'))
    word_vocab_dict = pickle.load(open(word_vocab_file, 'rb'))

    length_word_vector = 0

    glove_present_training_word_vocab_dict = collections.OrderedDict()
    glove_present_training_word_counter = 2  # 3
    # glove_present_training_word_counter = 1
    glove_present_word_vector_dict = collections.OrderedDict()

    glove_present_training_word_vocab_dict['PAD'] = 0
    glove_present_training_word_vocab_dict['UNK'] = 1  # 2
    glove_present_word_vector_dict[1] = glove_vocab_dict['food']

    if (length_word_vector == 0):
        length_word_vector = len(glove_vocab_dict['food'].split(' '))

    for key, value in word_vocab_dict.items():

        if (config.all_lowercase):
            if (glove_vocab_dict.has_key(key.lower())):
                key = key.lower()
            elif (glove_vocab_dict.has_key(key)):
                key = key
            elif (glove_vocab_dict.has_key(key.title())):
                key = key.title()
            elif (glove_vocab_dict.has_key(key.upper())):
                key = key.upper()
            else:
                key = key.lower()

        if (key not in glove_present_training_word_vocab_dict):
            if (config.use_unknown_word):
                if (glove_vocab_dict.has_key(key)
                        and config.use_random_initializer == False):
                    if (key != 'UNK'):
                        glove_present_training_word_vocab_dict[
                            key] = glove_present_training_word_counter
                        glove_present_word_vector_dict[
                            glove_present_training_word_counter] = glove_vocab_dict.get(
                                key)
                        glove_present_training_word_counter += 1
                else:
                    glove_present_training_word_vocab_dict[
                        key] = glove_present_training_word_counter
                    vec_str = ''
                    for i in range(length_word_vector):
                        vec_str += str(round(random.uniform(-0.9, 0.9),
                                             6)) + ' '
                    glove_present_word_vector_dict[
                        glove_present_training_word_counter] = vec_str.strip()
                    glove_present_training_word_counter += 1
            elif (glove_vocab_dict.has_key(key)
                  and config.use_random_initializer == False
                  and config.use_unknown_word == False):
                if (key != 'UNK'):
                    glove_present_training_word_vocab_dict[
                        key] = glove_present_training_word_counter
                    glove_present_word_vector_dict[
                        glove_present_training_word_counter] = glove_vocab_dict.get(
                            key)
                    glove_present_training_word_counter += 1
            elif (config.use_random_initializer):
                glove_present_training_word_vocab_dict[
                    key] = glove_present_training_word_counter
                glove_present_word_vector_dict[
                    glove_present_training_word_counter] = glove_vocab_dict.get(
                        'UNK')
                glove_present_training_word_counter += 1
                # else :
                #     print('Error')

    word_vector_file = open(set_dir.Directory('TR').word_embedding, 'w')
    writer = csv.writer(word_vector_file)
    string = ''
    for i in range(length_word_vector):
        string += '0 '
    word_vector_file.write(string.rstrip(' ') + '\n')
    # word_vector_file.write(string.rstrip(' ') + '\n') # zeros vector (id 1)
    for key, value in glove_present_word_vector_dict.items():
        writer.writerow([value])

    glove_present_training_word_vocab = open(
        set_dir.Directory('TR').glove_present_training_word_vocab, 'wb')
    pickle.dump(glove_present_training_word_vocab_dict,
                glove_present_training_word_vocab)

    print(glove_present_training_word_vocab_dict)

    print(
        'Glove_present_unique_training_tokens, Total unique tokens, Glove token size'
    )
    print(len(glove_present_training_word_vocab_dict), len(word_vocab_dict),
          len(glove_vocab_dict))

    word_vector_file.close()

    print('\nVocab Size:')
    # print(len(glove_present_word_vector_dict)+2)
    print(len(glove_present_training_word_vocab_dict))

    glove_present_training_word_vocab.close()
    # return(len(glove_present_word_vector_dict)+2)

    #####
    #   WORD METADATA
    ####
    meta_file = open(dirObj.word_emb_tsv, 'w')
    # meta_file.write('Word' + '\t' + 'Id' + '\n')
    for key, value in glove_present_training_word_vocab_dict.items():
        # meta_file.write(key + '\t' + str(value) + '\n')
        meta_file.write(key + '\n')
    meta_file.close()
    #####

    return len(glove_present_word_vector_dict) + 1
Code Example #17
    def extract_glove_vectors(self, word_vocab_file, glove_file):
        glove_vocab_dict = cPickle.load(open(glove_file, 'rb'))
        word_vocab_dict = cPickle.load(open(word_vocab_file, 'rb'))

        length_word_vector = 0

        glove_present_training_word_vocab_dict = {}
        glove_present_training_word_counter = 2  # 3
        # glove_present_training_word_counter = 1
        glove_present_word_vector_dict = {}

        glove_present_training_word_vocab_dict['UNK'] = 1  # 2
        glove_present_word_vector_dict[1] = glove_vocab_dict.get('UNK')

        if (length_word_vector == 0):
            length_word_vector = len(glove_vocab_dict.get('the').split(' '))

        for key, value in word_vocab_dict.items():
            if (self.config.all_lowercase):
                if (glove_vocab_dict.has_key(key.lower())):
                    key = key.lower()
                elif (glove_vocab_dict.has_key(key)):
                    key = key
                elif (glove_vocab_dict.has_key(key.title())):
                    key = key.title()
                elif (glove_vocab_dict.has_key(key.upper())):
                    key = key.upper()
                else:
                    key = key.lower()

            if (not glove_present_training_word_vocab_dict.has_key(key)):
                if (self.config.use_unknown_word):
                    if (glove_vocab_dict.has_key(key)
                            and self.config.use_random_initializer == False):
                        if (key != 'UNK'):
                            glove_present_training_word_vocab_dict[
                                key] = glove_present_training_word_counter
                            glove_present_word_vector_dict[
                                glove_present_training_word_counter] = glove_vocab_dict.get(
                                    key)
                            glove_present_training_word_counter += 1
                    else:
                        glove_present_training_word_vocab_dict[
                            key] = glove_present_training_word_counter
                        vec_str = ''
                        for i in range(length_word_vector):
                            vec_str += str(round(random.uniform(-0.9, 0.9),
                                                 6)) + ' '
                        glove_present_word_vector_dict[
                            glove_present_training_word_counter] = vec_str.strip(
                            )
                        glove_present_training_word_counter += 1
                elif (glove_vocab_dict.has_key(key)
                      and self.config.use_random_initializer == False
                      and self.config.use_unknown_word == False):
                    if (key != 'UNK'):
                        glove_present_training_word_vocab_dict[
                            key] = glove_present_training_word_counter
                        glove_present_word_vector_dict[
                            glove_present_training_word_counter] = glove_vocab_dict.get(
                                key)
                        glove_present_training_word_counter += 1
                elif (self.config.use_random_initializer):
                    glove_present_training_word_vocab_dict[
                        key] = glove_present_training_word_counter
                    glove_present_word_vector_dict[
                        glove_present_training_word_counter] = glove_vocab_dict.get(
                            'UNK')
                    glove_present_training_word_counter += 1
                    # else :
                    #     print('Error')

        word_vector_file = open(set_dir.Directory('TR').word_embedding, 'w')
        writer = csv.writer(word_vector_file)
        string = ''
        for i in range(length_word_vector):
            string += '0 '
        word_vector_file.write(string.rstrip(' ') + '\n')
        # word_vector_file.write(string.rstrip(' ') + '\n') # zeros vector (id 1)
        for key, value in glove_present_word_vector_dict.items():
            writer.writerow([value])

        glove_present_training_word_vocab = open(
            set_dir.Directory('TR').glove_present_training_word_vocab, 'wb')
        pickle.dump(glove_present_training_word_vocab_dict,
                    glove_present_training_word_vocab,
                    protocol=cPickle.HIGHEST_PROTOCOL)

        print(glove_present_training_word_vocab_dict)

        print(
            'Glove_present_unique_training_tokens, Total unique tokens, Glove token size'
        )
        print(len(glove_present_word_vector_dict), len(word_vocab_dict),
              len(glove_vocab_dict))

        word_vector_file.close()

        print('\nVocab Size:')
        # print(len(glove_present_word_vector_dict)+2)
        print(len(glove_present_word_vector_dict) + 1)

        glove_present_training_word_vocab.close()
        # return(len(glove_present_word_vector_dict)+2)
        return (len(glove_present_word_vector_dict) + 1)
Code Example #18
 def __init__(self):
     self.glove_dict = cPickle.load(
         open(set_dir.Directory('TR').glove_path, 'rb'))
     self.config = set_params.ParamsClass('TR')
Code Example #19
# id = 1 for unknown words

# word_vocab.pkl                          -> map of unique training words and ids
# glove_present_training_word_vocab.pkl   -> map of unique training words that are present in glove data and their new ids
# word_embedding.csv                      -> word embedding corresponding to glove_present_words

import pickle
import csv
import random
import re

from global_module.settings_module import set_dir, set_params
import collections

dirObj = set_dir.Directory('TR')
dataDir = dirObj.data_path
vocabDir = dirObj.vocab_path
gloveDict = dirObj.glove_path
config = set_params.ParamsClass('TR')


def generate_vocab(training_file):
    word_dict = collections.OrderedDict()
    word_counter = 2
    max_sequence_length = 0

    training_file_pointer = open(training_file, 'r')

    print('\nReading Training File .... ')
Code Example #20
# id = 0 for padding
# id = 1 for unknown words

# word_vocab.pkl                          -> map of unique training words and ids
# glove_present_training_word_vocab.pkl   -> map of unique training words that are present in glove data and their new ids
# word_embedding.csv                      -> word embedding corresponding to glove_present_words

import cPickle
import csv
import pickle
import random
import re

from global_module.settings_module import set_dir, set_params

dataDir = set_dir.Directory('TR').data_path
vocabDir = set_dir.Directory('TR').vocab_path
gloveDict = set_dir.Directory('TR').glove_path
config = set_params.ParamsClass('TR')


def generate_vocab(training_file):
    word_dict = {}
    word_counter = 2
    max_sequence_length = 0

    training_file_pointer = open(training_file, 'r')

    print('\nReading Training File .... ')

    tokenized_training_file = open(dataDir + '/tokenized_training', 'w')
Code Example #21
import cPickle as pickle
from global_module.settings_module import set_dir

rel_dir = set_dir.Directory('TR')


def convert(test_filename):
    label_dict = pickle.load(open(rel_dir.label_map_dict, 'rb'))
    test_file = open(test_filename, 'r')
    op_file = open(test_filename + '_output.txt', 'w')

    new_map = {}

    for actual_id, mapped_id in label_dict.iteritems():
        new_map[mapped_id] = actual_id

    for line in test_file:
        line = line.strip()
        op_file.write(new_map[int(line) - 1] + '\n')

    op_file.close()
    test_file.close()

    # convert('/home/aykumar/aykumar_home/self/deep-text-classifier/global_module/utility_dir/folder1/output/dummy_rnn.txt')
Code Example #22
def main():
    params = set_params.ParamsClass(mode='TR')
    dir_obj = set_dir.Directory('TR')
    classifier_obj = DeepAttentionClassifier(params, dir_obj)
Code Example #23
import cPickle
import re

from global_module.settings_module import set_dir, set_params

glove_dict = cPickle.load(open(set_dir.Directory('TR').glove_path, 'rb'))

config = set_params.ParamsClass('TR')


def sample_train_file(raw_training_file, training_file, threshold):
    raw_training_file_pointer = open(raw_training_file, 'r')
    training_file_pointer = open(training_file, 'w')
    word_dict = {}

    print('\nReading raw training file .... ')

    for line in raw_training_file_pointer:
        line = line.rstrip()
        # line = line.lower()
        string = re.split(r'\t', line)
        size = len(string)
        tokenized_training_string = ''
        for j in range(size):
            # string[j] = re.sub(r'#[0-9]+', r'', string[j].strip())
            tokenized_sent = string[j].split(" ")
            tokenized_string = ' '.join(tokenized_sent)
            tokenized_training_string += tokenized_string + '\t'

            for token in tokenized_sent:
                if token not in word_dict:
Code Example #24
def main():
    training_file = set_dir.Directory('TR').data_filename
    word_dict = generate_vocab(training_file)
    vocab_size = extract_glove_vectors(
        set_dir.Directory('TR').word_vocab_dict, gloveDict)
    return vocab_size