Example #1
def main(args):
    train_data = get_data(args.path_train)
    dev_data = get_data(args.path_dev)
    test_data = get_data(args.path_test)

    word_to_int, int_to_word, sz1, sz2, sz3 = get_vocab(
        train_data, args.t1, args.t2)
    train_data = tokenize(train_data, word_to_int)
    dev_data = tokenize(dev_data, word_to_int)
    test_data = tokenize(test_data, word_to_int)

    train_data = batchify(train_data, args.batch_size)
    dev_data = batchify(dev_data, args.dev_batch_size)
    test_data = batchify(test_data, args.dev_batch_size)

    train_data = train_data.cuda()
    dev_data = dev_data.cuda()
    test_data = test_data.cuda()

    vocab_sizes = [sz1, sz2, sz3]
    g_list = [int(2**(args.N - i)) for i in range(args.N)]
    model = AWD_LSTM(args.embed_dim, args.h_size, args.layers, vocab_sizes,
                     args.m, args.k, args.N, g_list, args.n_layers,
                     args.dropouti, args.dropouth, args.dropout,
                     args.dropout_embed, args.dropoutw).cuda()

    train(dev_data, train_data, model, args)
    test_loss, test_perplexity = validate(test_data, model)
    print('test loss=', test_loss, 'test perplexity=', test_perplexity)
Example #2
    def fit(self, train_data):
        # Learn the parameters from the training data
        data = get_feature_vectors(train_data[0], binary=True)
        lab = train_data[1]

        c = Counter(lab)
        pos_count = c[1]
        neg_count = c[-1]

        total = pos_count + neg_count

        self.pos_prob = float(pos_count) / total
        self.neg_prob = float(neg_count) / total

        stop_indices = []
        current_vocab = get_vocab()
        for word in self.stop_words:
            if word in current_vocab:
                stop_indices.append(current_vocab[word])

        for i in range(0, len(data)):  # movie reviews
            for j in range(0, len(data[i])):  # dictionary
                if j not in stop_indices:
                    if lab[i] == 1:
                        self.pos_word[j] += data[i][j]
                    elif lab[i] == -1:
                        self.neg_word[j] += data[i][j]

        for i in range(0, len(self.pos_word)):
            self.pos_word[i] = (float(self.pos_word[i]) +
                                1) / (pos_count + self.vector_size)
        for i in range(0, len(self.neg_word)):
            self.neg_word[i] = (float(self.neg_word[i]) +
                                1) / (neg_count + self.vector_size)
Example #3
    def __init__(self,
                 obs_space,
                 action_space,
                 model_dir,
                 device=None,
                 argmax=False,
                 num_envs=1,
                 use_memory=False,
                 use_text=False):
        obs_space, self.preprocess_obss = utils.get_obss_preprocessor(
            obs_space)
        self.acmodel = ACModel(obs_space,
                               action_space,
                               use_memory=use_memory,
                               use_text=use_text)
        self.device = device
        self.argmax = argmax
        self.num_envs = num_envs

        if self.acmodel.recurrent:
            self.memories = torch.zeros(self.num_envs,
                                        self.acmodel.memory_size)

        self.acmodel.load_state_dict(utils.get_model_state(model_dir))
        self.acmodel.to(self.device)
        self.acmodel.eval()
        if hasattr(self.preprocess_obss, "vocab"):
            self.preprocess_obss.vocab.load_vocab(utils.get_vocab(model_dir))
Example #4
def NN():

    # Read training data
    train_tweet_id2text, train_tweet_id2issue, train_tweet_id2author_label, train_tweet_id2label = ReadFile(
        'train.csv')
    '''
    Implement your Neural Network classifier here
    '''
    word2index = get_vocab(train_tweet_id2text)
    data_dict = combine_vec(word2index,
                            train_tweet_id2text,
                            train_tweet_id2author_label,
                            train_tweet_id2issue,
                            bow=False)

    data = []
    labels = []
    for k in data_dict:
        data.append(data_dict[k])
        labels.append(int(train_tweet_id2label[k]))
    data = np.array(data)
    labels = np.array(labels)
    n_class = len(set(labels))
    n_sample, n_feature = np.shape(data)
    print(np.shape(data))

    lrates = [0.2, 0.4, 0.9]
    all_loss = []
    for r in lrates:
        model = LogRegression(n_feature, n_class, lrate=r, verbose=True)
        train_loss = model.fit(data, labels, max_iter=500)
        print(len(train_loss))
        all_loss.append(train_loss)
    file_name = 'train_loss_nn.pdf'
    plot_lr(lrates, all_loss, file_name)
Example #5
def grid_search():
    embedding_sizes = [32, 64]
    learning_rates = [0.15, 0.10]
    window_sizes = [2, 3]

    word_to_ix, ix_to_word, subsampled_words = u.get_vocab(
        vocab_path="../resources/vocab.txt",
        antivocab_path="../resources/antivocab.txt")

    for e_size in embedding_sizes:
        for lr in learning_rates:
            for w_size in window_sizes:
                tf.reset_default_graph()

                train_basic_w2v(dataset="../resources/eurosense_sentences.txt",
                                word_to_ix=word_to_ix,
                                subsampled_words=subsampled_words,
                                model_path="../resources/models",
                                model_ID="basic_w2v_E%d_LR%.3f_W%d" %
                                (e_size, lr, w_size),
                                epochs=30,
                                batch_size=64,
                                embedding_size=e_size,
                                lr=lr,
                                window_size=w_size,
                                neg_samples=16,
                                csv_export=False)
Example #6
    def __init__(self, vocab_file):
        self.map = utils.get_vocab(vocab_file)
        self.inv_map = {v: k for k, v in self.map.items()}
        self.bos = self.map['<s>']
        self.eos = self.map['</s>']
        self.unk = self.map['<UNK>']
        self.pad = self.map['<PAD>']
Example #7
    def __init__(self, trainloader, valloader, config):
        """Initialize configurations."""
        # Data loader.
        self.trainloader = trainloader
        self.validloader = valloader

        # Directories.
        self.main_dir = config.main_dir
        self.model_name = config.model_name

        # Dataset.
        self.data_name = config.data_name

        # Model configurations.
        self.vocab = get_vocab(self.main_dir, self.data_name)
        self.D = config.D
        self.D_prime = config.D_prime
        self.d = config.d
        self.K = config.K
        self.rnn_num = config.rnn_num
        self.margin = config.margin
        self.pt_path = config.pt_path

        # Training configurations.
        self.mode = config.mode
        self.batch_size = config.batch_size
        self.img_size = config.img_size
        self.crop_size = config.crop_size
        self.lr = config.lr  # 0.001
        self.lr_decay = config.lr_decay  # 0.98
        self.init_ep = config.init_ep
        self.max_ep = config.max_ep

        # Miscellaneous.
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.log_step = config.log_step
        self.draw_step = config.draw_step
        self.acc_step = config.acc_step
        self.save_step = config.save_step
        self.use_visdom = config.use_visdom
        self.init_from = config.init_from
        self.best_r = np.array([0, 0, 0])
        self.best_ri = np.array([0, 0, 0])

        # Build model.
        self.build_model()

        if self.use_visdom:
            self.viz = Visdom()
            self.loss_plot = create_vis(self.viz, self.model_name, 'loss',
                                        self.max_ep, 5)
            self.acc_plot_i2t = create_vis(self.viz, self.model_name,
                                           'accuracy', self.max_ep, 100)
            self.acc_plot_t2i = create_vis(self.viz, self.model_name,
                                           'accuracy', self.max_ep, 100)
Example #8
    def __init__(self,
                 obs_space,
                 action_space,
                 model_dir,
                 seed,
                 n_columns,
                 device=None,
                 argmax=False,
                 num_envs=1,
                 use_memory=False,
                 use_text=False):
        obs_space, self.preprocess_obss = utils.get_obss_preprocessor(
            obs_space)
        #self.acmodel = PNNModel(obs_space, action_space, use_memory=use_memory, use_text=use_text)
        self.acmodel = PNNModel(obs_space,
                                action_space,
                                use_memory=use_memory,
                                use_text=use_text,
                                use_pnn=True,
                                base=None)

        self.device = device
        self.argmax = argmax
        self.num_envs = num_envs

        if self.acmodel.recurrent:
            self.memories = torch.zeros(self.num_envs,
                                        self.acmodel.memory_size)

        # Add a new column to the model
        for _ in range(n_columns):
            self.acmodel.base.new_task()
        # Load model parameters for PREVIOUS columns
        #for i in range(n_columns - 1):
        #utils.pnn_load_state_dict(self.acmodel, i, pnn_paths[i])
        # Freeze the weights of all previous columns
        #acmodel.base.freeze_columns(skip=[args.n_columns - 1])
        # Load the CURRENT column's model parameters to resume training
        #if "model_state" in status:
        # acmodel.base.columns[args.n_columns - 1].load_state_dict(status["model_state"])
        #status_path = utils.get_status_path(model_dir, args.seed)
        #utils.pnn_load_state_dict(acmodel, args.n_columns - 1, status_path)
        # Load model parameters for all columns
        status_path = utils.get_status_path(model_dir, seed)
        for i in range(n_columns - 1):
            utils.pnn_load_state_dict(self.acmodel, i, status_path)

        self.acmodel.to(self.device)
        self.acmodel.eval()
        if hasattr(self.preprocess_obss, "vocab"):
            self.preprocess_obss.vocab.load_vocab(
                utils.get_vocab(model_dir, seed))
Example #9
def LR():

    # Read training data
    train_tweet_id2text, train_tweet_id2issue, train_tweet_id2author_label, train_tweet_id2label = ReadFile('train.csv')

    '''
    Implement your Logistic Regression classifier here
    '''
    BOW = True
    GLOVE = False
    word2index = get_vocab(train_tweet_id2text)
    data_dict = combine_vec(word2index,
                            train_tweet_id2text,
                            train_tweet_id2author_label,
                            train_tweet_id2issue,
                            bow=True)

    data = []
    labels = []
    for k in data_dict:
        data.append(data_dict[k])
        labels.append(int(train_tweet_id2label[k]))
    n_class = len(set(labels))
    print('dataset shape:',np.shape(data))
    # print(np.shape(labels))
    
    # print(len(word2index))
    n_sample,n_feature = np.shape(data)

    model = LogRegression(n_feature,n_class,lrate=0.8,verbose=True)
    model.fit(data,labels,max_iter=500)
    # y_pred = [model.predict(x) for x in data]

    # Read test data
    test_tweet_id2text, test_tweet_id2issue, test_tweet_id2author_label, test_tweet_id2label = ReadFile('test.csv')
    
    # Predict test data by learned model

    '''
    Replace the following random predictor by your prediction function.
    '''
    test_data_dict = combine_vec(word2index,
                                 test_tweet_id2text,
                                 test_tweet_id2author_label,
                                 test_tweet_id2issue,
                                 bow=True)

    for tweet_id in test_tweet_id2text:
        # Get the text
        # text=test_tweet_id2text[tweet_id]
        
        # Predict the label
        test_x = test_data_dict[tweet_id]
        label = model.predict(test_x)

        # Store it in the dictionary
        test_tweet_id2label[tweet_id] = label

    # Save predicted labels in 'test_lr.csv'
    SaveFile(test_tweet_id2text, test_tweet_id2issue, test_tweet_id2author_label, test_tweet_id2label, 'test_lr.csv')
Example #10
    def _load_caps(self):
        with open(self.caps_path) as fd:
            #print "Loading the captions..."
            self.caption_dict = pkl.load(fd)
            #print "Done"

        self.vocab, self.mapping = utils.get_vocab(self.caption_dict,
                                                   remove_stop_words=False)
        #print "We have a vocabulary of size", len(self.vocab)

        if self.process_text:
            #print "processing the text..."
            self.process_captions()

        print "Done"
Example #11
def cv_NN(kfold):
    # Read training data
    train_tweet_id2text, train_tweet_id2issue, train_tweet_id2author_label, train_tweet_id2label = ReadFile(
        'train.csv')
    '''
    Implement your Neural Network classifier here
    '''
    word2index = get_vocab(train_tweet_id2text)
    data_dict = combine_vec(word2index,
                            train_tweet_id2text,
                            train_tweet_id2author_label,
                            train_tweet_id2issue,
                            bow=False)

    data = []
    labels = []
    for k in data_dict:
        data.append(data_dict[k])
        labels.append(int(train_tweet_id2label[k]))
    data = np.array(data)
    labels = np.array(labels)
    n_class = len(set(labels))
    n_sample, n_feature = np.shape(data)
    print('Cross validation for Neural network')
    n_sample, n_feature = np.shape(data)
    fold_size = int(np.ceil(n_sample / kfold))
    print('Fold size:', fold_size)
    accuracy = []
    for k in range(kfold):
        tstart = k * fold_size
        tend = min(n_sample, tstart + fold_size)
        training_x = np.array(
            [x for i, x in enumerate(data) if not (tstart <= i and i < tend)])
        test_x = np.array(
            [x for i, x in enumerate(data) if (tstart <= i and i < tend)])
        training_y = np.array([
            x for i, x in enumerate(labels) if not (tstart <= i and i < tend)
        ])
        test_y = np.array(
            [x for i, x in enumerate(labels) if (tstart <= i and i < tend)])
        model = NeuralNet(n_feature, n_class, lrate=0.9, verbose=False)
        model.fit(training_x, training_y, max_iter=500)
        accuracy.append(model.score(test_x, test_y))
        print('Fold', k, 'accuracy', accuracy[-1])
    print('Mean accuracy', np.mean(accuracy))
Example #12
def evaluate(target_words, top_k=10, synaware_w2v=True):
    """
    Provides a qualitative measure for the embeddings the model has learned by printing the most similar words to the
    ones provided as test words.
    :param target_words: Test words to discover the closest words to them, as List
    :param top_k: Number of closest words
    :param synaware_w2v: True: use a SynsetAwareWord2Vec model (default); False: use a basic Word2Vec model
    :return: None
    """

    print("Loading vocabularies...")
    word_to_ix, ix_to_word, subsampled_words = u.get_vocab(vocab_path="../resources/vocab.txt",
                                                           antivocab_path="../resources/antivocab.txt")

    print("Creating model...")
    if not synaware_w2v:
        model = Word2Vec(subsampled_words=subsampled_words,
                         vocabulary_size=len(word_to_ix),
                         embedding_size=EMBEDDING_SIZE,
                         learning_rate=LEARNING_RATE,
                         window_size=WINDOW_SIZE,
                         neg_samples=NEG_SAMPLES)
    else:
        model = SynsetAwareWord2Vec(subsampled_words=subsampled_words,
                                    vocabulary_size=len(word_to_ix),
                                    embedding_size=EMBEDDING_SIZE,
                                    learning_rate=LEARNING_RATE,
                                    window_size=WINDOW_SIZE,
                                    neg_samples=NEG_SAMPLES)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        print("Loading model...")
        saver.restore(sess, MODEL_PATH_SYN_W2V if synaware_w2v else MODEL_PATH_W2V)

        target_words = [word_to_ix[w] for w in target_words if w in word_to_ix]
        sim_val = sess.run(model.similarity,
                           feed_dict={model.data["sim_test"]: target_words})

        for i in range(len(target_words)):
            print("Closest %d words to %s:" % (top_k, ix_to_word[target_words[i]]))
            closest_words = (-sim_val[i, :]).argsort()[1:top_k + 1]
            for j in range(top_k):
                word = ix_to_word[closest_words[j]]
                print("\t%d. %s" % (j+1, word))
Example #13
    def __init__(self, config, transform, mode):
        self.main_dir = config.main_dir
        self.data_name = config.data_name
        self.mode = mode
        self.max_token_len = config.max_token_len
        self.split = self.get_split_type(config)
        self.data = self.get_data_list()
        self.vocab = get_vocab(self.main_dir, self.data_name)
        self.transform = transform
        ''' debugging '''
        self.coco_split = config.coco_split
        tmp = pth(
            self.main_dir, self.data_name, 'annotations',
            'captions_{}{}.json'.format(
                self.mode, '2014' if self.coco_split == 'rval' else '2017'))
        print('caption path: {}'.format(tmp))
        ''''''
        self.caps = self.get_coco()
Example #14
    def __init__(self,
                 obs_space,
                 action_space,
                 model_dir,
                 device=None,
                 argmax=False,
                 num_envs=1):
        obs_space, self.preprocess_obss = utils.get_obss_preprocessor(
            obs_space)
        self.model = QModel(obs_space, action_space)
        self.device = device
        self.argmax = argmax
        self.num_envs = num_envs

        self.model.load_state_dict(utils.get_model_state(model_dir))
        self.model.to(self.device)
        self.model.eval()
        if hasattr(self.preprocess_obss, "vocab"):
            self.preprocess_obss.vocab.load_vocab(utils.get_vocab(model_dir))
Example #15
def fasttext_train(
        trained_model_dir: OutputDirectory(type='AnyDirectory'),
        training_data_dir: InputDirectory(type='AnyDirectory') = None,
        validation_data_dir: InputDirectory(type='AnyDirectory') = None,
        char2index_dir: InputDirectory(type='AnyDirectory') = None,
        epochs=1,
        batch_size=64,
        learning_rate=0.0005,
        embedding_dim=128):
    print('============================================')
    print('training_data_dir:', training_data_dir)
    print('validation_data_dir:', validation_data_dir)
    c2i = get_vocab(char2index_dir)
    class_ = get_classs()
    max_len_ = 38
    n_class_ = len(class_)
    vocab_size_ = len(c2i)
    stop_patience = 5
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    path = os.path.join(training_data_dir, 'train.txt')
    train_samples = load_dataset(file_path=path, max_len=max_len_, char2index_dir=char2index_dir)
    path = os.path.join(validation_data_dir, 'dev.txt')
    dev_samples = load_dataset(file_path=path, max_len=max_len_, char2index_dir=char2index_dir)

    train_iter = DataIter(train_samples, batch_size)
    dev_iter = DataIter(dev_samples, batch_size)

    model = FastText(vocab_size=vocab_size_, n_class=n_class_, embed_dim=embedding_dim)
    start = time.time()
    train(model,
          trained_model_dir,
          train_iter,
          dev_iter=dev_iter,
          epochs=epochs,
          learning_rate=learning_rate,
          stop_patience=stop_patience,
          device=device)
    end = time.time()
    print('\nspent time: %.2f sec' % (end - start))
    print('============================================')
Example #16
def NN():

    # Read training data
    train_tweet_id2text, train_tweet_id2issue, train_tweet_id2author_label, train_tweet_id2label = ReadFile('train.csv')

    '''
    Implement your Neural Network classifier here
    '''
    word2index = get_vocab(train_tweet_id2text)
    data_dict = combine_vec(word2index,
                            train_tweet_id2text,
                            train_tweet_id2author_label,
                            train_tweet_id2issue,
                            bow=False)

    data = []
    labels = []
    for k in data_dict:
        data.append(data_dict[k])
        labels.append(int(train_tweet_id2label[k]))
    data = np.array(data)
    labels = np.array(labels)
    n_class = len(set(labels))
    n_sample,n_feature = np.shape(data)
    print(np.shape(data))

    model = NeuralNet(n_feature,n_class,lrate=0.9,verbose=True)
    model.fit(data,labels,max_iter=800)

    # Read test data
    test_tweet_id2text, test_tweet_id2issue, test_tweet_id2author_label, test_tweet_id2label = ReadFile('test.csv')
    '''
    Replace the following random predictor by your prediction function.
    '''
    test_data_dict = combine_vec(word2index,
                                 test_tweet_id2text,
                                 test_tweet_id2author_label,
                                 test_tweet_id2issue,
                                 bow=False)

    for tweet_id in test_tweet_id2text:
        # Predict the label
        test_x = test_data_dict[tweet_id]
        label = model.predict(test_x)

        # Store it in the dictionary
        test_tweet_id2label[tweet_id] = label

    # Save predicted labels in 'test_nn.csv'
    SaveFile(test_tweet_id2text, test_tweet_id2issue, test_tweet_id2author_label, test_tweet_id2label, 'test_nn.csv')
Example #17
def regular_choice(update, context):
	"""Given a choice user made store it in
	the user_data dictionary to use it later."""

	ind = int(update.callback_query.data) % 100
	vocab_name = screen_texts[ind].split('_')[-1]
	data = get_vocab(vocab_name)[:8]
	#context.user_data['choice'] = text
	buttons = [
		[InlineKeyboardButton(text=data[x], callback_data=str(150+x)) for x in range(2)],
		[InlineKeyboardButton(text=data[x+2], callback_data=str(152+x)) for x in range(2)],
		[InlineKeyboardButton(text=data[x+4], callback_data=str(154+x)) for x in range(2)],
		[InlineKeyboardButton(text=data[x+6], callback_data=str(156+x)) for x in range(2)]
	]
	keyboard = InlineKeyboardMarkup(buttons)
	text = 'Here are the most popular {} people look for'.format(vocab_name)
	update.callback_query.answer()
	update.callback_query.edit_message_text(text=text, reply_markup=keyboard)

	return d['typing_reply']
Example #18
def textrank_keywords(processed_sentences, window_size, top_num):
    """
    Inspired by pagerank, textrank considers each word as a node,
    give the weight to each edge by calculating word window pairs.
    And rank words by their score.
    :param processed_sentences: processed sentences, at least remove stopwords.
    :param window_size: the number of words following a word.
    :param top_num: the number of top words.
    :return: a list of Top top_num words (index 0) with their scores (index 1).
    """
    vocab = get_vocab(processed_sentences)
    token_pairs = get_token_pairs(window_size, processed_sentences)
    # Get normalized matrix
    g = get_matrix(vocab, token_pairs)
    # Initialization of the weights (PageRank values)
    pr = np.array([1] * len(vocab))
    d = 0.85  # damping coefficient, conventionally 0.85
    min_diff = 1e-5  # convergence threshold
    steps = 10
    node_weight = None  # stores each keyword and its weight
    # Iteration
    previous_pr = 0
    for epoch in range(steps):
        pr = (1 - d) + d * np.dot(g, pr)
        if abs(previous_pr - sum(pr)) < min_diff:
            break
        else:
            previous_pr = sum(pr)
    # Get weight for each node
    node_weight = dict()
    for word, index in vocab.items():
        node_weight[word] = pr[index]
    # Print Top Keywords
    node_weight = OrderedDict(
        sorted(node_weight.items(), key=lambda t: t[1], reverse=True))
    keywords = []
    for i, (key, value) in enumerate(node_weight.items()):
        keywords.append((key, value))
        if i > (top_num - 2):
            break
    return keywords
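A small usage sketch for the parameters documented above, assuming (as the helpers `get_vocab` and `get_token_pairs` suggest) that `processed_sentences` is a list of token lists with stopwords already removed; the sentences here are made up:

sample_sentences = [
    ['textrank', 'ranks', 'words', 'pagerank', 'ranks', 'pages'],
    ['word', 'windows', 'define', 'edges', 'word', 'nodes'],
]
print(textrank_keywords(sample_sentences, window_size=2, top_num=5))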
Example #19
def train(args):
    base_path = '../tweet_lm_data'
    train_file = os.path.join(base_path, 'train.txt')
    valid_file = os.path.join(base_path, 'valid.txt')
    vocab_file = os.path.join(base_path, 'tweet_vocab_thresh3.txt')

    # Grab vocabulary
    assert os.path.exists(vocab_file)
    vocab = utils.get_vocab(vocab_file)
    print('vocab length:', len(vocab))
    print('highest word id:', max(vocab.values()))
    print('lowest word id:', min(vocab.values()))
    model_name = ''

    nn_object = BidirectionalLM(args, vocab=vocab, train_file=train_file,
                                valid_file=valid_file, model_name='')
    print('model instantiated')
    nn_object.train()
    
    nn_object.model.save('bidi_lm_final.h5')
    print('Model Saved')
Example #20
def predict(epoch_idx, logger=None):
    """Load model in `models` and predict."""
    device = torch.device(
        "cuda" if torch.cuda.is_available() and USE_CUDA else "cpu")

    checkpoint_path = os.path.join(MODEL_DIR,
                                   "model_epoch_{}.ckpt".format(epoch_idx))
    model = torch.load(checkpoint_path, map_location="cpu")
    model.to(device)

    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    model.eval()

    vocab2idx = get_vocab(result_dir=RESULT_DIR, min_count=MIN_COUNT)
    X, _ = load_data(PREDICT_FILE,
                     max_len=MAX_LEN,
                     vocab2idx=vocab2idx,
                     do_lower_case=DO_LOWER_CASE,
                     text_col_name=TEXT_COL_NAME)
    X = torch.from_numpy(X)  # (N, L)
    dataset = TensorDataset(X)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE)
    y_pred = []
    print("Start predicting...")
    tic = time.time()
    for (batch_xs, ) in loader:
        batch_xs = batch_xs.to(device)  # (N, L)
        batch_out = model(batch_xs)  # (N, num_classes)
        batch_pred = batch_out.argmax(dim=-1)  # (N, )
        for i in batch_pred.cpu().numpy():
            y_pred.append(i)
    toc = time.time()
    logger.info("predict use time {}s".format(toc - tic))

    with open(os.path.join(RESULT_DIR, "predict.txt"), "w",
              encoding="utf-8") as fw:
        for i in y_pred:
            fw.write(str(CLASS_NAMES[i]) + "\n")
Example #21
    def __init__(self,
                 obs_space,
                 action_space,
                 model_dir,
                 model_name='AC',
                 device=None,
                 argmax=False,
                 num_envs=1,
                 use_memory=False,
                 use_text=False,
                 input_type="image",
                 feature_learn="curiosity"):
        obs_space, self.preprocess_obss = utils.get_obss_preprocessor(
            obs_space)
        if model_name == 'ac':
            self.acmodel = ACModel(obs_space,
                                   action_space,
                                   use_memory=use_memory,
                                   use_text=use_text)
        elif model_name == 'sr':
            self.acmodel = SRModel(obs_space,
                                   action_space,
                                   input_type=input_type,
                                   use_memory=use_memory,
                                   use_text=use_text,
                                   feature_learn=feature_learn)
        self.model_name = model_name
        self.device = device
        self.argmax = argmax
        self.num_envs = num_envs

        if self.acmodel.recurrent:
            self.memories = torch.zeros(self.num_envs,
                                        self.acmodel.memory_size)

        self.acmodel.load_state_dict(utils.get_model_state(model_dir))
        self.acmodel.to(self.device)
        self.acmodel.eval()
        if hasattr(self.preprocess_obss, "vocab"):
            self.preprocess_obss.vocab.load_vocab(utils.get_vocab(model_dir))
Example #22
def creat_train_table(seed=531):
    # Read the provided CSV file
    train_df = pd.read_csv('../input/dataset/train/annotations.csv')

    vocab = utils.get_vocab()
    with open('../input/vocab/rarity.json', 'r') as f:
        rarity = json.load(f)

    train_list = Parallel(n_jobs=-1)([
        delayed(process_train)(row, vocab, rarity)
        for index, row in tqdm(train_df.iterrows(), total=len(train_df))
    ])

    meta = pd.DataFrame(train_list).sort_values('ID')
    skf = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)

    for k, (train_index,
            val_index) in enumerate(skf.split(meta.index, meta.rarity)):
        meta.loc[val_index, 'valid'] = k
    meta = meta.set_index('ID')
    meta.to_csv('../input/tables/meta-train.csv')
    drop_columns = ['height', 'width', 'aspect', 'rarity']
    meta.drop(drop_columns, axis=1).to_csv('../input/tables/train.csv')
Example #23
def LR():

    # Read training data
    train_tweet_id2text, train_tweet_id2issue, train_tweet_id2author_label, train_tweet_id2label = ReadFile(
        'train.csv')
    '''
    Implement your Logistic Regression classifier here
    '''
    BOW = True
    GLOVE = False
    word2index = get_vocab(train_tweet_id2text)
    data_dict = combine_vec(word2index,
                            train_tweet_id2text,
                            train_tweet_id2author_label,
                            train_tweet_id2issue,
                            bow=True)

    data = []
    labels = []
    for k in data_dict:
        data.append(data_dict[k])
        labels.append(int(train_tweet_id2label[k]))
    n_class = len(set(labels))
    print(np.shape(data))
    # print(np.shape(labels))

    # print(len(word2index))
    n_sample, n_feature = np.shape(data)
    lrates = [0.2, 0.5, 0.8]
    all_loss = []
    for r in lrates:
        model = LogRegression(n_feature, n_class, lrate=r, verbose=True)
        train_loss = model.fit(data, labels, max_iter=200)
        print(len(train_loss))
        all_loss.append(train_loss)
    file_name = 'train_loss_lr.pdf'
    plot_lr(lrates, all_loss, file_name)
Example #24
def creat_character_table(seed=531):

    vocab = utils.get_vocab()
    with open('../input/vocab/rarity.json', 'r') as f:
        rarity = json.load(f)

    image_paths = glob.glob(
        os.path.join('../input/dataset/train_kana/U+*/*.jpg'))
    char_list = Parallel(n_jobs=-1)([
        delayed(process_char)(path, vocab, rarity)
        for path in tqdm(image_paths, total=len(image_paths))
    ])

    meta = pd.DataFrame(char_list).sort_values('target')
    skf = StratifiedKFold(n_splits=5, random_state=seed,
                          shuffle=True)  # a different split strategy might also be worth trying
    for k, (train_index,
            val_index) in enumerate(skf.split(meta.index, meta.rarity)):
        meta.loc[val_index, 'valid'] = k

    meta = meta.set_index('file')
    meta.to_csv('../input/tables/meta-character.csv')
    drop_columns = ['height', 'width', 'aspect', 'rarity']
    meta.drop(drop_columns, axis=1).to_csv('../input/tables/character.csv')
Example #25
                                subsampled_words=subsampled_words,
                                model_path="../resources/models",
                                model_ID="basic_w2v_E%d_LR%.3f_W%d" %
                                (e_size, lr, w_size),
                                epochs=30,
                                batch_size=64,
                                embedding_size=e_size,
                                lr=lr,
                                window_size=w_size,
                                neg_samples=16,
                                csv_export=False)


if __name__ == "__main__":
    word_to_ix, ix_to_word, subsampled_words = u.get_vocab(
        vocab_path="../resources/vocab.txt",
        antivocab_path="../resources/antivocab.txt")

    syn_to_ix = u.get_synset_vocab(word_to_ix)

    #grid_search()

    tf.reset_default_graph()
    #train_basic_w2v(dataset="../resources/eurosense_sentences.txt",
    #                word_to_ix=word_to_ix,
    #                subsampled_words=subsampled_words,
    #                model_path="../resources/models",
    #                model_ID="basic_w2v",
    #                epochs=30,
    #                batch_size=64,
    #                embedding_size=64,
Example #26
def test_get_vocab(X, n_words, expected):
    result = utils.get_vocab(X, n_words=n_words)
    assert result == expected
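The values of `X`, `n_words`, and `expected` are not shown in this snippet; in a pytest suite they would normally come from a `pytest.mark.parametrize` decorator. A hypothetical parametrization is sketched below (the test body is repeated only to show where the decorator attaches); the expected outputs assume `get_vocab` returns a sorted vocabulary containing an `$UNK` entry, as described in the notebook excerpt in Example #29, and may differ from the real test cases:

import pytest

@pytest.mark.parametrize("X, n_words, expected", [
    # Hypothetical cases: a tiny corpus, without and with a vocabulary-size cap.
    ([["a", "b", "a"], ["b", "c"]], None, ["$UNK", "a", "b", "c"]),
    ([["a", "b", "a"], ["b", "c"]], 2, ["$UNK", "a", "b"]),
])
def test_get_vocab(X, n_words, expected):
    result = utils.get_vocab(X, n_words=n_words)
    assert result == expected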
Example #27
        neg.append(dd)
    elif dd['class'] == 1:
        pos.append(dd)
    else:
        raise Exception('no class!')
shuffle(pos)
shuffle(neg)

num_pos_to_sample = int(NUMTRAIN * prop_value)
num_neg_to_sample = NUMTRAIN - num_pos_to_sample
selected_docs = pos[0:num_pos_to_sample] + neg[0:num_neg_to_sample]
assert len(selected_docs) == NUMTRAIN

#(2) get the vocab for that training proportions
traindicts = []
trainY = []
class2wordcount = defaultdict(list)
for dd in selected_docs:
    traindicts.append(dd['counts'].copy())
    cc = dd['class']
    trainY.append(cc)
    class2wordcount[cc].append(sum(dd['counts'].values()))
trainY = np.array(trainY)

trainX, word2num = utils.get_vocab(traindicts)

#save these
scipy.sparse.save_npz(PATH + 'trainX', trainX)
np.save(PATH + 'trainY', trainY)
w1 = open(PATH + 'word2num.json', 'w')
json.dump(word2num, w1)
Example #28
    coherence /= len(top_n)

    print('| NPMI score: {:f}'.format(coherence))


def get_top_words(beta, vocab_bow, top_k):
    topic_words = []
    for k, beta_k in enumerate(beta):
        words = [vocab_bow[idx] for idx in np.argsort(beta_k)[:-top_k-1:-1]]
        topic_words.append(words)
        print('Topic {}: {}'.format(k+1, ' '.join(words)))
    return topic_words


if __name__ == '__main__':
    vocab = utils.get_vocab('./data/StackOverflow/StackOverflow.vocab')
    # vocab = utils.get_vocab('./data/Snippets/Snippets.vocab')

    # with codecs.open('./StackOverflow/nvctm_train_theta', 'rb') as fp:
    with codecs.open('./Snippets/cr_nvctm_train_theta', 'rb') as fp:
        theta = pickle.load(fp)
    fp.close()

    # with codecs.open('./StackOverflow/nvctm_train_beta', 'rb') as fp:
    with codecs.open('./Snippets/cr_nvctm_train_beta', 'rb') as fp:
        beta = pickle.load(fp)
    fp.close()

    tw = get_top_words(beta, vocab, 15)

    test_mat = data_set('./data/StackOverflow/train.feat', 22956)
Example #29
#
# The first delicate issue we need to address is the vocabulary for our model:
#
# * As indicated in the figure above, the first thing we do when processing an example is look up the words in an embedding (a VSM), which has to have a fixed dimensionality.
#
# * We can use our training data to specify the vocabulary for this embedding; at prediction time, though, we will inevitably encounter words we haven't seen before.
#
# * The convention we adopt here is to map them to an `$UNK` token that is in our pre-specified vocabulary.
#
# * At the same time, we might want to collapse infrequent tokens into `$UNK` to make optimization easier.
#
# In `utils`, the function `get_vocab` implements these strategies. Now we can extract the training vocab and use it for the model embedding, secure in the knowledge that we will be able to process tokens outside of this set (by mapping them to `$UNK`).
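#
# As a rough, illustrative sketch (not the actual implementation in `utils`), a vocabulary builder following these strategies might look like the code below; the `$UNK` token and the optional `n_words` cap mirror the conventions just described:

from collections import Counter

def build_vocab_sketch(X, n_words=None, unk="$UNK"):
    """Count the tokens in the training sequences X, optionally keep only the
    n_words most frequent, and reserve an unknown-word token."""
    counts = Counter(w for seq in X for w in seq)
    most_common = counts.most_common(n_words)  # n_words=None keeps every token
    vocab = {w for w, _ in most_common}
    vocab.add(unk)  # unseen (or collapsed) tokens map to this entry at lookup time
    return sorted(vocab)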

# In[20]:

sst_full_train_vocab = utils.get_vocab(X_rnn_train)

# In[21]:

print("sst_full_train_vocab has {:,} items".format(len(sst_full_train_vocab)))

# This frankly seems too big relative to our dataset size. Let's restrict to just 10000 words:

# In[22]:

sst_train_vocab = utils.get_vocab(X_rnn_train, n_words=10000)

# ### Pure NumPy RNN implementation
#
# The first implementation we'll look at is a pure NumPy implementation of exactly the model depicted above. This implementation is a bit slow and might not be all that effective, but it is useful to have available in case one really wants to inspect the details of how these models process examples.
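#
# As a point of reference before that, here is a minimal, self-contained sketch of the forward computation such a classifier performs (illustrative only; the actual implementation also handles parameter initialization, backpropagation, and training):

import numpy as np

def rnn_forward_sketch(seq, embedding, W_xh, W_hh, W_hy, b_h, b_y):
    """seq: list of word indices; embedding: (vocab_size, embed_dim) matrix;
    W_xh: (embed_dim, hidden_dim); W_hh: (hidden_dim, hidden_dim);
    W_hy: (hidden_dim, n_classes)."""
    h = np.zeros(W_hh.shape[0])
    for idx in seq:
        x = embedding[idx]                       # look up the word vector
        h = np.tanh(x @ W_xh + h @ W_hh + b_h)   # update the hidden state
    scores = h @ W_hy + b_y                      # classify from the final hidden state
    exp_scores = np.exp(scores - scores.max())
    return exp_scores / exp_scores.sum()         # softmax over the classes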
Example #30
    def Predict(self, treebanks, datasplit, options):
        char_map = {}
        if options.char_map_file:
            char_map_fh = codecs.open(options.char_map_file,encoding='utf-8')
            char_map = json.loads(char_map_fh.read())
        # should probably use a namedtuple in get_vocab to make this prettier
        _, test_words, test_chars, _, _, _, test_treebanks, test_langs = utils.get_vocab(treebanks,datasplit,char_map)

        # get external embeddings for the set of words and chars in the
        # test vocab but not in the training vocab
        test_embeddings = defaultdict(lambda: {})
        if options.word_emb_size > 0 and options.ext_word_emb_file:
            new_test_words = \
                    set(test_words) - self.feature_extractor.words.viewkeys()

            print "Number of OOV word types at test time: %i (out of %i)" % (
                len(new_test_words), len(test_words))

            if len(new_test_words) > 0:
                # no point loading embeddings if there are no words to look for
                for lang in test_langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_word_emb_file,
                        lang=lang,
                        words=new_test_words
                    )
                    test_embeddings["words"].update(embeddings)
                    if len(test_langs) > 1 and test_embeddings["words"]:
                        print "External embeddings found for %i words "\
                                "(out of %i)" % \
                                (len(test_embeddings["words"]), len(new_test_words))

        if options.char_emb_size > 0:
            new_test_chars = \
                    set(test_chars) - self.feature_extractor.chars.viewkeys()
            print "Number of OOV char types at test time: %i (out of %i)" % (
                len(new_test_chars), len(test_chars))

            if len(new_test_chars) > 0:
                for lang in test_langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_char_emb_file,
                        lang=lang,
                        words=new_test_chars,
                        chars=True
                    )
                    test_embeddings["chars"].update(embeddings)
                    if len(test_langs) > 1 and test_embeddings["chars"]:
                        print "External embeddings found for %i chars "\
                                "(out of %i)" % \
                                (len(test_embeddings["chars"]), len(new_test_chars))

        data = utils.read_conll_dir(treebanks,datasplit,char_map=char_map)
        for iSentence, osentence in enumerate(data,1):
            sentence = deepcopy(osentence)
            self.feature_extractor.Init(options)
            conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
            self.feature_extractor.getWordEmbeddings(conll_sentence, False, options, test_embeddings)

            scores, exprs = self.__evaluate(conll_sentence, True)
            if self.proj:
                heads = decoder.parse_proj(scores)
                #LATTICE solution to multiple roots
                # see https://github.com/jujbob/multilingual-bist-parser/blob/master/bist-parser/bmstparser/src/mstlstm.py
                ## ADD for handling multi-roots problem
                rootHead = [head for head in heads if head==0]
                if len(rootHead) != 1:
                    print "it has multi-root, changing it for heading first root for other roots"
                    rootHead = [seq for seq, head in enumerate(heads) if head == 0]
                    for seq in rootHead[1:]:heads[seq] = rootHead[0]
                ## finish to multi-roots

            else:
                heads = chuliu_edmonds_one_root(scores.T)

            for entry, head in zip(conll_sentence, heads):
                entry.pred_parent_id = head
                entry.pred_relation = '_'

            if self.labelsFlag:
                for modifier, head in enumerate(heads[1:]):
                    scores, exprs = self.__evaluateLabel(conll_sentence, head, modifier+1)
                    conll_sentence[modifier+1].pred_relation = self.feature_extractor.irels[max(enumerate(scores), key=itemgetter(1))[0]]

            dy.renew_cg()

            #keep in memory the information we need, not all the vectors
            oconll_sentence = [entry for entry in osentence if isinstance(entry, utils.ConllEntry)]
            for tok_o, tok in zip(oconll_sentence, conll_sentence):
                tok_o.pred_relation = tok.pred_relation
                tok_o.pred_parent_id = tok.pred_parent_id
            yield osentence