def nbow():
    ''' NBOW baseline '''
    WV_CORPUS = "origin"

    embeddings, word_indices = get_embeddings(
        corpus=WV_CORPUS, dim=embedding_dim)

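    # Load the SemEval Task A training set (ekphrasis-preprocessed) and the gold test set.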
    train_set = SemEvalDataLoader(verbose=False, ekphrasis=True).get_data(task="A",
                                                                          years=None,
                                                                          datasets=None,
                                                                          only_semEval=True)
    test_data = SemEvalDataLoader(
        verbose=False, ekphrasis=True).get_gold(task="A")
    X = [obs[1] for obs in train_set]
    y = [label2id[obs[0]] for obs in train_set]

    X_test = [obs[1] for obs in test_data]
    y_test = [label2id[obs[0]] for obs in test_data]

    task = 'clf'
    print("-----------------------------")
    if task == 'clf':
        print('LogisticRegression')
    else:
        print("LinearSVC")

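    # Bag-of-words baseline: fit on the training tweets, evaluate on the gold test set.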
    bow = bow_model(task)
    bow.fit(X, y)
    predict = bow.predict(X_test)
    results = eval_clf(predict, y_test)
    for res, val in results.items():
        print("{}: {:.3f}".format(res, val))
    load_result_f1(predict, y_test)

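    # Neural bag-of-words model built on the pre-trained word embeddings, evaluated the same way.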
    nbow = nbow_model(task, embeddings, word_indices)
    nbow.fit(X, y)
    predict = nbow.predict(X_test)
    results = eval_clf(predict, y_test)
    for res, val in results.items():
        print("{}: {:.3f}".format(res, val))
    load_result_f1(predict, y_test)
    print("-----------------------------")
Example 2
def main():
  with open(CONFIG) as reader:
    config = yaml.safe_load(reader)
  gamefiles = glob(join(config['main']['games_path'], '*.ulx'))
  print('Found {} games.'.format(len(gamefiles)))
  # pprint(gamefiles)
  # Pick a game.
  gamefile = gamefiles[1]

  requested_infos = EnvInfos(
      admissible_commands=True,
      command_templates=True,
      description=True,
      entities=True,
      has_lost=True,
      has_won=True,
      inventory=True,
      max_score=True,
      objective=True,
      verbs=True,
      extras=[
          "recipe",
      ],
  )
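  # Register the chosen game with the requested infos and wrap it in a batched gym environment.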
  env_id = textworld.gym.register_games([gamefile], requested_infos)
  env_id = textworld.gym.make_batch(
      env_id,
      batch_size=config['main']['environment_batch_size'],
      parallel=True)
  env = gym.make(env_id)

  agent = CustomizableAgent(config, *get_embeddings(config['main']))

  play(env, agent, config['main'])
  play(env, agent, config['main'], evaluation=True)

  agent.cleanup()
  return
Example 3
    util.print_flag('Loading')
    util.print_flag('Dataset', big=False)
    corpus: ColumnCorpus = ColumnCorpus(data_folder='resources/data/',
                                        train_file='concat_PharmaCoNER.conll',
                                        dev_file=None,
                                        test_file='test_PharmaCoNER.conll',
                                        column_format={
                                            0: 'text',
                                            1: 'begin',
                                            2: 'end',
                                            3: 'ner'
                                        })

    util.print_flag('Embeddings', big=False)
    pooling_op = 'min'
    embeddings: StackedEmbeddings = util.get_embeddings(pooling_op)

    util.print_flag('Training')
    tag_type = 'ner'
    model = f'PharmaCoNER-PCE_{pooling_op}-BPEmb-FT-w2v'
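    # Flair SequenceTagger over the stacked embeddings, predicting the 'ner' tags.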
    tagger: SequenceTagger = SequenceTagger(
        embeddings=embeddings,
        tag_dictionary=corpus.make_tag_dictionary(tag_type=tag_type),
        tag_type=tag_type,
        hidden_size=256,
        rnn_layers=1,
        dropout=0.0)
    print(tagger)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(f'resources/models/{model}',
Example 4
def main(args):
    # Load word embedding matrix and char embedding matrix
    word_emb_path = os.path.join(args.data_dir, args.word_emb_file)
    word_emb_matrix, word2id = util.get_embeddings(word_emb_path,
                                                   args.word_emb_size, 97572,
                                                   'word')
    print('Got {} word embeddings'.format(len(word2id)))

    char_emb_path = os.path.join(args.data_dir, args.char_emb_file)
    char_emb_matrix, char2id = util.get_embeddings(char_emb_path,
                                                   args.char_emb_size, 94,
                                                   'char')
    print('Got {} char embeddings'.format(len(char2id)))

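    # Integer ids for the BIO tags on the source sequence.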
    bio2id = {'B': B_IN_BIO, 'I': I_IN_BIO, 'O': O_IN_BIO}

    for phase in ('train', 'dev'):
        # Read lines from downloaded files
        src_list, bio_list, tgt_list = load_data(args.data_dir, phase=phase)
        print('Read {} lines for the {} set'.format(len(src_list), phase))
        assert len(src_list) == len(bio_list) and len(bio_list) == len(tgt_list),\
            'src({}), bio({}), tgt({})'.format(len(src_list), len(bio_list), len(tgt_list))

        # Set up for mapping examples to word/char IDs
        n = len(src_list)
        max_c_len = args.max_c_len if phase == 'train' else args.max_c_len_test
        max_q_len = args.max_q_len if phase == 'train' else args.max_q_len_test
        max_w_len = args.max_w_len

        # Create empty arrays of padding
        src_ids = np.full((n, max_c_len), PAD_ID, dtype=np.int32)
        src_c_ids = np.full((n, max_c_len, max_w_len), PAD_ID, dtype=np.int32)
        bio_ids = np.full((n, max_c_len), O_IN_BIO, dtype=np.int32)
        tgt_ids = np.full((n, max_q_len), PAD_ID, dtype=np.int32)
        tgt_c_ids = np.full((n, max_q_len, max_w_len), PAD_ID, dtype=np.int32)

        # Fill arrays with IDs
        for i, (src, bio,
                tgt) in tqdm(enumerate(zip(src_list, bio_list, tgt_list)),
                             total=n):
            src_words = src.split()[:max_c_len]
            src_ids[i, :len(src_words)] = [
                word2id.get(w, UNK_ID) for w in src_words
            ]
            src_chars = [[c for c in s] for s in src_words]
            for j, chars in enumerate(src_chars):
                chars = chars[:max_w_len]
                src_c_ids[i, j, :len(chars)] = [
                    char2id.get(c, UNK_ID) for c in chars
                ]

            bio_words = bio.split()[:max_c_len]
            bio_ids[i, :len(bio_words)] = [bio2id[w] for w in bio_words]

            tgt_words = tgt.split()[:max_q_len]
            tgt_ids[i, :len(tgt_words)] = [
                word2id.get(w, UNK_ID) for w in tgt_words
            ]
            tgt_chars = [[c for c in s] for s in tgt_words]
            for j, chars in enumerate(tgt_chars):
                chars = chars[:max_w_len]
                tgt_c_ids[i, j, :len(chars)] = [
                    char2id.get(c, UNK_ID) for c in chars
                ]

        # Save arrays filled with IDs
        with h5py.File(os.path.join(args.data_dir, 'data.hdf5'),
                       'a') as hdf5_fh:
            phase_group = hdf5_fh.create_group(phase)
            phase_group.create_dataset('src_ids', data=src_ids, chunks=True)
            phase_group.create_dataset('src_c_ids', data=src_c_ids, chunks=True)
            phase_group.create_dataset('bio_ids', data=bio_ids, chunks=True)
            phase_group.create_dataset('tgt_ids', data=tgt_ids, chunks=True)

        # Save embedding matrices
        word_emb_path = os.path.join(args.data_dir, 'word_embs.npy')
        np.save(word_emb_path, word_emb_matrix)

        char_emb_path = os.path.join(args.data_dir, 'char_embs.npy')
        np.save(char_emb_path, char_emb_matrix)
Example 5
def reply_to_message(**payload):
    data = payload['data']

    if 'user' not in data or ut.is_bot(data['user'], BOT_ID):
        return

    webclient = payload['web_client']
    user = data['user']
    channel = data['channel']
    message = data['text']

    if ut.is_public(channel) and not ut.is_bot_tagged(message, BOT_ID_REGEX):
        return
    with open(str(pd.datetime.now().date()) + '.txt', 'a') as logFile:
        logFile.write(
            str(pd.datetime.now()) + ' \t user : ' + str(user) + '  Text: ' +
            str(message) + '\n')

    message = ut.get_clean_message(message, BOT_ID_REGEX)

    #    if ut.is_single_word(message):
    #        if message in single_word_set:
    #            reply = ' '.join(single_word_set[message])
    #            ut.send_reply(user, webclient, channel, reply)
    #            returnx
    if len(message) < 2:
        ut.send_reply(user, webclient, channel,
                      " Hi ! Please ask Me Detailed Questions ")
        return

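    # Embed the cleaned message and retrieve the closest stored questions and their answers.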
    message_embedding = ut.get_embeddings(model, [message])

    top_index, top_scores = ut.get_top_replies(embedding_matrix,
                                               message_embedding)
    top_replies = [answers[ind] for ind in top_index]
    top_q = [questions[ind] for ind in top_index]

    if top_replies[0] == '-1':
        ut.send_reply(user, webclient, channel, SYLLABUS)
        return

    if top_scores[0] < 0.5:
        with open('Unanswered.csv', 'a') as f:
            f.write(str(message) + ',\n')
        ut.send_reply(
            user, webclient, channel,
            'Sorry, I didn\'t get that!\nPlease elaborate your question')

    elif top_scores[0] > 0.5 and top_scores[0] < 0.6:
        with open('Unanswered.csv', 'a') as f:
            f.write(str(message) + ',\n')
        if len(top_q[0]) < 9:
            ut.send_reply(user, webclient, channel, top_replies[0])
        else:
            reply = '\n*I Found These Matching Queries, Please see if it answers your question, else try elaborating your Question.*  \n'
            for index in range(len(top_q)):
                reply = reply + '\n' + str(index + 1) + '.  ' + top_q[
                    index] + ' \n  ' + top_replies[index] + ' \n'
            reply = reply + '\n' + '*If your Query is still unanswered  please reach out to your college faculty*'
            ut.send_reply(user, webclient, channel, reply)

    else:
        ut.send_reply(user, webclient, channel, top_replies[0])
Example 6
# Import the dataset
dataset = pd.read_csv('./QA.csv', header=None, encoding="utf-8")

# Split dataset into questions and answers
questions = dataset.iloc[:, 0].values
answers = dataset.iloc[:, 1].values

# Open Unanswered questions CSV for logging
# UQ = open('Unanswered.csv', 'w')
print('############ trying to load model ############')
# Load Model
model = tf.saved_model.load('use/', tags=None)
print('############ model is now loaded ############')

# Get Embedding Matrix
embedding_matrix = ut.get_embeddings(model, questions)

# Setup Slack Client API
rtm_client = RTMClient(token=SLACK_TOKEN, connect_method='rtm.start')

print('############ Starting RTM Client ############')

#rtm_client.start()

print('## Started ###')
# Start the client if it didn't start implicitly.
try:
    print('############ Inside Try ############')
    rtm_client.start()
    print('############ Exiting Try with No Exceptions ############')
except:
    pass
Example 7
file_list = []
allowed_extensions = ['.jpg']

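# Collect every file with an allowed image extension from the input directory.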
for file in os.listdir(args['image']):
    valid_file = False
    for extension in allowed_extensions:
        if file.endswith(extension):
            valid_file = True
            break
    if valid_file:
        file_list.append(args['image'] + '/' + file)
print("detecting faces")
images, img_with_faces = util.load_and_align_data(file_list)
print("getting embeddings")
embeddings = util.get_embeddings(args['model'], images)
print('finding distinct groups')
faces = util.get_face_labels(embeddings,
                             max_clusters=int(args['max_clusters']),
                             opt_cluster_threshold=int(args['threshold']))
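# Move each face image into a per-person folder based on its cluster label.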
if args['output'] is not None:
    i = 0
    for img in img_with_faces:
        person_name = 'person' + str(faces[i])
        cls_folder = args['output'] + '/' + person_name
        if not os.path.isdir(cls_folder):
            os.makedirs(cls_folder)
        new_path = cls_folder + '/' + str(i) + '.jpg'
        shutil.move(img, new_path)
        i += 1
Example 8
    def __init__(self, dataset, train, trans, device, params=None):
        self.dataset = dataset
        self.train = train
        self.basic = trans.basic
        self.augment = trans.augmentation
        self.params = params
        self.concatenated = isinstance(dataset, torch.utils.data.ConcatDataset)
        if self.concatenated:
            if params["dset"] == "CIFAR10":
                self.data = torch.cat([
                    torch.from_numpy(self.dataset.datasets[i].data)
                    for i in range(len(self.dataset.datasets))
                ],
                                      dim=0)
                self.targets = torch.cat([
                    torch.tensor(self.dataset.datasets[i].targets)
                    for i in range(len(self.dataset.datasets))
                ],
                                         dim=0)
            elif params["dset"] == "MNIST" or params["dset"] == "FASHIONMNIST":
                self.data = torch.cat([
                    self.dataset.datasets[i].data
                    for i in range(len(self.dataset.datasets))
                ],
                                      dim=0)
                self.targets = torch.cat([
                    self.dataset.datasets[i].targets
                    for i in range(len(self.dataset.datasets))
                ],
                                         dim=0)
        else:
            self.targets = self.dataset.targets
            self.data = self.dataset.data

        if self.train:
            # Creates three lists of indices that will be called later.
            # The first index will be a specific image, the second will be the same image (augmented)
            # and the third will be a random image (most likely different if dataset is balanced)
            self.original_indices = np.arange(self.data.shape[0])

            regular_dataloader = torch.utils.data.DataLoader(
                SingleDataset(dataset,
                              augment=False,
                              trans=trans,
                              params=params),
                batch_size=params["batch_size"],
                shuffle=False)  # Do not shuffle

            print("Running data through previous network...")
            regular_embeddings, augmented_embeddings = get_embeddings(
                regular_dataloader, device, params)
            random_indices = np.copy(self.original_indices)
            np.random.shuffle(random_indices)
            augmented_distances, random_distances = [], []

            # if params["show_plots"]:
            #    # plt.title("Augmented and Random Distances (boundary is {:.3f})".format(boundary))
            #    # plt.axvline(x=rand_min, color='r')
            #    # plt.axvline(x=rand_max, color='r')
            #    plt.hist(random_distances.cpu(), bins=200, label="Random")
            #    # plt.hist(combined.cpu(), bins=bins, label="Combined")
            #    # plt.hist(augmented_distances.cpu(), bins=bins, label="Augmented")
            #    plt.legend()
            #    plt.show()

            print("RTM index is {0} (number of pairs is {1})".format(
                params["rtm_index"], params["num_pairs"]))

            if params["rtm_index"] is None:  # Do not use RTM
                # if params["curr_epoch"] == 0:
                indices_mask = np.arange(len(self.original_indices))
                self.original_indices = self.original_indices[indices_mask]
                self.similar_indices = np.copy(self.original_indices)
                self.different_indices = random_indices[indices_mask]
            else:
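                # Sample num_pairs random candidates per anchor, compute
                # embedding distances, and keep the rtm_index-th nearest
                # candidate as the "different" example.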
                random_distances, random_indices = [], []
                indices_matrix = np.random.choice(
                    self.original_indices,
                    (params["num_pairs"], len(self.original_indices)))
                for i in range(params["num_pairs"]):
                    shuffled_indices = np.copy(self.original_indices)
                    np.random.shuffle(shuffled_indices)
                    # distances.append(F.cosine_similarity(embeddings, embeddings[different_indices], 1))
                    random_distances.append(
                        torch.norm(regular_embeddings[self.original_indices] -
                                   regular_embeddings[indices_matrix[i, :]],
                                   p=2,
                                   dim=1))
                    random_indices.append(shuffled_indices)

                # random_indices = np.vstack(random_indices)
                random_distances = torch.stack(random_distances).cpu()
                different_selection = np.argpartition(
                    random_distances,
                    params["rtm_index"], axis=0)[params["rtm_index"], :].numpy(
                    )  # Gets the rtm_index-th nearest neighbor
                different_indices = indices_matrix[
                    different_selection,
                    np.arange(self.original_indices.shape[0])]
                self.different_indices = different_indices
                self.similar_indices = np.copy(self.original_indices)

            print("Dataset is size:", len(self.original_indices))

        else:
            # Creates three lists of indices that will be called later.
            # The first index will be a specific label, the second will be the same label,
            # and the third will be a different label
            self.labels_set = set(self.targets.numpy())
            x_original_indices, x_similar_indices, x_different_indices = [], [], []
            for label in self.labels_set:
                original_indices = np.arange(len(
                    self.targets))[np.where(self.targets == label)[0]]
                similar_indices = np.copy(original_indices)
                np.random.shuffle(similar_indices)
                different_indices = np.random.choice(
                    np.arange(len(
                        self.targets))[np.where(self.targets != label)[0]],
                    len(original_indices))
                x_original_indices.append(torch.from_numpy(original_indices))
                x_similar_indices.append(torch.from_numpy(similar_indices))
                x_different_indices.append(torch.from_numpy(different_indices))
            self.original_indices = torch.cat(x_original_indices, dim=0)
            self.similar_indices = torch.cat(x_similar_indices, dim=0)
            self.different_indices = torch.cat(x_different_indices, dim=0)
Example 9
def train():
    # Load data
    en, it = get_embeddings()  # Vocab x Embedding_dimension

    # Create data-loaders
    g_data_loader = torch.utils.data.DataLoader(CustomDataSet(en),
                                                batch_size=mini_batch_size,
                                                shuffle=True)
    d_data_loader = torch.utils.data.DataLoader(CustomDataSet(it),
                                                batch_size=mini_batch_size,
                                                shuffle=True)

    # Create models
    g = Generator(input_size=g_input_size,
                  hidden_size=g_hidden_size,
                  output_size=g_output_size)
    d = Discriminator(input_size=d_input_size,
                      hidden_size=d_hidden_size,
                      output_size=d_output_size)

    # Define loss function and optimizers
    loss_fn = torch.nn.BCELoss()
    d_optimizer = optim.Adam(d.parameters(),
                             lr=d_learning_rate,
                             betas=optim_betas)
    g_optimizer = optim.Adam(g.parameters(),
                             lr=g_learning_rate,
                             betas=optim_betas)

    if torch.cuda.is_available():
        # Move the network and the optimizer to the GPU
        g = g.cuda()
        d = d.cuda()
        loss_fn = loss_fn.cuda()

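    # Alternate each epoch: update the discriminator on real/fake batches, then update the generator.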
    for epoch in range(num_epochs):
        d_losses = []
        g_losses = []
        start_time = timer()
        g_iter = iter(g_data_loader)
        mini_batch = 1
        for d_real_data in d_data_loader:
            # Inspired from https://github.com/devnag/pytorch-generative-adversarial-networks/blob/master/gan_pytorch.py
            for d_index in range(d_steps):
                # 1. Train D on real+fake
                d.zero_grad()  # Reset the gradients

                #  1A: Train D on real
                # (could add some noise to the real data later)
                d_real_data = to_variable(d_real_data)
                d_real_decision = d(d_real_data)
                d_real_error = loss_fn(
                    d_real_decision,
                    to_variable(torch.ones(mini_batch_size, 1)))  # ones = true
                # Compute/store gradients, but don't change params yet.
                d_real_error.backward()
                d_losses.append(d_real_error.data.cpu().numpy())

                #  1B: Train D on fake
                d_gen_input = to_variable(next(g_iter))
                # Detach to avoid training G on these labels.
                d_fake_data = g(d_gen_input).detach()
                d_fake_decision = d(d_fake_data)  # Add noise later
                d_fake_error = loss_fn(
                    d_fake_decision,
                    to_variable(torch.zeros(mini_batch_size, 1)))  # zeros = fake
                d_fake_error.backward()
                d_losses.append(d_fake_error.data.cpu().numpy())
                # Only optimizes D's parameters, using the gradients stored by
                # backward().
                d_optimizer.step()
                sys.stdout.write("[%d/%d] :: Discriminator Loss: %f \r" %
                                 (mini_batch, len(en) // mini_batch_size,
                                  np.asscalar(np.mean(d_losses))))
                sys.stdout.flush()
                mini_batch += 1

        mini_batch = 1
        for gen_input in g_data_loader:
            for g_index in range(g_steps):
                # 2. Train G on D's response (but DO NOT train D on these labels)
                g.zero_grad()

                gen_input = to_variable(gen_input)
                g_fake_data = g(gen_input)
                g_fake_decision = d(g_fake_data)  # Add noise later
                # We want to fool D, so pretend the fake data is all genuine.
                g_error = loss_fn(
                    g_fake_decision,
                    to_variable(torch.ones(mini_batch_size, 1)))
                g_losses.append(g_error.data.cpu().numpy())
                g_error.backward()
                g_optimizer.step()  # Only optimizes G's parameters

                sys.stdout.write("[%d/%d] :: Generator Loss: %f \r" %
                                 (mini_batch, len(en) // mini_batch_size,
                                  np.asscalar(np.mean(g_losses))))
                sys.stdout.flush()
                mini_batch += 1

        print(
            "Epoch {} : Discriminator Loss: {:.5f}, Generator Loss: {:.5f}, Time elapsed {:.2f} mins"
            .format(epoch, np.asscalar(np.mean(d_losses)),
                    np.asscalar(np.mean(g_losses)),
                    (timer() - start_time) / 60))
    return g