Example #1
def run_clf(ebd_type, clf_type, index=None, result_suffix=''):
    start = time.time()
    print('start {} with model {}:'.format(ebd_type, clf_type))
    data, label = load_embedding(ebd_type, False, index)
    test_data, test_label = load_embedding(ebd_type, True)
    print('data loaded {}'.format(
        time.strftime('%Hh %Mm %Ss', time.gmtime(time.time() - start))))
    if clf_type == knn:
        clf = KNeighborsClassifier()
    elif clf_type == svm:
        clf = SVC()
    elif clf_type == random_forest:
        clf = RandomForestClassifier()
    else:
        print('undefined clf: {}'.format(clf_type))
        return  # bail out: otherwise clf is unbound and clf.fit() raises NameError
    clf.fit(data, label)
    print('model fitted {}'.format(
        time.strftime('%Hh %Mm %Ss', time.gmtime(time.time() - start))))
    result = clf.predict(test_data)
    print('model predicted {}'.format(
        time.strftime('%Hh %Mm %Ss', time.gmtime(time.time() - start))))
    with open(
            ebd_type + '_embedding/' + clf_type + '_result' + result_suffix +
            '.txt', 'w') as f:
        f.write(str(list(result)))
    print('end {} with model {}:'.format(ebd_type, clf_type))
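A minimal driver sketch for run_clf, assuming the module-level string constants (knn, svm, random_forest, fasttext, ...) and lists (clf_list, embedding_list) that the snippet references; the commented-out block in Example #10 invokes it the same way:

# Hedged sketch: knn, svm and fasttext are assumed to be module-level string
# constants (e.g. knn = 'knn'), which is why they also appear in file paths above.
for clf in clf_list:            # e.g. [knn, svm, random_forest]
    for ebd in embedding_list:  # e.g. [fasttext, transformer]
        run_clf(ebd, clf)       # writes <ebd>_embedding/<clf>_result.txt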
Example #2
def preload_wordvec_embed(dir_location):
    start = time.time()
    import dataloader
    embs = dataloader.load_embedding(
        os.path.join(dir_location, "embedding_filtered"))
    print("took {} seconds".format(time.time() - start))
    print("preloaded embeddings from amazon dataset.")
    print("")
    return embs
Example #3
def preload_embed():
    start = time.time()
    import dataloader
    embs = dataloader.load_embedding(
        "/home/jessedd/data/amazon_categories/original_mix/embedding_filtered")
    print("took {} seconds".format(time.time() - start))
    print("preloaded embeddings from amazon dataset.")
    print("")
    return embs
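Example #9 below builds its word lexicon from embs[0], which suggests load_embedding returns a (word list, vector matrix) pair; a minimal consumer sketch under that assumption:

# Sketch, assuming the preloaded object is a (words, vectors) tuple.
embs = preload_embed()
words, vectors = embs[0], embs[1]
word2row = {w: i for i, w in enumerate(words)}  # mirrors word_lexicon in Example #9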
Example #4
def evaluate_f1_score(dtype=''):
    _, label = load_embedding(fasttext, True)
    for clf in clf_list:
        for ebd in embedding_list:
            if dtype != origin:
                predict = load_result(ebd, clf, '_' + dtype)
            else:
                predict = load_result(ebd, clf, '')
            print('type:\t{}\t clf: \t{}\t ebd: \t{}\t f1-score: \t{:.2f}'.
                  format(dtype, clf, ebd,
                         f1_score(label, predict, average='macro')))
Example #5
    def __init__(self,
                 embedding,
                 hidden_size=150,
                 depth=1,
                 dropout=0.3,
                 cnn=False,
                 nclasses=2):
        super(Model, self).__init__()
        self.cnn = cnn
        self.drop = nn.Dropout(dropout)
        self.emb_layer = modules.EmbeddingLayer(
            embs=dataloader.load_embedding(embedding))
        self.word2id = self.emb_layer.word2id

        if cnn:
            self.encoder = modules.CNN_Text(self.emb_layer.n_d,
                                            widths=[3, 4, 5],
                                            filters=hidden_size)
            d_out = 3 * hidden_size
        else:
            self.encoder = nn.LSTM(
                self.emb_layer.n_d,
                hidden_size // 2,
                depth,
                dropout=dropout,
                # batch_first=True,
                bidirectional=True)
            d_out = hidden_size
        # else:
        #     self.encoder = SRU(
        #         emb_layer.n_d,
        #         args.d,
        #         args.depth,
        #         dropout = args.dropout,
        #     )
        #     d_out = args.d
        self.out = nn.Linear(d_out, nclasses)
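A minimal instantiation sketch for the class above, assuming `embedding` is the path that dataloader.load_embedding expects; the forward pass is not part of this excerpt and the path is a placeholder:

# Hypothetical usage; the embedding path is a placeholder.
model = Model('amazon/embedding_filtered', hidden_size=150, depth=1,
              dropout=0.3, cnn=False, nclasses=2)
print(model.out)  # Linear(d_out, nclasses)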
Example #6
def main(args):
    if args.dataset == 'mr':
        data, label = dataloader.read_MR(args.path)
    elif args.dataset == 'subj':
        data, label = dataloader.read_SUBJ(args.path)
    elif args.dataset == 'cr':
        data, label = dataloader.read_CR(args.path)
    elif args.dataset == 'mpqa':
        data, label = dataloader.read_MPQA(args.path)
    elif args.dataset == 'trec':
        train_x, train_y, test_x, test_y = dataloader.read_TREC(args.path)
        data = train_x + test_x
        label = None
    elif args.dataset == 'sst':
        train_x, train_y, valid_x, valid_y, test_x, test_y = dataloader.read_SST(args.path)
        data = train_x + valid_x + test_x
        label = None
    else:
        raise Exception("unknown dataset: {}".format(args.dataset))

    emb_layer = modules.EmbeddingLayer(
        args.d, data,
        embs = dataloader.load_embedding(args.embedding)
    )

    if args.dataset == 'trec':
        train_x, train_y, valid_x, valid_y = dataloader.cv_split2(
            train_x, train_y,
            nfold = 10,
            valid_id = args.cv
        )
    elif args.dataset != 'sst':
        train_x, train_y, valid_x, valid_y, test_x, test_y = dataloader.cv_split(
            data, label,
            nfold = 10,
            test_id = args.cv
        )

    nclasses = max(train_y)+1

    train_x, train_y = dataloader.create_batches(
        train_x, train_y,
        args.batch_size,
        emb_layer.word2id,
        sort = args.dataset == 'sst'
    )
    valid_x, valid_y = dataloader.create_batches(
        valid_x, valid_y,
        args.batch_size,
        emb_layer.word2id,
        sort = args.dataset == 'sst'
    )
    test_x, test_y = dataloader.create_batches(
        test_x, test_y,
        args.batch_size,
        emb_layer.word2id,
        sort = args.dataset == 'sst'
    )

    model = Model(args, emb_layer, nclasses).cuda()
    need_grad = lambda x: x.requires_grad
    optimizer = optim.Adam(
        filter(need_grad, model.parameters()),
        lr = args.lr
    )

    best_valid = 1e+8
    test_err = 1e+8
    for epoch in range(args.max_epoch):
        best_valid, test_err = train_model(epoch, model, optimizer,
            train_x, train_y,
            valid_x, valid_y,
            test_x, test_y,
            best_valid, test_err
        )
        if args.lr_decay>0:
            optimizer.param_groups[0]['lr'] *= args.lr_decay

    sys.stdout.write("best_valid: {:.6f}\n".format(
        best_valid
    ))
    sys.stdout.write("test_err: {:.6f}\n".format(
        test_err
    ))
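A hypothetical way to drive main(); the attribute names are inferred from the fields the function reads above, the values are placeholders, and Model may read further fields (e.g. cnn, dropout) not shown in this excerpt:

import argparse

args = argparse.Namespace(
    dataset='mr', path='data/', cv=0,            # dataset choice and CV fold
    d=300, embedding='glove.840B.300d.txt',      # embedding dim and vector file
    batch_size=32, lr=0.001, lr_decay=0.0, max_epoch=100,
)
main(args)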
Example #7
    logger.info(f"COMMAND: {cmd_msg}")
    write_to_file(f"{Meta.log_path}/cmd.txt", cmd_msg)

    logger.info(f"Config: {Meta.config}")
    write_to_file(f"{Meta.log_path}/config.txt", Meta.config)

    datasets = {}
    data = []

    for task_name in args.task:
        dataset, task_data = load_data(args.data_dir, task_name, args.cv)
        datasets[task_name] = dataset
        data += task_data

    emb_layer = EmbeddingLayer(
        args.dim, data, embs=load_embedding(args.embedding), fix_emb=args.fix_emb
    )

    dataloaders = []
    for task_name in args.task:
        dataloaders += create_dataloaders(
            task_name, datasets[task_name], args.batch_size, emb_layer.word2id
        )

    tasks = {
        task_name: create_task(
            task_name, args, datasets[task_name]["nclasses"], emb_layer
        )
        for task_name in args.task
    }
Example #8
    write_to_file(f"{Meta.log_path}/cmd.txt", cmd_msg)

    logger.info(f"Config: {Meta.config}")
    write_to_file(f"{Meta.log_path}/config.txt", Meta.config)

    datasets = {}
    data = []

    for task_name in args.task:
        dataset, task_data = load_data(args.data_dir, task_name, args.cv)
        datasets[task_name] = dataset
        data += task_data

    emb_layer = EmbeddingLayer(args.dim,
                               data,
                               embs=load_embedding(args.embedding),
                               fix_emb=args.fix_emb)

    dataloaders = []
    for task_name in args.task:
        dataloaders += create_dataloaders(task_name, datasets[task_name],
                                          args.batch_size, emb_layer.word2id)

    tasks = {
        task_name: create_task(task_name, args,
                               datasets[task_name]["nclasses"], emb_layer)
        for task_name in args.task
    }

    model = EmmentalModel(name="TC_task")
Example #9
def train():
  cmd = argparse.ArgumentParser(sys.argv[0], conflict_handler='resolve')
  cmd.add_argument('--seed', default=1, type=int, help='The random seed.')
  cmd.add_argument('--gpu', default=-1, type=int, help='Use id of gpu, -1 if cpu.')

  cmd.add_argument('--train_path', required=True, help='The path to the training file.')
  cmd.add_argument('--valid_path', help='The path to the development file.')
  cmd.add_argument('--test_path', help='The path to the testing file.')

  cmd.add_argument('--config_path', required=True, help='the path to the config file.')
  cmd.add_argument("--word_embedding", help="The path to word vectors.")

  cmd.add_argument('--optimizer', default='sgd', choices=['sgd', 'adam', 'adagrad'],
                   help='the type of optimizer: valid options=[sgd, adam, adagrad]')
  cmd.add_argument("--lr", type=float, default=0.01, help='the learning rate.')
  cmd.add_argument("--lr_decay", type=float, default=0, help='the learning rate decay.')

  cmd.add_argument("--model", required=True, help="path to save model")
  
  cmd.add_argument("--batch_size", "--batch", type=int, default=32, help='the batch size.')
  cmd.add_argument("--max_epoch", type=int, default=100, help='the maximum number of iteration.')
  
  cmd.add_argument("--clip_grad", type=float, default=5, help='the tense of clipped grad.')

  cmd.add_argument('--max_sent_len', type=int, default=20, help='maximum sentence length.')

  cmd.add_argument('--min_count', type=int, default=5, help='minimum word count.')

  cmd.add_argument('--max_vocab_size', type=int, default=150000, help='maximum vocabulary size.')

  cmd.add_argument('--save_classify_layer', default=False, action='store_true',
                   help="whether to save the classify layer")

  cmd.add_argument('--valid_size', type=int, default=0, help="size of validation dataset when there's no valid.")
  cmd.add_argument('--eval_steps', required=False, type=int, help='report every xx batches.')

  opt = cmd.parse_args(sys.argv[2:])

  with open(opt.config_path, 'r') as fin:
    config = json.load(fin)

  # Dump configurations
  print(opt)
  print(config)

  # set seed.
  torch.manual_seed(opt.seed)
  random.seed(opt.seed)
  if opt.gpu >= 0:
    torch.cuda.set_device(opt.gpu)
    if opt.seed > 0:
      torch.cuda.manual_seed(opt.seed)

  use_cuda = opt.gpu >= 0 and torch.cuda.is_available()

  token_embedder_name = config['token_embedder']['name'].lower()
  token_embedder_max_chars = config['token_embedder'].get('max_characters_per_token', None)
  if token_embedder_name == 'cnn':
    train_data = read_corpus(opt.train_path, token_embedder_max_chars, opt.max_sent_len)
  elif token_embedder_name == 'lstm':
    train_data = read_corpus(opt.train_path, opt.max_sent_len)
  else:
    raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name))

  logging.info('training instance: {}, training tokens: {}.'.format(len(train_data),
                                                                    sum([len(s) - 1 for s in train_data])))

  if opt.valid_path is not None:
    if token_embedder_name == 'cnn':
      valid_data = read_corpus(opt.valid_path, token_embedder_max_chars, opt.max_sent_len)
    elif token_embedder_name == 'lstm':
      valid_data = read_corpus(opt.valid_path, opt.max_sent_len)
    else:
      raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name))
    logging.info('valid instance: {}, valid tokens: {}.'.format(len(valid_data),
                                                                sum([len(s) - 1 for s in valid_data])))
  elif opt.valid_size > 0:
    train_data, valid_data = divide(train_data, opt.valid_size)
    logging.info('training instance: {}, training tokens after division: {}.'.format(
      len(train_data), sum([len(s) - 1 for s in train_data])))
    logging.info('valid instance: {}, valid tokens: {}.'.format(
      len(valid_data), sum([len(s) - 1 for s in valid_data])))
  else:
    valid_data = None

  if opt.test_path is not None:
    if token_embedder_name == 'cnn':
      test_data = read_corpus(opt.test_path, token_embedder_max_chars, opt.max_sent_len)
    elif token_embedder_name == 'lstm':
      test_data = read_corpus(opt.test_path, opt.max_sent_len)
    else:
      raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name))
    logging.info('testing instance: {}, testing tokens: {}.'.format(
      len(test_data), sum([len(s) - 1 for s in test_data])))
  else:
    test_data = None

  if opt.word_embedding is not None:
    embs = load_embedding(opt.word_embedding)
    word_lexicon = {word: i for i, word in enumerate(embs[0])}  
  else:
    embs = None
    word_lexicon = {}

  # Maintain the vocabulary. vocabulary is used in either WordEmbeddingInput or softmax classification
  vocab = get_truncated_vocab(train_data, opt.min_count)

  # Ensure index of '<oov>' is 0
  for special_word in ['<oov>', '<bos>', '<eos>',  '<pad>']:
    if special_word not in word_lexicon:
      word_lexicon[special_word] = len(word_lexicon)

  for word, _ in vocab:
    if word not in word_lexicon:
      word_lexicon[word] = len(word_lexicon)

  # Word Embedding
  if config['token_embedder']['word_dim'] > 0:
    word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon, fix_emb=False, embs=embs)
    logging.info('Word embedding size: {0}'.format(len(word_emb_layer.word2id)))
  else:
    word_emb_layer = None
    logging.info('Vocabulary size: {0}'.format(len(word_lexicon)))

  # Character Lexicon
  if config['token_embedder']['char_dim'] > 0:
    char_lexicon = {}
    for sentence in train_data:
      for word in sentence:
        for ch in word:
          if ch not in char_lexicon:
            char_lexicon[ch] = len(char_lexicon)

    for special_char in ['<bos>', '<eos>', '<oov>', '<pad>', '<bow>', '<eow>']:
      if special_char not in char_lexicon:
        char_lexicon[special_char] = len(char_lexicon)

    char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon, fix_emb=False)
    logging.info('Char embedding size: {0}'.format(len(char_emb_layer.word2id)))
  else:
    char_lexicon = None
    char_emb_layer = None

  train = create_batches(
    train_data, opt.batch_size, word_lexicon, char_lexicon, config, use_cuda=use_cuda)

  if opt.eval_steps is None:
    opt.eval_steps = len(train[0])
  logging.info('Evaluate every {0} batches.'.format(opt.eval_steps))

  if valid_data is not None:
    valid = create_batches(
      valid_data, opt.batch_size, word_lexicon, char_lexicon, config, sort=False, shuffle=False, use_cuda=use_cuda)
  else:
    valid = None

  if test_data is not None:
    test = create_batches(
      test_data, opt.batch_size, word_lexicon, char_lexicon, config, sort=False, shuffle=False, use_cuda=use_cuda)
  else:
    test = None

  label_to_ix = word_lexicon
  logging.info('vocab size: {0}'.format(len(label_to_ix)))
  
  nclasses = len(label_to_ix)

  model = Model(config, word_emb_layer, char_emb_layer, nclasses, use_cuda)
  logging.info(str(model))
  if use_cuda:
    model = model.cuda()

  need_grad = lambda x: x.requires_grad
  if opt.optimizer.lower() == 'adam':
    optimizer = optim.Adam(filter(need_grad, model.parameters()), lr=opt.lr)
  elif opt.optimizer.lower() == 'sgd':
    optimizer = optim.SGD(filter(need_grad, model.parameters()), lr=opt.lr)
  elif opt.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(filter(need_grad, model.parameters()), lr=opt.lr)
  else:
    raise ValueError('Unknown optimizer {}'.format(opt.optimizer.lower()))

  try:
    os.makedirs(opt.model)
  except OSError as exception:
    if exception.errno != errno.EEXIST:
      raise

  if config['token_embedder']['char_dim'] > 0:
    with codecs.open(os.path.join(opt.model, 'char.dic'), 'w', encoding='utf-8') as fpo:
      for ch, i in char_emb_layer.word2id.items():
        print('{0}\t{1}'.format(ch, i), file=fpo)

  with codecs.open(os.path.join(opt.model, 'word.dic'), 'w', encoding='utf-8') as fpo:
    for w, i in word_lexicon.items():
      print('{0}\t{1}'.format(w, i), file=fpo)

  json.dump(vars(opt), codecs.open(os.path.join(opt.model, 'config.json'), 'w', encoding='utf-8'))

  best_train = 1e+8
  best_valid = 1e+8
  test_result = 1e+8

  for epoch in range(opt.max_epoch):
    best_train, best_valid, test_result = train_model(epoch, opt, model, optimizer,
                                                      train, valid, test, best_train, best_valid, test_result)
    if opt.lr_decay > 0:
      optimizer.param_groups[0]['lr'] *= opt.lr_decay

  if valid_data is None:
    logging.info("best train ppl: {:.6f}.".format(best_train))
  elif test_data is None:
    logging.info("best train ppl: {:.6f}, best valid ppl: {:.6f}.".format(best_train, best_valid))
  else:
    logging.info("best train ppl: {:.6f}, best valid ppl: {:.6f}, test ppl: {:.6f}.".format(best_train, best_valid, test_result))
Example #10
def load_result(ebd_type, clf_type, result_suffix=''):
    path = ebd_type + '_embedding/' + clf_type + '_result' + result_suffix + '.txt'
    with open(path) as f:
        line = f.readline()
    return list(map(int, line.strip().strip('[').strip(']').split(',')))


def evaluate_f1_score(dtype=''):
    _, label = load_embedding(fasttext, True)
    for clf in clf_list:
        for ebd in embedding_list:
            if dtype != origin:
                predict = load_result(ebd, clf, '_' + dtype)
            else:
                predict = load_result(ebd, clf, '')
            print('type:\t{}\t clf: \t{}\t ebd: \t{}\t f1-score: \t{:.2f}'.
                  format(dtype, clf, ebd,
                         f1_score(label, predict, average='macro')))


if __name__ == '__main__':
    # for dtype in data_type_list:
    # dtype = small
    # for clf in clf_list:
    #     for ebd in embedding_list:
    #         run_clf(ebd, clf, load_index(dtype), '_' + dtype)
    # for dt in data_type_list:
    #     evaluate_f1_score(dt)
    ebd, _ = load_embedding(transformer)
    print(len(ebd[0]))
Example #11
def main(args):
    datasetList = ['mr', 'subj', 'cr', 'mpqa', 'trec', 'sst']
    numberOfTest = 5
    args.max_epoch = 100
    for dset in datasetList:
        if dset == 'mr':
            data, label = dataloader.read_MR(args.path)
        elif dset == 'subj':
            data, label = dataloader.read_SUBJ(args.path)
        elif dset == 'cr':
            data, label = dataloader.read_CR(args.path)
        elif dset == 'mpqa':
            data, label = dataloader.read_MPQA(args.path)
        elif dset == 'trec':
            train_x, train_y, test_x, test_y = dataloader.read_TREC(args.path)
            data = train_x + test_x
            label = None
        elif dset == 'sst':
            train_x, train_y, valid_x, valid_y, test_x, test_y = dataloader.read_SST(args.path)
            data = train_x + valid_x + test_x
            label = None
        else:
            raise Exception("unknown dataset: {}".format(dset))

        emb_layer = modules.EmbeddingLayer(
            args.d, data,
            embs = dataloader.load_embedding(args.embedding)
        )

        if dset == 'trec':
            train_x, train_y, valid_x, valid_y = dataloader.cv_split2(
                train_x, train_y,
                nfold = 10,
                valid_id = args.cv
            )
        elif dset != 'sst':
            train_x, train_y, valid_x, valid_y, test_x, test_y = dataloader.cv_split(
                data, label,
                nfold = 10,
                test_id = args.cv
            )
        nclasses = max(train_y)+1

        train_x, train_y = dataloader.create_batches(train_x, train_y, args.batch_size, emb_layer.word2id, sort = dset == 'sst')
        valid_x, valid_y = dataloader.create_batches(valid_x, valid_y, args.batch_size, emb_layer.word2id, sort = dset == 'sst')
        test_x, test_y = dataloader.create_batches(test_x, test_y, args.batch_size, emb_layer.word2id, sort = dset == 'sst')

        for models in range(3):
            if models == 1:
                args.cnn = True
                modelName = 'CNN'
            elif models == 2:
                args.cnn = False
                args.lstm = True
                modelName = 'LSTM'
            else:
                args.lstm = False
                modelName = 'SRU'

            sys.stdout.write("Training {} with {} architecture: \n".format(dset,modelName))
            args.dropout = 0.5


            for testNo in range(numberOfTest):
                model = Model(args, emb_layer, nclasses).cuda()
                need_grad = lambda x: x.requires_grad
                optimizer = optim.Adam(filter(need_grad, model.parameters()), lr = args.lr)

                best_valid = 1e+8
                test_err = 1e+8
                results = []
                for epoch in range(args.max_epoch):
                    results.append(train_model(epoch, model, optimizer, train_x, train_y, valid_x, valid_y, test_x, test_y, best_valid, test_err))
                
                with open('results_{d}_{m}_{i}.csv'.format(d=dset, m=modelName, i=(testNo+1)), 'w', newline='') as dump:
                    wr = csv.writer(dump, delimiter=',')
                    wr.writerow(['Epoch','Training Loss', 'Validation Error', 'Test Error', 'Duration'])
                    for idx, value in enumerate(results):
                        wr.writerow(value)