Example #1
0
def main():
    # prepare vocab
    if not (os.path.exists(conf.vocab_file)
            and os.path.getsize(conf.vocab_file)):
        logger.info(("word dictionary does not exist, "
                     "build it from the training data"))
        build_dict(conf.train_file, conf.vocab_file, conf.max_word_num,
                   conf.cutoff_word_fre)
    logger.info("load word dictionary.")
    word_dict = load_dict(conf.vocab_file)
    logger.info("dictionay size = %d" % (len(word_dict)))

    cost = rnn_lm(len(word_dict), conf.emb_dim, conf.hidden_size,
                  conf.stacked_rnn_num, conf.rnn_type)

    # define reader
    reader_args = {
        "file_name": conf.train_file,
        "word_dict": word_dict,
    }
    train_reader = paddle.batch(paddle.reader.shuffle(
        reader.rnn_reader(**reader_args), buf_size=102400),
                                batch_size=conf.batch_size)
    test_reader = None
    if os.path.exists(conf.test_file) and os.path.getsize(conf.test_file):
        test_reader = paddle.batch(paddle.reader.shuffle(
            reader.rnn_reader(**reader_args), buf_size=65536),
                                   batch_size=conf.batch_size)

    train(topology=cost,
          train_reader=train_reader,
          test_reader=test_reader,
          model_save_dir=conf.model_save_dir,
          num_passes=conf.num_passes)
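`build_dict` and `load_dict` above come from the example's own `utils`; as a rough illustration of what such helpers typically do, here is a minimal sketch (the signatures match the call above, but the file format and filtering logic are assumptions, not the original implementation):

import collections

def build_dict(train_file, vocab_file, max_word_num, cutoff_word_fre):
    # Count whitespace-separated tokens in the training corpus.
    counter = collections.Counter()
    with open(train_file, encoding="utf-8") as f:
        for line in f:
            counter.update(line.split())
    # Keep at most max_word_num words whose frequency reaches cutoff_word_fre.
    words = [w for w, c in counter.most_common(max_word_num)
             if c >= cutoff_word_fre]
    with open(vocab_file, "w", encoding="utf-8") as f:
        f.write("\n".join(words))

def load_dict(vocab_file):
    # Map each vocabulary word to its line index.
    with open(vocab_file, encoding="utf-8") as f:
        return {line.strip(): idx for idx, line in enumerate(f) if line.strip()}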
Example #2
0
def main(args):

    train_en, train_cn = utils.load_data(args.train_file)
    dev_en, dev_cn = utils.load_data(args.dev_file)
    args.num_train = len(train_en)
    args.num_dev = len(dev_en)

    # code.interact(local=locals())

    if os.path.isfile(args.vocab_file):
        en_dict, cn_dict, en_total_words, cn_total_words = pickle.load(
            open(args.vocab_file, "rb"))
    else:
        en_dict, en_total_words = utils.build_dict(train_en)
        cn_dict, cn_total_words = utils.build_dict(train_cn)
        pickle.dump([en_dict, cn_dict, en_total_words, cn_total_words],
                    open(args.vocab_file, "wb"))

    args.en_total_words = en_total_words
    args.cn_total_words = cn_total_words
    inv_en_dict = {v: k for k, v in en_dict.items()}
    inv_cn_dict = {v: k for k, v in cn_dict.items()}

    train_en, train_cn = utils.encode(train_en, train_cn, en_dict, cn_dict)
    train_data = utils.gen_examples(train_en, train_cn, args.batch_size)

    dev_en, dev_cn = utils.encode(dev_en, dev_cn, en_dict, cn_dict)
    dev_data = utils.gen_examples(dev_en, dev_cn, args.batch_size)

    code.interact(local=locals())
Example #3
0
def main(args):
    # preprocessing: word(en, cn) -> number(one hot vector)

    # load sentences (English and Chinese)
    train_en, train_cn = utils.load_data(args.train_file)
    dev_en, dev_cn = utils.load_data(args.dev_file)
    args.num_train = len(train_en)
    args.num_dev = len(dev_en)

    en_dict, en_total_words = utils.build_dict(train_en)
    cn_dict, cn_total_words = utils.build_dict(train_cn)
    inv_en_dict = {v: k for k, v in en_dict.items()}
    inv_cn_dict = {v: k for k, v in cn_dict.items()}

    args.en_total_words = en_total_words
    args.cn_total_words = cn_total_words

    # encode the words into numbers
    train_en, train_cn = utils.encode(train_en, train_cn, en_dict, cn_dict)
    dev_en, dev_cn = utils.encode(dev_en, dev_cn, en_dict, cn_dict)

    # convert the train and dev data into numpy matrices
    # batch_size * seq_length
    train_data = utils.gen_examples(train_en, train_cn, args.batch_size)
    dev_data = utils.gen_examples(dev_en, dev_cn, args.batch_size)

    model = models.EncoderDecoderModel()

    crit = utils.LanguageModelCriterion()
    learning_rate = args.learning_rate
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(args.num_epochs):
        for idx, (mb_x, mb_x_mask, mb_y, mb_y_mask) in enumerate(train_data):
            # convert numpy ndarray to Pytorch tensor
            # convert to Pytorch Variable
            batch_size = mb_x.shape[0]

            mb_x = Variable(torch.from_numpy(mb_x)).long()
            mb_x_mask = Variable(torch.from_numpy(mb_x_mask)).long()
            hidden = model.init_hidden(batch_size)
            mb_input = Variable(torch.from_numpy(mb_y[:, :-1])).long()
            mb_out = Variable(torch.from_numpy(mb_y[:, 1:])).long()
            mb_out_mask = Variable(torch.from_numpy(mb_y_mask[:, 1:])).long()

            mb_pred, hidden = model(mb_x, mb_x_mask, mb_input, hidden)

            # calculate loss function
            loss = crit(mb_pred, mb_out, mb_out_mask)

            # update the model
            optimizer.zero_grad()  # zero the previous gradient
            loss.backward()  # calculate gradient
            optimizer.step()  # gradient descent
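`utils.LanguageModelCriterion` is not shown here; assuming it is a masked cross-entropy over padded target sequences (which the `(pred, target, mask)` call above suggests), a minimal PyTorch sketch could look like this:

import torch.nn as nn
import torch.nn.functional as F

class MaskedCrossEntropy(nn.Module):
    # Average negative log-likelihood over the non-padded target positions.
    def forward(self, pred, target, mask):
        # pred: (batch, seq_len, vocab) scores; target, mask: (batch, seq_len)
        log_probs = F.log_softmax(pred, dim=-1)
        nll = -log_probs.gather(2, target.unsqueeze(2)).squeeze(2)
        mask = mask.float()
        return (nll * mask).sum() / mask.sum().clamp(min=1.0)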
Example #4
0
def main():
    tweets, emojis = utils.load_data(max_example=100)
    word_dict = utils.build_dict(tweets)
    # embeddings = utils.generate_embeddings(word_dict, dim=50, pretrained_path='data/glove.twitter.27B.50d.txt')
    embeddings = utils.generate_embeddings(word_dict,
                                           dim=50,
                                           pretrained_path=None)
Example #5
0
    def transform_data(self):
        self.train_facts = add_reverse_relations(self.train_facts)
        self.entity_dict, self.relation_dict = build_dict(
            itertools.chain(self.train_facts,
                            add_reverse_relations(self.test_facts),
                            add_reverse_relations(self.valid_facts)),
            entity_dict=self.entity_dict,
            relation_dict=self.relation_dict)
        self.id2entity = sorted(self.entity_dict.keys(),
                                key=self.entity_dict.get)
        self.id2relation = sorted(self.relation_dict.keys(),
                                  key=self.relation_dict.get)
        self.train_facts = translate_facts(self.train_facts,
                                           entity_dict=self.entity_dict,
                                           relation_dict=self.relation_dict)
        self.valid_facts = translate_facts(self.valid_facts,
                                           entity_dict=self.entity_dict,
                                           relation_dict=self.relation_dict)
        self.test_facts = translate_facts(self.test_facts,
                                          entity_dict=self.entity_dict,
                                          relation_dict=self.relation_dict)
        if self.rel2candidate:
            self.rel2candidate = {
                self.relation_dict[key]: list(map(self.entity_dict.get, value))
                for key, value in self.rel2candidate.items()
                if key in self.relation_dict
            }
        else:
            relations = set(map(lambda x: x[1], self.valid_facts)) | set(
                map(lambda x: x[1], self.test_facts))
            self.rel2candidate = {
                key: list(range(len(self.entity_dict)))
                for key in relations
            }
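For context, `build_dict` here consumes (head, relation, tail) fact triples and extends the entity and relation id mappings; a minimal sketch of that assumed behaviour (not the repository's actual code) is:

def build_dict(facts, entity_dict=None, relation_dict=None):
    # facts: an iterable of (head, relation, tail) triples of strings.
    entity_dict = dict(entity_dict or {})
    relation_dict = dict(relation_dict or {})
    for head, relation, tail in facts:
        entity_dict.setdefault(head, len(entity_dict))
        entity_dict.setdefault(tail, len(entity_dict))
        relation_dict.setdefault(relation, len(relation_dict))
    return entity_dict, relation_dict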
Example #6
0
    def train(self, dataset_train, labels, word_frequency = 15, document_frequency = 5):
        start = time.time()
        print("--------------------------------------------")
        print("%s Train Start" % self.name)

        self.labels = labels

        self.global_dict = utils.build_dict(dataset_train, word_frequency = word_frequency, document_frequency = document_frequency)
        self.num = len(self.global_dict)

        train_count = len(dataset_train)
        
        self.labels_word_total = {}
        self.labels_word_freq = {}
        self.labels_word_num = {}

        # compute the prior probability of each document class
        for label in labels:
            self.labels_p[label] = len(utils.GetFileLists(os.path.join(train_path, label))) * 1.0 / train_count

        # count word frequencies within each document class
        for name, data in dataset_train.items():
            label = name.split("/")[-2]

            if self.labels_word_freq.get(label) is None:
                self.labels_word_freq[label] = {}

            if self.labels_word_total.get(label) is None:
                self.labels_word_total[label] = 0
                self.labels_word_num[label] = 0

            for word, count in data.items():

                if self.global_dict.get(word) is None:
                    continue
                
                if self.labels_word_freq[label].get(word) is None:
                    self.labels_word_freq[label][word] = 1
                else:
                    self.labels_word_freq[label][word] += 1
                
                self.labels_word_total[label] += 1

        # compute the smoothed probability of each word per class
        for label, data in self.labels_word_freq.items():
            if self.labels_word_p.get(label) is None:
                self.labels_word_p[label] = {}
            
            for word, count in data.items():
                if self.global_dict.get(word) is not None:
                    self.labels_word_p[label][word] = (count * 1.0 + 1) / (self.labels_word_total[label] + self.num)
        
        stop = time.time()
        print("%s Train Finished has cost %fs" % (self.name, stop - start))
Example #7
0
def main():
    # prepare vocab
    if not (os.path.exists(config.dic_path)
            and os.path.getsize(config.dic_path)):
        logger.info(("word dictionary does not exist, "
                     "build it from the training data"))
        build_dict(config.train_data_path, config.dic_path,
                   config.max_word_num, config.cutoff_word_fre)
    logger.info("load word dictionary.")
    word_dict = load_dict(config.dic_path)
    logger.info("dictionary size = %d" % (len(word_dict)))

    train(train_data_path=config.train_data_path,
          test_data_path=config.test_data_path,
          word_dict=word_dict,
          batch_size=config.batch_size,
          num_passes=config.num_passes,
          share_semantic_generator=config.share_semantic_generator,
          share_embed=config.share_embed,
          num_workers=config.num_workers,
          use_gpu=config.use_gpu)
Example #8
0
def init():
    path = config.data_path
    config.embedding_file = os.path.join(path, config.embedding_file)
    config.embedding_vocab = os.path.join(path, config.embedding_vocab)
    config.train_file = os.path.join(path, config.train_file)
    config.test_file = os.path.join(path, config.test_file)

    # Config log
    if config.log_file is None:
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(message)s',
                            datefmt='%m-%d %H:%M')
    else:
        if not os.path.exists(config.save_path):
            os.makedirs(config.save_path)
        logging.basicConfig(filename=config.log_file,
                            filemode='a',
                            level=logging.DEBUG,
                            format='%(asctime)s %(message)s',
                            datefmt='%m-%d %H:%M')
    # Load data
    # data = (sentences, relations, e1_pos, e2_pos)
    train_data = utils.load_data(config.train_file)
    test_data = utils.load_data(config.test_file)

    logging.info('train data: %d' % len(train_data[0]))
    logging.info('test data: %d' % len(test_data[0]))

    # Build vocab
    word_dict = utils.build_dict(train_data[0] + test_data[0])
    logging.info('total words: %d' % len(word_dict))

    embeddings = utils.load_embedding(config, word_dict)

    # Log parameters
    flags = config.__dict__['__flags']
    flag_str = "\n"
    for k in flags:
        flag_str += "\t%s:\t%s\n" % (k, flags[k])
    logging.info(flag_str)

    # vectorize data
    # vec = (sents_vec, relations, e1_vec, e2_vec, dist1, dist2)
    max_len_train = len(max(train_data[0], key=lambda x: len(x)))
    max_len_test = len(max(test_data[0], key=lambda x: len(x)))
    max_len = max(max_len_train, max_len_test)
    config.max_len = max_len

    train_vec = utils.vectorize(train_data, word_dict, max_len)
    test_vec = utils.vectorize(test_data, word_dict, max_len)

    return embeddings, train_vec, test_vec
Example #9
0
def handler(req, args):
        session_id = utils.get_cookie(req, "session").strip()

        if session_id == "":
                logging.warning('Unauthorized attempt to access %s from %s' % (req.the_request, req.connection.remote_ip))
                return {"Location":"login.html", "error_msg":"Authorization required!"}

        con = MySQLdb.connect(  host = settings.database_settings["host"],
                                user = settings.database_settings["login"],
                                passwd = settings.database_settings["password"],
                                db = settings.database_settings["database"])

        cur = con.cursor()
        try:
                expired, user_id = utils.is_expired(cur, session_id)
                if expired:
                        return {"Location":"login.html", "error_msg":"You session has expired. Please log in"}

		if not args.has_key("id"):
			return {"Location":"editnews.html"}
		
		id = args["id"].strip()
		try:
			id = int(id)
		except ValueError:
			return {"Location":"editnews.html"}			

                if not preprocess.input_matches(req, args, expected_args):
                        cur.execute("""SELECT title, text FROM news WHERE id=%s """, id)
			row = cur.fetchone()
			if row is None:
				return {"Location":"editnews.html"}
			title = row[0]
			text = row[1]
			return postprocess.fill_page(template_path, "", "", utils.build_dict(expected_args, [id, title, text]))

                title = args["title"].strip()
                text = args["body"]

                xss_strip = xss.XssCleaner()
                title = xss_strip.strip(title)
                text = xss_strip.strip(text)

                cur.execute("""UPDATE news SET title=%s, text=%s WHERE id=%s""", (title, text, id))
	finally:
                con.commit()
                cur.close()
                con.close()

        return {"Location":"editnews.html", "notice_msg":"Post saved successfully"}
Example #10
0
def handler(req, args):

       	if not preprocess.input_matches(req, args, expected_args):
               	return postprocess.fill_page(template_path, "", "", utils.build_dict(expected_args, []))

        expression = args["expresion"].strip()
        if expression == "":
                return postprocess.fill_page(template_path, "", "The expression is empty", args)

        try:
                contents = str(eval(expression))
        except ZeroDivisionError:
                return postprocess.fill_page(template_path, "", "Division by Zero", args)
        except (ValueError, OverflowError):
                return postprocess.fill_page(template_path, "", "Some function in expression does not support specified domain", args)
Example #11
0
    def get(self):
        article = RequestHandler.get_argument(self, name='article')
        print('-----------------enter get...')
        article = translator.translate(article, src='auto', dest='en').text.lower().replace('.', ' .').replace(',', ' ,')

        try:
            print('---article_cn in get:', article)
            print('---article_en:', article)
        except Exception as e:
            print(str(e))
            pass

        print("Loading dictionary...")
        word_dict, reversed_dict, article_max_len, summary_max_len = build_dict("valid", args.toy)
        valid_x, valid_y = build_deploy(article, word_dict, article_max_len, summary_max_len)
        valid_x_len = list(map(lambda x: len([y for y in x if y != 0]), valid_x))

        batches = batch_iter(valid_x, valid_y, args.batch_size, 1)
        print("Start auto summarization...")
        for batch_x, batch_y in batches:
            batch_x_len = list(map(lambda x: len([y for y in x if y != 0]), batch_x))
            valid_feed_dict = {
                model.batch_size: len(batch_x),
                model.X: batch_x,
                model.X_len: batch_x_len,
            }
            t0 = time.time()
            prediction = sess.run(model.prediction, feed_dict=valid_feed_dict)
            prediction_output = list(map(lambda x: [reversed_dict[y] for y in x], prediction[:, 0, :]))

            print('inference time:', str(time.time() - t0) + 's')

            line = prediction_output[0]
            summary = list()
            for word in line:
                if word == "</s>":
                    break
                if word not in summary:
                    summary.append(word)
            title_pred = " ".join(summary)
            print('title_pred:', title_pred)
            title_cn = translator.translate(title_pred, src='auto', dest='zh-cn').text
            # print('title_cn:', title_cn)
            self.write(str(title_cn) + '\n')
Example #12
0
def main(args):
    args = config.get_args()
    print(args)
    entity_type=args.entity_type

    embeddings=args.embeddings

    split=False

    if split:
        # RUN THIS LINE TO CREATE THE SPLITS TO TRAIN-DEV-TEST
        train_file, outdir = create_data(entity_type, embeddings, args.experiment)
    else:
        # RUN THIS LINE IF YOU HAVE ALREADY DONE THE SPLITTING AND WANT TO ONLY CREATE EMBEDDINGS
        train_file='../data/%s/train.txt' % entity_type
        outdir='../data/%s/' % entity_type

    logging.info('-' * 50)
    logging.info('Load data files..')
    logging.info('*' * 10 + ' Train')
    train_examples = utils.load_data(train_file, embeddings)

    logging.info('-' * 50)
    logging.info('Build dictionary..')
    word_dicts, inv_word_dicts = utils.build_dict(train_examples, 3000)

    num_attr = len(inv_word_dicts)
    d_abs=1
    for i in inv_word_dicts:
        print(len(i))
        d_abs*=len(i)
    print("d_abs = %s" % "{:.2E}".format(Decimal(d_abs)))
    print("n_ex = %d" % len(train_examples))
    print("d_avgd = %s" % "{:.2E}".format(Decimal(d_abs/len(train_examples))))
    entropy = utils.compute_avg_entropy(train_examples, word_dicts)
    print("Entropy = %f" % entropy) 

    pickle.dump(word_dicts, open('%s/train_dicts.pickle' % outdir, 'wb'))
    pickle.dump(inv_word_dicts, open('%s/train_inv_dicts.pickle' % outdir, 'wb'))
Example #13
0
def data_loader(args):
    train_data, train_labels = utils.get_raw_data(args.train_file)  # load the raw sentences as a list
    val_data, val_labels = utils.get_raw_data(args.dev_file)

    args.catogories = ['EnterSports', 'Military', 'Economics', 'Technology', 'Government']
    args.cat_dict = dict(zip(args.catogories, range(len(args.catogories))))

    word_vocab, num_total_words = utils.build_dict(train_data)

    trainlabels_to_idx = [args.cat_dict[label] for label in train_labels]
    vallabels_to_idx = [args.cat_dict[label] for label in val_labels]

    train_data, train_labels = utils.encode(train_data, trainlabels_to_idx, word_vocab)
    val_data, val_labels = utils.encode(val_data, vallabels_to_idx, word_vocab)

    train_data = utils.pad_features(train_data, max_len=args.max_features)
    val_data = utils.pad_features(val_data, max_len=args.max_features)

    train_set = utils.batch(train_data.copy(), train_labels.copy(), args.batch_size)
    val_set = utils.batch(val_data.copy(), val_labels.copy(), args.batch_size)

    return train_set, val_set, num_total_words
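`utils.pad_features` above is assumed to truncate or zero-pad each encoded sentence to a fixed length; a minimal sketch of that behaviour (the real helper may pad differently) is:

import numpy as np

def pad_features(encoded_sentences, max_len):
    # Right-pad every id sequence with zeros (or truncate it) to exactly max_len.
    features = np.zeros((len(encoded_sentences), max_len), dtype=np.int64)
    for i, sent in enumerate(encoded_sentences):
        trimmed = sent[:max_len]
        features[i, :len(trimmed)] = trimmed
    return features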
Example #14
0
def handler(req, args):
        session_id = utils.get_cookie(req, "session").strip()

        if session_id == "":
                logging.warning('Unauthorized attempt to access %s from %s' % (req.the_request, req.connection.remote_ip))
                return {"Location":"login.html", "error_msg":"Authorization required!"}

        con = MySQLdb.connect(  host = settings.database_settings["host"],
                                user = settings.database_settings["login"],
                                passwd = settings.database_settings["password"],
                                db = settings.database_settings["database"])

        cur = con.cursor()
        try:
                expired, user_id = utils.is_expired(cur, session_id)
		if expired:
			return {"Location":"login.html", "error_msg":"You session has expired. Please log in"}

		if not preprocess.input_matches(req, args, expected_args):
                	return postprocess.fill_page(template_path, "", "", utils.build_dict(expected_args, []))

		title = args["title"].strip()
		text = args["body"]

		xss_strip = xss.XssCleaner()
		title = xss_strip.strip(title)
		text = xss_strip.strip(text)

		cur.execute("""INSERT INTO news (date, title, author, text) VALUES (now(), %s, %s, %s)""", (title, user_id, text))
		
        finally:
                con.commit()
                cur.close()
                con.close()

        return {"Location":"news.html", "notice_msg":"Post added successfully"}
Example #15
0
def main(args):
    logging.info('-' * 50)
    logging.info('Load data files..')

    if args.debug:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file,
                                         100,
                                         relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file,
                                       100,
                                       relabeling=args.relabeling)
    else:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file,
                                         relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file,
                                       args.max_dev,
                                       relabeling=args.relabeling)

    args.num_train = len(train_examples[0])
    args.num_dev = len(dev_examples[0])

    logging.info('-' * 50)
    logging.info('Build dictionary..')
    word_dict = utils.build_dict(train_examples[0] + train_examples[1])
    entity_markers = list(
        set([w for w in word_dict.keys() if w.startswith('@entity')] +
            train_examples[2]))
    entity_markers = ['<unk_entity>'] + entity_markers
    entity_dict = {w: index for (index, w) in enumerate(entity_markers)}
    logging.info('Entity markers: %d' % len(entity_dict))
    args.num_labels = len(entity_dict)

    logging.info('-' * 50)
    # Load embedding file
    embeddings = utils.gen_embeddings(word_dict, args.embedding_size,
                                      args.embedding_file)
    (args.vocab_size, args.embedding_size) = embeddings.shape
    logging.info('Compile functions..')
    train_fn, test_fn, params = build_fn(args, embeddings)
    logging.info('Done.')
    if args.prepare_model:
        return train_fn, test_fn, params

    logging.info('-' * 50)
    logging.info(args)

    logging.info('-' * 50)
    logging.info('Initial test..')
    dev_x1, dev_x2, dev_l, dev_y = utils.vectorize(dev_examples, word_dict,
                                                   entity_dict)
    assert len(dev_x1) == args.num_dev
    all_dev = gen_examples(dev_x1, dev_x2, dev_l, dev_y, args.batch_size)
    dev_acc = eval_acc(test_fn, all_dev)
    logging.info('Dev accuracy: %.2f %%' % dev_acc)
    best_acc = dev_acc

    if args.test_only:
        return

    utils.save_params(args.model_file, params, epoch=0, n_updates=0)

    # Training
    logging.info('-' * 50)
    logging.info('Start training..')
    train_x1, train_x2, train_l, train_y = utils.vectorize(
        train_examples, word_dict, entity_dict)
    assert len(train_x1) == args.num_train
    start_time = time.time()
    n_updates = 0

    all_train = gen_examples(train_x1, train_x2, train_l, train_y,
                             args.batch_size)
    for epoch in range(args.num_epoches):
        np.random.shuffle(all_train)
        for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l,
                  mb_y) in enumerate(all_train):
            logging.info('#Examples = %d, max_len = %d' %
                         (len(mb_x1), mb_x1.shape[1]))
            train_loss = train_fn(mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y)
            logging.info(
                'Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)'
                % (epoch, idx, len(all_train), train_loss,
                   time.time() - start_time))
            n_updates += 1

            if n_updates % args.eval_iter == 0:
                samples = sorted(
                    np.random.choice(args.num_train,
                                     min(args.num_train, args.num_dev),
                                     replace=False))
                sample_train = gen_examples([train_x1[k] for k in samples],
                                            [train_x2[k] for k in samples],
                                            train_l[samples],
                                            [train_y[k] for k in samples],
                                            args.batch_size)
                logging.info('Train accuracy: %.2f %%' %
                             eval_acc(test_fn, sample_train))
                dev_acc = eval_acc(test_fn, all_dev)
                logging.info('Dev accuracy: %.2f %%' % dev_acc)
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    logging.info(
                        'Best dev accuracy: epoch = %d, n_updates = %d, acc = %.2f %%'
                        % (epoch, n_updates, dev_acc))
                    utils.save_params(args.model_file,
                                      params,
                                      epoch=epoch,
                                      n_updates=n_updates)
Example #16
0
def main(args):

    train_en, train_cn = utils.load_data(args.train_file)
    dev_en, dev_cn = utils.load_data(args.dev_file)
    args.num_train = len(train_en)
    args.num_dev = len(dev_en)

    # code.interact(local=locals())

    if os.path.isfile(args.vocab_file):
        en_dict, cn_dict, en_total_words, cn_total_words = pickle.load(
            open(args.vocab_file, "rb"))
    else:
        en_dict, en_total_words = utils.build_dict(train_en)
        cn_dict, cn_total_words = utils.build_dict(train_cn)
        pickle.dump([en_dict, cn_dict, en_total_words, cn_total_words],
                    open(args.vocab_file, "wb"))

    args.en_total_words = en_total_words
    args.cn_total_words = cn_total_words
    inv_en_dict = {v: k for k, v in en_dict.items()}
    inv_cn_dict = {v: k for k, v in cn_dict.items()}

    train_en, train_cn = utils.encode(train_en, train_cn, en_dict, cn_dict)
    train_data = utils.gen_examples(train_en, train_cn, args.batch_size)

    dev_en, dev_cn = utils.encode(dev_en, dev_cn, en_dict, cn_dict)
    dev_data = utils.gen_examples(dev_en, dev_cn, args.batch_size)

    if os.path.isfile(args.model_file):
        model = torch.load(args.model_file)
    elif args.model == "EncoderDecoderModel":
        model = EncoderDecoderModel(args)

    if args.use_cuda:
        model = model.cuda()

    crit = utils.LanguageModelCriterion()

    learning_rate = args.learning_rate
    optimizer = getattr(optim, args.optimizer)(model.parameters(),
                                               lr=learning_rate)

    total_num_sentences = 0.
    total_time = 0.
    for epoch in range(args.num_epochs):
        np.random.shuffle(train_data)
        total_train_loss = 0.
        total_num_words = 0.
        for idx, (mb_x, mb_x_mask, mb_y,
                  mb_y_mask) in tqdm(enumerate(train_data)):

            batch_size = mb_x.shape[0]
            total_num_sentences += batch_size
            mb_x = Variable(torch.from_numpy(mb_x)).long()
            mb_x_mask = Variable(torch.from_numpy(mb_x_mask)).long()
            hidden = model.init_hidden(batch_size)
            mb_input = Variable(torch.from_numpy(mb_y[:, :-1])).long()
            mb_out = Variable(torch.from_numpy(mb_y[:, 1:])).long()
            mb_out_mask = Variable(torch.from_numpy(mb_y_mask[:, 1:]))

            if args.use_cuda:
                mb_x = mb_x.cuda()
                mb_x_mask = mb_x_mask.cuda()
                mb_input = mb_input.cuda()
                mb_out = mb_out.cuda()
                mb_out_mask = mb_out_mask.cuda()

            mb_pred, hidden = model(mb_x, mb_x_mask, mb_input, hidden)

            loss = crit(mb_pred, mb_out, mb_out_mask)
            num_words = torch.sum(mb_out_mask).data[0]
            total_train_loss += loss.data[0] * num_words
            total_num_words += num_words

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("training loss: %f" % (total_train_loss / total_num_words))
Example #17
0
def handler(req, args):
	if args.has_key("error_msg"):
		error_msg = args["error_msg"]
	else:
		error_msg = ""
		

        if not preprocess.input_matches(req, args, expected_args):
	        return postprocess.fill_page(template_path, "", error_msg, utils.build_dict(expected_args, []))
	elif args.has_key("error_msg"):
		return postprocess.fill_page(template_path, "", error_msg, args)
		
	
        login = args["login"].strip()
        passwd = args["passwd"]

        if login == "" or passwd == "":
                return postprocess.fill_page(template_path, "", "Username or password not specified", args)


        con = MySQLdb.connect(  host = settings.database_settings["host"],
                                user = settings.database_settings["login"],
                                passwd = settings.database_settings["password"],
                                db = settings.database_settings["database"])

        cur = con.cursor()
        try:
                cur.execute("""	SELECT logins.id, login, passwd, name, surname 
				FROM logins JOIN emails
				ON logins.emails_ref = emails.id 
				WHERE login=%s and passwd=%s""", (login, md5.new(passwd).digest()) )
                result = cur.fetchone()
                if result is None:
                        return postprocess.fill_page(template_path, "", "Bad username or password", args)

		name = result[3]
		surname = result[4]

		#generate session id
		session_id = guid.generate("") # ANALYSIS FIX
		expire_time = utils.get_session_expire_time()
		cur.execute("""DELETE FROM sessions WHERE expire_time < now()""")
		cur.execute("""INSERT INTO sessions (session_id, expire_time, user_id) VALUES (%s, %s, %s) """, (session_id, expire_time, result[0]))
		
		#set cookie
		req.headers_out.add( "Set-Cookie", utils.forge_cookie("session", session_id, "/"))

		#process statistics
		UserAgent = ""
		if req.headers_in.has_key("User-Agent"):
			UserAgent = req.headers_in["User-Agent"]
		if UserAgent.find("/") > 0:
			UserAgent = UserAgent[0:UserAgent.find("/")]

		if len(UserAgent) > 0:
			cur.execute("SELECT id FROM stat_browser WHERE browser = '%s'" % (UserAgent,))
			result = cur.fetchone()
	                if result is None:
				cur.execute("INSERT INTO stat_browser (browser, counter) VALUES (%s, %s)", (UserAgent, 1))
			else:
				cur.execute("UPDATE stat_browser SET counter = counter + 1 WHERE id = %s", (result[0], ))
        finally:
		con.commit()
                cur.close()
                con.close()

	logging.info('Login of %s from %s' % (login, req.connection.remote_ip))
	return {"Location":"news.html", "notice_msg":"Hello, %s %s!" % (name, surname)}
Example #18
0
def handler(req, args):

	if not preprocess.input_matches(req, args, expected_args):
		return postprocess.fill_page(template_path, "", "", utils.build_dict(expected_args, []))
	
	login = args["login"].strip()
	name = cgi.escape(args["name"].strip())
	surname = cgi.escape(args["surname"].strip())
	email = args["email"].strip()
	passwd = args["passwd"]

	if login == "":
		return postprocess.fill_page(template_path, "", "'login' is required field and cannot be empty", args)

	if validator.invalid_login_re.search(login):
		return postprocess.fill_page(template_path, "", "Only characters, numbers and underscore are allowed in login", args)

	if passwd == "":
		return postprocess.fill_page(template_path, "", "'password' is required field and cannot be empty", args)
	
	if passwd != args["passwd_confirm"]:
		return postprocess.fill_page(template_path, "", "Entered passwords do not match", args)

	if validator.invalid_passwd_re.search(passwd):
		return postprocess.fill_page(template_path, "", "Whitespaces are not allowed in passwords", args)

	if email == "":
		return postprocess.fill_page(template_path, "", "'email' is required field and cannot be empty", args)
	
	if not validator.valid_email_re.match(email):
		return postprocess.fill_page(template_path, "", "You have entered email address in bad format", args)


	con = MySQLdb.connect(	host = settings.database_settings["host"], 
				user = settings.database_settings["login"],
				passwd = settings.database_settings["password"],
				db = settings.database_settings["database"])

	cur = con.cursor()
	try:
		#check if this login was not used
		cur.execute("SELECT login FROM logins WHERE login=%s", (login, ) )
		result = cur.fetchone()
		if result:
			return postprocess.fill_page(template_path, "", "The specified login is already used by someone", args)

		#check if this email was already inserted
		cur.execute("""SELECT id, email FROM emails WHERE email='%s'""" % (email, ) )
		result = cur.fetchone()

		if result is None:
			cur.execute("""INSERT INTO emails (email, name, surname) VALUES (%s, %s, %s)""", (email, name, surname))
			cur.execute("""SELECT LAST_INSERT_ID() """)
			result = cur.fetchone()

		cur.execute("""INSERT INTO logins (login, passwd, emails_ref) VALUES (%s, %s, %s)""", (login, md5.new(passwd).digest(), int(result[0])))
	finally:
		con.commit()
		cur.close()
		con.close()

	return {"Location":"login.html", "notice_msg":"Registration successful!"}
Example #19
0
def main():

    start = timer()

    if (os.path.isfile("data/tweets" + str(max_example) + ".npy")
            and os.path.isfile("data/emojis" + str(max_example) + ".npy")):
        tweets = np.load("data/tweets" + str(max_example) + ".npy").tolist()
        emojis = np.load("data/emojis" + str(max_example) + ".npy").tolist()
    else:
        tweets, emojis = utils.load_data(path='data/final_train',
                                         max_example=max_example)
        np.save("data/tweets" + str(max_example) + ".npy", np.array(tweets))
        np.save("data/emojis" + str(max_example) + ".npy", np.array(emojis))

    if (os.path.isfile("data/dev_tweets" + str(max_dev_example) + ".npy") and
            os.path.isfile("data/dev_emojis" + str(max_dev_example) + ".npy")):
        dev_tweets = np.load("data/dev_tweets" + str(max_dev_example) +
                             ".npy").tolist()
        dev_emojis = np.load("data/dev_emojis" + str(max_dev_example) +
                             ".npy").tolist()
    else:
        dev_tweets, dev_emojis = utils.load_data(max_example=max_dev_example)
        np.save("data/dev_tweets" + str(max_dev_example) + ".npy",
                np.array(dev_tweets))
        np.save("data/dev_emojis" + str(max_dev_example) + ".npy",
                np.array(dev_emojis))

    start1 = timer()
    print(start1 - start)

    word_dict = utils.build_dict(tweets)
    # embeddings = utils.generate_embeddings(word_dict, dim=300, pretrained_path='data/glove.6B.300d.txt')
    embeddings = utils.generate_embeddings(word_dict,
                                           dim=300,
                                           pretrained_path=None)

    end0 = timer()
    print(end0 - start1)

    x, y = utils.vectorize(tweets, emojis, word_dict)
    dev_x, dev_y = utils.vectorize(dev_tweets, dev_emojis, word_dict)

    end1 = timer()
    print(end1 - end0)

    batch_size, input_size, hidden_size, output_size, layers = 32, 300, 200, 20, 1
    all_train = utils.generate_batches(x, y, batch_size=batch_size)
    all_dev = utils.generate_batches(dev_x, dev_y, batch_size=batch_size)

    end2 = timer()
    print(end2 - end1)

    # set the parameters
    # batch_size, input_size, hidden_size, output_size, layers = 64, 50, 200, 20, 1
    vocabulary_size = len(embeddings)

    if run_GRU:
        print("running GRU...")
        # initialize the model
        model = GRU_Classifier(vocabulary_size, input_size, hidden_size,
                               output_size, layers, run_BD_GRU)
        model.word_embeddings.weight.data = torch.FloatTensor(
            embeddings.tolist())
        if torch.cuda.is_available():
            model.cuda()
            (model.word_embeddings.weight.data).cuda()

        loss_function = nn.CrossEntropyLoss()
        if torch.cuda.is_available():
            loss_function.cuda()

        optimizer = optim.Adam(model.parameters(), lr=global_learning_rate)
        epoch_num = 500
        it = 0
        best_dev_acc = 0
        best_f1 = 0

        # model training
        for epoch in range(epoch_num):
            np.random.shuffle(all_train)
            for idx, (mb_x, mb_y, mb_lengths) in enumerate(all_train):
                # sort the input in descending order according to sentence length
                # This is required by nn.utils.rnn.pack_padded_sequence
                sorted_index = len_value_argsort(mb_lengths)
                mb_x = [mb_x[i] for i in sorted_index]
                mb_y = [mb_y[i] for i in sorted_index]
                mb_lengths = [mb_lengths[i] for i in sorted_index]

                print('#Examples = %d, max_seq_len = %d' %
                      (len(mb_x), len(mb_x[0])))
                mb_x = Variable(torch.from_numpy(np.array(mb_x,
                                                          dtype=np.int64)),
                                requires_grad=False)
                if torch.cuda.is_available():
                    mb_x = mb_x.cuda()

                y_pred = model(mb_x.t(), mb_lengths)
                mb_y = Variable(torch.from_numpy(np.array(mb_y,
                                                          dtype=np.int64)),
                                requires_grad=False)
                if torch.cuda.is_available():
                    mb_y = mb_y.cuda()
                loss = loss_function(y_pred, mb_y)
                # print('epoch ', epoch, 'batch ', idx, 'loss ', loss.data[0])

                optimizer.zero_grad()
                loss.backward(retain_graph=True)
                optimizer.step()
                it += 1

                if it % 100 == 0:  # every 100 updates, check dev accuracy
                    correct = 0
                    n_examples = 0
                    ground_truth = []
                    predicted = []
                    for idx, (d_x, d_y, d_lengths) in enumerate(all_dev):
                        ground_truth += d_y
                        n_examples += len(d_x)

                        sorted_index = len_value_argsort(d_lengths)
                        d_x = [d_x[i] for i in sorted_index]
                        d_y = [d_y[i] for i in sorted_index]
                        d_lengths = [d_lengths[i] for i in sorted_index]

                        d_x = Variable(torch.from_numpy(
                            np.array(d_x, dtype=np.int64)),
                                       requires_grad=False)
                        if torch.cuda.is_available():
                            d_x = d_x.cuda()

                        # use pytorch way to calculate the correct count
                        d_y = Variable(torch.from_numpy(
                            np.array(d_y, dtype=np.int64)),
                                       requires_grad=False)
                        if torch.cuda.is_available():
                            d_y = d_y.cuda()
                        y_pred = model(d_x.t(), d_lengths)
                        predicted += list(
                            torch.max(y_pred, 1)[1].view(d_y.size()).data)
                        correct += (torch.max(y_pred, 1)[1].view(
                            d_y.size()).data == d_y.data).sum()

                    dev_acc = correct / n_examples
                    f1 = f1_score(ground_truth, predicted, average='macro')
                    print("Dev Accuracy: %f, F1 Score: %f" % (dev_acc, f1))
                    if f1 > best_f1:
                        best_f1 = f1
                        print("Best F1 Score: %f" % best_f1)
                        gru_output = open('./out/gru_best', 'w')
                        gru_output.write(str(ground_truth) + '\n')
                        gru_output.write(str(predicted) + '\n')
                        gru_output.write(str(best_f1) + ' ' + str(dev_acc))
                        gru_output.close()

                    if dev_acc > best_dev_acc:
                        best_dev_acc = dev_acc
                        print("Best Dev Accuracy: %f" % best_dev_acc)

    if run_LSTM:
        print("Running LSTM...")
        model = LSTM_Classifier(vocabulary_size, input_size, hidden_size,
                                output_size, layers, run_BD_LSTM)
        model.word_embeddings.weight.data = torch.FloatTensor(
            embeddings.tolist())
        if torch.cuda.is_available():
            model.cuda()
            (model.word_embeddings.weight.data).cuda()

        loss_function = nn.CrossEntropyLoss()
        if torch.cuda.is_available():
            loss_function.cuda()

        optimizer = optim.Adam(model.parameters(), lr=global_learning_rate)
        it = 0
        best_dev_acc = 0
        best_f1 = 0
        epoch_num = 500

        # train LSTM
        for epoch in range(epoch_num):
            np.random.shuffle(all_train)

            for idx, (mb_x, mb_y, mb_lengths) in enumerate(all_train):
                sorted_index = len_value_argsort(mb_lengths)
                mb_x = [mb_x[i] for i in sorted_index]
                mb_y = [mb_y[i] for i in sorted_index]
                mb_lengths = [mb_lengths[i] for i in sorted_index]
                print('#Examples = %d, max_seq_len = %d' %
                      (len(mb_x), len(mb_x[0])))

                mb_x = Variable(torch.from_numpy(np.array(mb_x,
                                                          dtype=np.int64)),
                                requires_grad=False)
                if torch.cuda.is_available():
                    mb_x = mb_x.cuda()

                y_pred = model(mb_x.t(), mb_lengths)
                mb_y = Variable(torch.from_numpy(np.array(mb_y,
                                                          dtype=np.int64)),
                                requires_grad=False)
                if torch.cuda.is_available():
                    mb_y = mb_y.cuda()

                loss = loss_function(y_pred, mb_y)
                # print('epoch ', epoch, 'batch ', idx, 'loss ', loss.data[0])

                optimizer.zero_grad()
                loss.backward(retain_graph=True)
                optimizer.step()
                it += 1

                if it % 100 == 0:  # every 100 updates, check dev accuracy
                    correct = 0
                    n_examples = 0
                    ground_truth = []
                    predicted = []
                    for idx, (d_x, d_y, d_lengths) in enumerate(all_dev):
                        ground_truth += d_y
                        n_examples += len(d_x)

                        sorted_index = len_value_argsort(d_lengths)
                        d_x = [d_x[i] for i in sorted_index]
                        d_y = [d_y[i] for i in sorted_index]
                        d_lengths = [d_lengths[i] for i in sorted_index]

                        d_x = Variable(torch.from_numpy(
                            np.array(d_x, dtype=np.int64)),
                                       requires_grad=False)
                        if torch.cuda.is_available():
                            d_x = d_x.cuda()

                        d_y = Variable(torch.from_numpy(
                            np.array(d_y, dtype=np.int64)),
                                       requires_grad=False)
                        if torch.cuda.is_available():
                            d_y = d_y.cuda()
                        y_pred = model(d_x.t(), d_lengths)
                        predicted += list(
                            torch.max(y_pred, 1)[1].view(d_y.size()).data)
                        correct += (torch.max(y_pred, 1)[1].view(
                            d_y.size()).data == d_y.data).sum()

                    dev_acc = correct / n_examples
                    f1 = f1_score(ground_truth, predicted, average='macro')
                    print("Dev Accuracy: %f, F1 Score: %f" % (dev_acc, f1))
                    if f1 > best_f1:
                        best_f1 = f1
                        print("Best F1 Score: %f" % best_f1)
                        lstm_output = open('./out/lstm_best', 'w')
                        lstm_output.write(str(ground_truth) + '\n')
                        lstm_output.write(str(predicted) + '\n')
                        lstm_output.write(str(best_f1) + ' ' + str(dev_acc))
                        lstm_output.close()

                    if dev_acc > best_dev_acc:
                        best_dev_acc = dev_acc
                        print("Best Dev Accuracy: %f" % best_dev_acc)
Example #20
0
    parser.add_argument("--num_epochs", type=int, default=128, help="Number of epochs.")
    parser.add_argument("--keep_prob", type=float, default=0.9, help="Dropout keep prob.")
    parser.add_argument("--restoreInTrain", type=bool, default=True, help="restore in train")
    parser.add_argument("--toy", action="store_true", help="Use only 50K samples of data")
    parser.add_argument("--with_model", action="store_true", help="Continue from previously saved model")
    parser.add_argument("--checkoutPath", type=str, default='saved_model/checkpoint', help='save path')

parser = argparse.ArgumentParser()
add_arguments(parser)
args = parser.parse_args()

with open("args.pickle", "rb") as f:
    args = pickle.load(f)

print("Loading dictionary...")
word_dict, reversed_dict, article_max_len, summary_max_len = build_dict()
print("Loading training dataset...")
valid_x, valid_y = get_text_list1(flag="dev")

valid_x_len = [len([y for y in x if y != 0]) for x in valid_x]

with tf.Session() as sess:
    print("Loading saved model...")
    model = getModel(sess, reversed_dict, article_max_len, summary_max_len, args, forward=True)
    # model = Model(reversed_dict, article_max_len, summary_max_len, args, forward_only=True)
    # saver = tf.train.Saver(tf.global_variables())
    # ckpt = tf.train.get_checkpoint_state("./saved_model/")
    # if ckpt:
    #     saver.restore(sess, tf.train.latest_checkpoint(ckpt.model_checkpoint_path))

    #batches = batch_iter(valid_x, [0] * len(valid_x), args.batch_size, 1)
Example #21
0
    filelist = utils.GetFileLists(train_path)

    if not os.path.exists(tf_path):

        result = utils.ReadDirsToStem(raw_path)

        with open(file=tf_path, mode="w", encoding="ISO-8859-1") as f:
            f.write(str(result))

    with open(file=tf_path, mode="r", encoding="ISO-8859-1") as f:
        result = eval(f.read())
        result_new = {}
        num_document = len(filelist)
        print(num_document)
        for file in filelist:
            result_new[file] = result[os.path.join(
                raw_path,
                os.path.join(file.split("/")[-2],
                             file.split("/")[-1]))]

    dic = utils.build_dict(result_new,
                           word_frequency=word_frequency,
                           document_frequency=document_frequency)

    dic_names = []
    for key in dic:
        dic_names.append(key)

    vector_space = buildVSM(result_new, dic, dic_names, num_document)
Example #22
0
                    if X_test[i][j] != 0:
                        _prob += math.log(
                            self.prob[int(c) - 1][j]) * X_test[i][j]
                if _prob > _max:
                    _max = _prob
                    _c = c
            y_pred.append(_c)
        return y_pred

    def accuracy(self, y_test, y_pred):
        count = 0
        for i in range(len(y_test)):
            if str(y_test[i]) == str(y_pred[i]):
                count += 1
        print('Acc: %.2f' % (count * 100 / len(y_test)), end=' %')


if __name__ == '__main__':
    X_, y = utils.parse_file('training_data.txt')
    utils.build_dict(X_)
    DICT = utils.load_dict()
    X = np.zeros((len(X_), len(DICT)))
    for i in range(len(X_)):
        X[i] = utils.bag_of_word(X_[i], DICT)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
    model = MultinomialNB(0.1)
    model.fit(X_train, y_train)
    print(X_test.shape[1])
    y_pred = model.predict(X_test)
    model.accuracy(y_test, y_pred)
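`utils.bag_of_word` is assumed to map one tokenized document onto a count vector aligned with the dictionary indices; a minimal sketch under that assumption (the actual DICT layout in the repository may differ):

import numpy as np

def bag_of_word(tokens, word_dict):
    # word_dict: {word: column index}; returns the document's term-count vector.
    vec = np.zeros(len(word_dict))
    for token in tokens:
        idx = word_dict.get(token)
        if idx is not None:
            vec[idx] += 1
    return vec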
Example #23
0
def main(args):
    logging.info('-' * 50)
    logging.info('Load data files..')

    if args.debug:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, 5, relabeling=args.relabeling,
                                         remove_notfound=args.remove_notfound)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, 100, relabeling=args.relabeling,
                                       remove_notfound=False)
    #elif args.test_only:
    #    logging.info('*' * 10 + ' Train')
    #    #train_examples = utils.load_cnn_data(args.train_file, relabeling=args.relabeling)  # docs, qs, ans
    #    train_examples = utils.load_data(args.train_file, relabeling=args.relabeling, remove_notfound=args.remove_notfound)  # docs, qs, ans
    #    logging.info('*' * 10 + ' Dev')
    #    dev_examples = utils.load_data(args.dev_file, args.max_dev, relabeling=args.relabeling,
    #                                   remove_notfound=False)
    elif args.cnn_train:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_cnn_data(args.train_file, relabeling=args.relabeling, has_ids=args.train_has_ids)  # docs, qs, ans
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_cnn_data(args.dev_file, args.max_dev, relabeling=args.relabeling, has_ids=args.dev_has_ids)
    else:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, relabeling=args.relabeling,
                                         remove_notfound=args.remove_notfound)  # docs, qs, ans
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, args.max_dev, relabeling=args.relabeling,
                                       remove_notfound=False)

    args.num_train = len(train_examples[0])
    args.num_dev = len(dev_examples[0])

    logging.info('-' * 50)
    logging.info('Build dictionary..')
    word_dict = utils.build_dict(train_examples[0] + train_examples[1],  # + dev_examples[0] + dev_examples[1],
                                 max_words=args.max_words)  # docs+qs
    entity_markers = list(set([w for w in word_dict.keys()
                              if w.startswith('@entity')] + train_examples[2]))
    entity_markers = ['<unk_entity>'] + entity_markers
    entity_dict = {w: index for (index, w) in enumerate(entity_markers)}
    inv_entity_dict = {index: w for w, index in entity_dict.items()}
    assert len(entity_dict) == len(inv_entity_dict)
    logging.info('Entity markers: %d' % len(entity_dict))
    args.num_labels = len(entity_dict)

    logging.info('-' * 50)
    # Load embedding file
    embeddings = utils.gen_embeddings(word_dict, args.embedding_size, args.embedding_file)
    (args.vocab_size, args.embedding_size) = embeddings.shape
    logging.info('Compile functions..')
    train_fn, test_fn, params = build_fn(args, embeddings)
    logging.info('Done.')

    logging.info('-' * 50)
    logging.info(args)

    logging.info('-' * 50)
    logging.info('Initial test..')
    dev_x1, dev_x2, dev_l, dev_y, dev_ids = utils.vectorize(dev_examples, word_dict, entity_dict,
                                                   remove_notfound=False,
                                                   relabeling=args.relabeling)
    if dev_ids is not None:
        assert len(dev_y) == len(dev_ids)
    assert len(dev_x1) == args.num_dev
    all_dev = gen_examples(dev_x1, dev_x2, dev_l, dev_y, args.batch_size)
    dev_acc, dev_preds = eval_acc(test_fn, all_dev)

    if dev_ids is not None:
        assert len(dev_ids) == len(dev_preds) == len(dev_y)
        dev_preds_data = to_output_preds(dev_ids, dev_preds, inv_entity_dict, args.relabeling)
    logging.info('Dev accuracy: %.2f %%' % dev_acc)
    best_acc = dev_acc

    if args.log_file is not None:
        assert args.log_file.endswith(".log")
        run_name = args.log_file[:args.log_file.find(".log")]
        if dev_ids is not None:
            preds_file_name = run_name + ".preds"
            utils.write_preds(dev_preds_data, preds_file_name)
            utils.external_eval(preds_file_name,
                                run_name + ".preds.scores",
                                eval_data="test" if "test" in os.path.basename(args.dev_file) else "dev")
    if args.test_only:
        return

    if args.log_file is not None:
        utils.save_params(run_name + ".model", params, epoch=0, n_updates=0)

    # Training
    logging.info('-' * 50)
    logging.info('Start training..')
    train_x1, train_x2, train_l, train_y, train_ids = utils.vectorize(train_examples, word_dict, entity_dict,
                                                           remove_notfound=args.remove_notfound,
                                                           relabeling=args.relabeling)
    assert len(train_x1) == args.num_train
    start_time = time.time()
    n_updates = 0
    train_accs = []
    dev_accs = []
    all_train = gen_examples(train_x1, train_x2, train_l, train_y, args.batch_size)
    improved = []
    for epoch in range(args.num_epoches):
        ep_acc_improved = False
        np.random.shuffle(all_train)
        for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y) in enumerate(all_train):
            logging.info('#Examples = %d, max_len = %d' % (len(mb_x1), mb_x1.shape[1]))
            train_loss = train_fn(mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y)
            logging.info('Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)' %
                         (epoch, idx, len(all_train), train_loss, time.time() - start_time))
            n_updates += 1

            if n_updates % args.eval_iter == 0:
                samples = sorted(np.random.choice(args.num_train, min(args.num_train, args.num_dev),
                                                  replace=False))
                sample_train = gen_examples([train_x1[k] for k in samples],
                                            [train_x2[k] for k in samples],
                                            train_l[samples],
                                            [train_y[k] for k in samples],
                                            args.batch_size)
                train_acc, train_preds = eval_acc(test_fn, sample_train)
                train_accs.append(train_acc)
                logging.info('Train accuracy: %.2f %%' % train_acc)
                dev_acc, dev_preds = eval_acc(test_fn, all_dev)
                dev_accs.append(dev_acc)
                logging.info('Dev accuracy: %.2f %%' % dev_acc)
                utils.update_plot(args.eval_iter, train_accs, dev_accs, file_name=args.log_file + ".html")
                if dev_acc > best_acc:
                    ep_acc_improved = True
                    best_acc = dev_acc
                    logging.info('Best dev accuracy: epoch = %d, n_updates = %d, acc = %.2f %%'
                                 % (epoch, n_updates, dev_acc))
                    if args.log_file is not None:
                        utils.save_params(run_name + ".model", params, epoch=epoch, n_updates=n_updates)
                        if dev_ids is not None:
                            dev_preds_data = to_output_preds(dev_ids, dev_preds, inv_entity_dict, args.relabeling)
                            utils.write_preds(dev_preds_data, preds_file_name)
                            utils.external_eval(preds_file_name, run_name + ".preds.scores", eval_data="dev")
        improved.append(ep_acc_improved)
        # early stop
        if len(improved) > 25 and sum(improved[-3:]) == 0:
            break
Example #24
0
# -*- coding: utf-8 -*-

from pca import kpca_train, pca_train
from utils import build_dict
from cv2_implementation import detect_faces
import sys

if len(sys.argv) < 2:
    print('An input mode is required: \'pca\' or \'kpca\'')
    sys.exit(1)
mode = sys.argv[1]
if mode != 'pca' and mode != 'kpca':
    print('Invalid option. Input mode must be either \'pca\' or \'kpca\'')
    sys.exit(1)

print('Please wait. Training is in progress...')
if mode == 'pca':
    eigenfaces = pca_train()
else:
    eigenfaces = kpca_train()
print('Training ready.')

names = build_dict()
detect_faces(mode, eigenfaces, names)
Example #25
0
def main(args):
    logging.info('-' * 50 + '')
    logging.info('Loading data...')
    if args.debug:
        train_examples = utils.load_data(args.train_file, 100)
        dev_examples = utils.load_data(args.dev_file, 100)
    else:
        train_examples = utils.load_data(args.train_file)
        dev_examples = utils.load_data(args.dev_file)

    args.num_train = len(train_examples[1])
    args.num_dev = len(dev_examples[1])

    logging.info('-' * 50)
    logging.info('Building dictionary...')
    word_dict = utils.build_dict(train_examples[0] + train_examples[1])
    entity_markers = list(
        set([w for w in word_dict.keys() if w.startswith('@entity')] +
            train_examples[2]))
    entity_markers = ['<entity_unk>'] + entity_markers
    entity_dict = {w: i for (i, w) in enumerate(entity_markers)}
    logging.info('# of Entity Markers: %d' % len(entity_dict))
    args.num_labels = len(entity_dict)

    logging.info('-' * 50)
    logging.info('Generating embedding...')
    embeddings = utils.gen_embeddings(word_dict, args.embedding_size,
                                      args.embedding_file)
    embeddings = embeddings.astype('float32')
    args.vocab_size, args.embedding_size = embeddings.shape

    logging.info('-' * 50)
    logging.info('Creating TF computation graph...')

    if args.rnn_type == 'lstm':
        logging.info('Using LSTM Cells')
    elif args.rnn_type == 'gru':
        logging.info('Using GRU Cells')

    # tf.reset_default_graph()
    d_input = tf.placeholder(dtype=tf.int32,
                             shape=(None, None),
                             name="d_input")
    q_input = tf.placeholder(
        dtype=tf.int32, shape=(None, None),
        name="q_input")  # [batch_size, max_seq_length_for_batch]
    l_mask = tf.placeholder(dtype=tf.float32,
                            shape=(None, None),
                            name="l_mask")  # [batch_size, entity num]
    y = tf.placeholder(dtype=tf.int32, shape=None,
                       name="label")  # batch size vector
    y_1hot = tf.placeholder(
        dtype=tf.float32, shape=(None, None),
        name="label_1hot")  # onehot encoding of y [batch_size, entitydict]
    training = tf.placeholder(dtype=tf.bool)

    word_embeddings = tf.get_variable(
        "glove",
        shape=(args.vocab_size, args.embedding_size),
        initializer=tf.constant_initializer(embeddings))

    W_bilinear = tf.Variable(
        tf.random_uniform((2 * args.hidden_size, 2 * args.hidden_size),
                          minval=-0.01,
                          maxval=0.01))

    with tf.variable_scope(
            'd_encoder'):  # Encoding Step for Passage (d_ for document)
        d_embed = tf.nn.embedding_lookup(
            word_embeddings, d_input
        )  # Apply embeddings: [batch, max passage length in batch, GloVe Dim]
        d_embed_dropout = tf.layers.dropout(
            d_embed, rate=args.dropout_rate,
            training=training)  # Apply Dropout to embedding layer
        if args.rnn_type == 'lstm':
            d_cell_fw = rnn.LSTMCell(args.hidden_size)
            d_cell_bw = rnn.LSTMCell(args.hidden_size)
        elif args.rnn_type == 'gru':
            d_cell_fw = rnn.GRUCell(
                args.hidden_size
            )  # TODO: kernel_initializer=tf.random_normal_initializer(0,0.1) not working for 1.1
            d_cell_bw = rnn.GRUCell(args.hidden_size)

        d_outputs, _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw,
                                                       d_cell_bw,
                                                       d_embed_dropout,
                                                       dtype=tf.float32)
        d_output = tf.concat(
            d_outputs, axis=-1
        )  # [batch, len, h], len is the max passage length, and h is the hidden size

    with tf.variable_scope('q_encoder'):  # Encoding Step for Question
        q_embed = tf.nn.embedding_lookup(word_embeddings, q_input)
        q_embed_dropout = tf.layers.dropout(q_embed,
                                            rate=args.dropout_rate,
                                            training=training)
        if args.rnn_type == 'lstm':
            q_cell_fw = rnn.LSTMCell(args.hidden_size)
            q_cell_bw = rnn.LSTMCell(args.hidden_size)
        elif args.rnn_type == 'gru':
            q_cell_fw = rnn.GRUCell(args.hidden_size)
            q_cell_bw = rnn.GRUCell(args.hidden_size)
        q_outputs, q_laststates = tf.nn.bidirectional_dynamic_rnn(
            q_cell_fw, q_cell_bw, q_embed_dropout, dtype=tf.float32)
        if args.rnn_type == 'lstm':
            q_output = tf.concat([q_laststates[0][-1], q_laststates[1][-1]],
                                 axis=-1)  # (batch, h)
        elif args.rnn_type == 'gru':
            q_output = tf.concat(q_laststates, axis=-1)  # (batch, h)

    with tf.variable_scope('bilinear'):  # Bilinear Layer (Attention Step)
        # M computes the similarity between each passage word and the entire question encoding
        M = d_output * tf.expand_dims(tf.matmul(q_output, W_bilinear),
                                      axis=1)  # [batch, h] -> [batch, 1, h]
        # alpha represents the normalized weights representing how relevant the passage word is to the question
        alpha = tf.nn.softmax(tf.reduce_sum(M, axis=2))  # [batch, len]
        # this output contains the weighted combination of all contextual embeddings
        bilinear_output = tf.reduce_sum(d_output *
                                        tf.expand_dims(alpha, axis=2),
                                        axis=1)  # [batch, h]

    with tf.variable_scope('dense'):  # Prediction Step
        # the final output has dimension [batch, entity#], giving the probabilities of an entity being the answer for examples
        final_prob = tf.layers.dense(
            bilinear_output,
            units=args.num_labels,
            activation=tf.nn.softmax,
            kernel_initializer=tf.random_uniform_initializer(
                minval=-0.01, maxval=0.01))  # [batch, entity#]

    pred = final_prob * l_mask  # ignore entities that don't appear in the passage
    train_pred = pred / tf.expand_dims(
        tf.reduce_sum(pred, axis=1),
        axis=1)  # redistribute probabilities ignoring certain labels
    train_pred = tf.clip_by_value(train_pred, 1e-7, 1.0 - 1e-7)

    test_pred = tf.cast(tf.argmax(pred, axis=-1), tf.int32)
    acc = tf.reduce_sum(tf.cast(tf.equal(test_pred, y), tf.int32))

    loss_op = tf.reduce_mean(
        -tf.reduce_sum(y_1hot * tf.log(train_pred), reduction_indices=[1]))
    optimizer = tf.train.GradientDescentOptimizer(
        learning_rate=args.learning_rate)
    train_op = optimizer.minimize(loss_op)
    logging.info('Done!')

    logging.info('-' * 50)
    logging.info('Printing args...')
    logging.info(args)

    logging.info('-' * 50)
    logging.info('Initial Test...')
    dev_x1, dev_x2, dev_l, dev_y = utils.vectorize(dev_examples, word_dict,
                                                   entity_dict)
    all_dev = gen_examples(dev_x1, dev_x2, dev_l, dev_y, args.batch_size)

    dev_acc = 0.  # TODO: first dev accuracy displays here
    logging.info('Dev Accuracy: %.2f %%' % dev_acc)
    best_acc = dev_acc

    saver = tf.train.Saver()

    logging.info('-' * 50)
    logging.info('Testing...')
    if args.test_only:
        if args.test_file is None:
            raise ValueError("No test file specified")
        test_examples = utils.load_data(args.test_file)
        test_x1, test_x2, test_l, test_y = utils.vectorize(
            test_examples, word_dict, entity_dict)
        all_test = gen_examples(test_x1, test_x2, test_l, test_y,
                                args.batch_size)
        with tf.Session() as sess:
            # saver = tf.train.import_meta_graph(args.model_path + '.meta')
            saver.restore(sess, args.model_path)
            # TODO: which file to restore?

            correct = 0
            n_examples = 0
            for t_x1, t_mask1, t_x2, t_mask2, t_l, t_y in all_test:
                correct += sess.run(acc,
                                    feed_dict={
                                        d_input: t_x1,
                                        q_input: t_x2,
                                        y: t_y,
                                        l_mask: t_l,
                                        training: False
                                    })
                n_examples += len(t_x1)
            test_acc = correct * 100. / n_examples
            logging.info('Test Accuracy: %.2f %%' % test_acc)
        return

    logging.info('-' * 50)
    logging.info('Start training...')
    train_x1, train_x2, train_l, train_y = utils.vectorize(
        train_examples, word_dict, entity_dict)
    all_train = gen_examples(train_x1, train_x2, train_l, train_y,
                             args.batch_size)

    init = tf.global_variables_initializer()

    start_time = time.time()
    n_updates = 0
    with tf.Session() as sess:
        sess.run(init)
        for e in range(args.num_epoches):
            np.random.shuffle(all_train)
            for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l,
                      mb_y) in enumerate(all_train):
                logging.info(
                    'Batch Size = %d, # of Examples = %d, max_len = %d' %
                    (mb_x1.shape[0], len(mb_x1), mb_x1.shape[1]))

                y_label = np.zeros((mb_x1.shape[0], args.num_labels))
                for r, i in enumerate(
                        mb_y):  # convert (batch) -> (batch, entity_size)
                    y_label[r][i] = 1.

                _, train_loss = sess.run(
                    [train_op, loss_op],
                    feed_dict={
                        d_input: mb_x1,
                        q_input: mb_x2,
                        y_1hot: y_label,
                        l_mask: mb_l,
                        training: True
                    })
                logging.info(
                    'Epoch = %d, Iter = %d (max = %d), Loss = %.2f, Elapsed Time = %.2f (s)'
                    % (e, idx, len(all_train), train_loss,
                       time.time() - start_time))
                n_updates += 1

                if n_updates % args.eval_iter == 0:
                    saver.save(sess, args.model_path, global_step=e)
                    correct = 0
                    n_examples = 0
                    for d_x1, d_mask1, d_x2, d_mask2, d_l, d_y in all_dev:
                        correct += sess.run(acc,
                                            feed_dict={
                                                d_input: d_x1,
                                                q_input: d_x2,
                                                y: d_y,
                                                l_mask: d_l,
                                                training: False
                                            })
                        n_examples += len(d_x1)
                    dev_acc = correct * 100. / n_examples
                    logging.info('Dev Accuracy: %.2f %%' % dev_acc)
                    if dev_acc > best_acc:
                        best_acc = dev_acc
                        logging.info(
                            'Best Dev Accuracy: epoch = %d, n_updates (iter) = %d, acc = %.2f %%'
                            % (e, n_updates, dev_acc))

        logging.info('-' * 50)
        logging.info('Training Finished...')
        logging.info("Model saved in file: %s" %
                     saver.save(sess, args.model_path))
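
# A standalone NumPy sketch (illustrative shapes only, not from the example above) of the
# bilinear attention step in the 'bilinear' scope: score each passage position against the
# question encoding, softmax over positions, then take the weighted sum of passage states.
import numpy as np

batch, seq_len, h = 2, 5, 8                          # hypothetical sizes; 2*h matches the BiRNN output
d_output = np.random.randn(batch, seq_len, 2 * h)    # passage encodings [batch, len, 2h]
q_output = np.random.randn(batch, 2 * h)             # question encoding [batch, 2h]
W_bilinear = np.random.randn(2 * h, 2 * h)

M = d_output * (q_output @ W_bilinear)[:, None, :]   # [batch, len, 2h]
scores = M.sum(axis=2)                                # [batch, len]
alpha = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)   # softmax over len
bilinear_output = (d_output * alpha[:, :, None]).sum(axis=1)         # [batch, 2h]
print(bilinear_output.shape)                          # (2, 16)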
Exemple #26
0
embedding_size = 100
embedding_file = 'data/glove.6B/glove.6B.50d.txt'
hidden_size = 128
embedding_file = None
dropout_rate = 0.2
learning_rate = 0.05
eval_iter = 10
batch_size = 10


file_name = '/Users/yangsun/Desktop/dataset/training_cnn.txt'
val_file_name = '/Users/yangsun/Desktop/dataset/validation_cnn.txt'
model_path = './model_path'

documents, questions, answers = utils.load_data(file_name, 10)
word_dict = utils.build_dict(documents + questions)

documents_val, questions_val, answers_val = utils.load_data(val_file_name, 100)
word_dict_val = utils.build_dict(documents_val + questions_val)

entity_markers = list(set([w for w in word_dict.keys() if w.startswith('@entity')] + answers))


entity_markers = ['<unk_entity>'] + entity_markers
entity_dict = {w: index for (index, w) in enumerate(entity_markers)}
num_labels = len(entity_dict)
embeddings = utils.gen_embeddings(word_dict, embedding_size, embedding_file)
vocab_size, embedding_size = embeddings.shape


# tf.reset_default_graph()
Exemple #27
0
def main(args):
    logging.info('-' * 50)
    logging.info('Load data files..')
    question_belong = []
    if args.debug:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, 100)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, 100)
        test_examples = dev_examples
    else:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file)
        test_examples = utils.load_data(args.test_file)
    args.num_train = len(train_examples)
    args.num_dev = len(dev_examples)
    args.relations = len(train_examples[0])

    logging.info('-' * 50)
    logging.info('Build dictionary..')
    word_dicts, inv_word_dicts = utils.build_dict(train_examples, args.max_cat)
    default_value = []
    for word_dict in word_dicts:
        default_value.append(word_dict[''])
    #logging.info(word_dicts[1])
    #logging.info(inv_word_dicts[1])

    #utils.store_labels_to_pkl(inv_word_dicts)
    #sys.exit(0)
    args.default_value = default_value
    embeddings = utils.gen_embeddings(word_dicts, args.embedding_size)
    train_fn, test_fn, params = build_fn(args, embeddings)
    logging.info('Done.')
    logging.info('-' * 50)
    logging.info(args)

    topk_acc = args.topk_accuracy
    # topk_acc = 1

    labels_data = []
    if args.test_print_allowed:
        labels_data = pickle.load(open(labels_file, 'rb'))

    logging.info('-' * 50)
    logging.info('Initial test..')
    dev_data, dev_mask = utils.vectorize(dev_examples, word_dicts, args)
    all_dev = gen_examples(dev_data, dev_mask, args.batch_size)
    dev_acc = eval_acc(test_fn, all_dev, inv_word_dicts, topk_acc)
    logging.info('Dev accuracy: %s %%' % str(dev_acc))
    test_data, test_mask = utils.vectorize(test_examples, word_dicts, args, args.test_print_allowed, labels_data)
    all_test = gen_examples(test_data, test_mask, args.batch_size)
    test_acc = eval_acc(test_fn, all_test, inv_word_dicts, topk_acc, args.test_print_allowed, labels_data)
    logging.info('Test accuracy: %s %%' % str(test_acc))
    best_acc = dev_acc
    if args.test_only:
        return
    utils.save_params(args.model_file, params, epoch=0, n_updates=0)

    #utils.store_labels_to_pkl(inv_word_dicts)
    # Training
    if args.num_epoches > 0:
        logging.info('-' * 50)
        logging.info('Start training..')
        train_data, train_mask = utils.vectorize(train_examples, word_dicts, args)
        start_time = time.time()
        n_updates = 0
        all_train_old = gen_examples(train_data, train_mask, args.batch_size)

        all_train = utils.oversample(all_train_old, args)

        no_progress = 0
    for epoch in range(args.num_epoches):
        np.random.shuffle(all_train)
        for idx, inps in enumerate(all_train):
            train_loss = train_fn(*inps)
            if idx % 1000 == 0:
                #logging.info('#Examples = %d, max_len = %d' % (len(mb_x1), mb_x1.shape[1]))
                logging.info('Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)' % (epoch, idx, len(all_train), train_loss, time.time() - start_time))
            n_updates += 1
            if n_updates % args.eval_iter == 0:
                samples = sorted(np.random.choice(args.num_train, min(args.num_train, args.num_dev),
                                                  replace=False))
                train_data_sample = [train_data[j][samples] for j in range(args.relations)]
                train_mask_sample = [train_mask[j][samples] for j in range(args.relations)]
                sample_train = gen_examples(train_data_sample, train_mask_sample, args.batch_size)
                #acc = eval_acc(test_fn, sample_train)
                #logging.info('Train accuracy: %s %%' % str(acc))
                dev_acc = eval_acc(test_fn, all_dev, inv_word_dicts, topk_acc)
                logging.info('Dev accuracy: %s %%' % str(dev_acc))
                #test_acc = eval_acc(test_fn, all_test)
                #logging.info('Test accuracy: %s %%' % str(test_acc))
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    logging.info('Best dev accuracy!')
                    utils.save_params(args.model_file, params, epoch=epoch, n_updates=n_updates)
                    no_progress = 0
                else:
                    no_progress += 1
                    logging.info('Dev accuracy has not improved in the past %d evaluations' % no_progress)
                    if no_progress >= MAX_NO_PROGRESS:
                        logging.info("Reached the limit of stagnation. Exiting now...")
                        sys.exit(0)
Exemple #28
0
def train(topology,
          train_data_dir=None,
          test_data_dir=None,
          word_dict_path=None,
          label_dict_path=None,
          model_save_dir="models",
          batch_size=32,
          num_passes=10):

    if not os.path.exists(model_save_dir):
        os.mkdir(model_save_dir)

    use_default_data = (train_data_dir is None)

    if use_default_data:
        logger.info(("No training data are provided, "
                     "use paddle.dataset.imdb to train the model."))
        logger.info("please wait to build the word dictionary ...")

        word_dict = paddle.dataset.imdb.word_dict()
        train_reader = paddle.batch(paddle.reader.shuffle(
            lambda: paddle.dataset.imdb.train(word_dict)(), buf_size=51200),
                                    batch_size=100)
        test_reader = paddle.batch(lambda: paddle.dataset.imdb.test(word_dict)(),
                                   batch_size=100)

        class_num = 2
    else:
        if word_dict_path is None or not os.path.exists(word_dict_path):
            logger.info(("word dictionary is not given, the dictionary "
                         "is automatically built from the training data."))

            build_dict(data_dir=train_data_dir,
                       save_path=word_dict_path,
                       use_col=1,
                       cutoff_fre=5,
                       insert_extra_words=["<UNK>"])

        if not os.path.exists(label_dict_path):
            logger.info(("label dictionary is not given, the dictionary "
                         "is automatically built from the training data."))
            # build the label dictionary to map the original string-typed
            # label into integer-typed index
            build_dict(data_dir=train_data_dir,
                       save_path=label_dict_path,
                       use_col=0)

        word_dict = load_dict(word_dict_path)

        lbl_dict = load_dict(label_dict_path)
        class_num = len(lbl_dict)
        logger.info("class number is : %d." % (len(lbl_dict)))

        train_reader = paddle.batch(paddle.reader.shuffle(reader.train_reader(
            train_data_dir, word_dict, lbl_dict),
                                                          buf_size=51200),
                                    batch_size=batch_size)

        if test_data_dir is not None:
            # here, because training and testing data share a same format,
            # we still use the reader.train_reader to read the testing data.
            test_reader = paddle.batch(reader.train_reader(
                test_data_dir, word_dict, lbl_dict),
                                       batch_size=batch_size)
        else:
            test_reader = None

    dict_dim = len(word_dict)
    logger.info("length of word dictionary is : %d." % (dict_dim))

    paddle.init(use_gpu=False, trainer_count=1)

    # network config
    cost, prob, label = topology(dict_dim, class_num)

    # create parameters
    parameters = paddle.parameters.create(cost)

    # create optimizer
    adam_optimizer = paddle.optimizer.Adam(
        learning_rate=1e-3,
        regularization=paddle.optimizer.L2Regularization(rate=1e-3),
        model_average=paddle.optimizer.ModelAverage(average_window=0.5))

    # create trainer
    trainer = paddle.trainer.SGD(cost=cost,
                                 extra_layers=paddle.evaluator.auc(
                                     input=prob, label=label),
                                 parameters=parameters,
                                 update_equation=adam_optimizer)

    # begin training network
    feeding = {"word": 0, "label": 1}

    def _event_handler(event):
        """
        Define end batch and end pass event handler
        """
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                logger.info(
                    "Pass %d, Batch %d, Cost %f, %s\n" %
                    (event.pass_id, event.batch_id, event.cost, event.metrics))

        if isinstance(event, paddle.event.EndPass):
            if test_reader is not None:
                result = trainer.test(reader=test_reader, feeding=feeding)
                logger.info("Test at Pass %d, %s \n" %
                            (event.pass_id, result.metrics))
            with gzip.open(
                    os.path.join(model_save_dir,
                                 "cnn_params_pass_%05d.tar.gz" %
                                 event.pass_id), "w") as f:
                trainer.save_parameter_to_tar(f)

    trainer.train(reader=train_reader,
                  event_handler=_event_handler,
                  feeding=feeding,
                  num_passes=num_passes)

    logger.info("Training has finished.")
Exemple #29
0
def main(args):
    logging.info('-' * 50)
    logging.info('Load data files..')

    if args.debug:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, 100, relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, 100, relabeling=args.relabeling)
    else:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, args.max_dev, relabeling=args.relabeling)

    args.num_train = len(train_examples[0])
    args.num_dev = len(dev_examples[0])

    logging.info('-' * 50)
    logging.info('Build dictionary..')
    word_dict = utils.build_dict(train_examples[0] + train_examples[1])
    entity_markers = list(set([w for w in word_dict.keys()
                              if w.startswith('@entity')] + train_examples[2]))
    entity_markers = ['<unk_entity>'] + entity_markers
    entity_dict = {w: index for (index, w) in enumerate(entity_markers)}
    logging.info('Entity markers: %d' % len(entity_dict))
    args.num_labels = len(entity_dict)

    logging.info('-' * 50)
    # Load embedding file
    embeddings = utils.gen_embeddings(word_dict, args.embedding_size, args.embedding_file)
    (args.vocab_size, args.embedding_size) = embeddings.shape
    logging.info('Compile functions..')
    train_fn, test_fn, params = build_fn(args, embeddings)
    logging.info('Done.')

    logging.info('-' * 50)
    logging.info(args)

    logging.info('-' * 50)
    logging.info('Initial test..')
    dev_x1, dev_x2, dev_l, dev_y = utils.vectorize(dev_examples, word_dict, entity_dict)
    assert len(dev_x1) == args.num_dev
    all_dev = gen_examples(dev_x1, dev_x2, dev_l, dev_y, args.batch_size)
    dev_acc = eval_acc(test_fn, all_dev)
    logging.info('Dev accuracy: %.2f %%' % dev_acc)
    best_acc = dev_acc

    if args.test_only:
        return

    utils.save_params(args.model_file, params, epoch=0, n_updates=0)

    # Training
    logging.info('-' * 50)
    logging.info('Start training..')
    train_x1, train_x2, train_l, train_y = utils.vectorize(train_examples, word_dict, entity_dict)
    assert len(train_x1) == args.num_train
    start_time = time.time()
    n_updates = 0

    all_train = gen_examples(train_x1, train_x2, train_l, train_y, args.batch_size)
    for epoch in range(args.num_epoches):
        np.random.shuffle(all_train)
        for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y) in enumerate(all_train):
            logging.info('#Examples = %d, max_len = %d' % (len(mb_x1), mb_x1.shape[1]))
            train_loss = train_fn(mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y)
            logging.info('Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)' %
                         (epoch, idx, len(all_train), train_loss, time.time() - start_time))
            n_updates += 1

            if n_updates % args.eval_iter == 0:
                samples = sorted(np.random.choice(args.num_train, min(args.num_train, args.num_dev),
                                                  replace=False))
                sample_train = gen_examples([train_x1[k] for k in samples],
                                            [train_x2[k] for k in samples],
                                            train_l[samples],
                                            [train_y[k] for k in samples],
                                            args.batch_size)
                train_acc = eval_acc(test_fn, sample_train)
                logging.info('Train accuracy: %.2f %%' % train_acc)
                dev_acc = eval_acc(test_fn, all_dev)
                logging.info('Dev accuracy: %.2f %%' % dev_acc)
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    logging.info('Best dev accuracy: epoch = %d, n_updates = %d, acc = %.2f %%'
                                 % (epoch, n_updates, dev_acc))
                    utils.save_params(args.model_file, params, epoch=epoch, n_updates=n_updates)
Exemple #30
0
def main(args):

	# load sentences (English and Chinese words)
	train_en, train_cn = utils.load_data(args.train_file)
	dev_en, dev_cn = utils.load_data(args.dev_file)
	args.num_train = len(train_en)
	args.num_dev = len(dev_en)

	# build English and Chinese dictionary
	if os.path.isfile(args.vocab_file):
		en_dict, cn_dict, en_total_words, cn_total_words = pickle.load(open(args.vocab_file, "rb"))
	else:
		en_dict, en_total_words = utils.build_dict(train_en)
		cn_dict, cn_total_words = utils.build_dict(train_cn)
		pickle.dump([en_dict, cn_dict, en_total_words, cn_total_words], open(args.vocab_file, "wb"))

	args.en_total_words = en_total_words
	args.cn_total_words = cn_total_words
	# index to words dict
	inv_en_dict = {v: k for k, v in en_dict.items()}
	inv_cn_dict = {v: k for k, v in cn_dict.items()}

	# encode train and dev sentences into indices
	train_en, train_cn = utils.encode(train_en, train_cn, en_dict, cn_dict)
	# convert to numpy tensors
	train_data = utils.gen_examples(train_en, train_cn, args.batch_size)

	dev_en, dev_cn = utils.encode(dev_en, dev_cn, en_dict, cn_dict)
	dev_data = utils.gen_examples(dev_en, dev_cn, args.batch_size)

	# code.interact(local=locals())

	if os.path.isfile(args.model_file):
		model = torch.load(args.model_file)
	elif args.model == "EncoderDecoderModel":
		model = EncoderDecoderModel(args)

	if args.use_cuda:
		model = model.cuda()

	crit = utils.LanguageModelCriterion()

	print("start evaluating on dev...")
	correct_count, loss, num_words = eval(model, dev_data, args, crit)

	loss = loss / num_words
	acc = correct_count / num_words
	print("dev loss %s" % (loss) )
	print("dev accuracy %f" % (acc))
	print("dev total number of words %f" % (num_words))
	best_acc = acc

	learning_rate = args.learning_rate
	optimizer = getattr(optim, args.optimizer)(model.parameters(), lr=learning_rate)
	
	total_num_sentences = 0.
	total_time = 0.
	for epoch in range(args.num_epoches):
		np.random.shuffle(train_data)
		total_train_loss = 0.
		total_num_words = 0.
		for idx, (mb_x, mb_x_mask, mb_y, mb_y_mask) in tqdm(enumerate(train_data)):

			batch_size = mb_x.shape[0]
			total_num_sentences += batch_size
			# convert numpy ndarray to PyTorch tensors and variables
			mb_x = Variable(torch.from_numpy(mb_x)).long()
			mb_x_mask = Variable(torch.from_numpy(mb_x_mask)).long()
			hidden = model.init_hidden(batch_size)
			mb_input = Variable(torch.from_numpy(mb_y[:,:-1])).long()
			mb_out = Variable(torch.from_numpy(mb_y[:, 1:])).long()
			mb_out_mask = Variable(torch.from_numpy(mb_y_mask[:, 1:]))

			if args.use_cuda:
				mb_x = mb_x.cuda()
				mb_x_mask = mb_x_mask.cuda()
				mb_input = mb_input.cuda()
				mb_out = mb_out.cuda()
				mb_out_mask = mb_out_mask.cuda()
			
			mb_pred, hidden = model(mb_x, mb_x_mask, mb_input, hidden)

			loss = crit(mb_pred, mb_out, mb_out_mask)
			num_words = torch.sum(mb_out_mask).data[0]
			total_train_loss += loss.data[0] * num_words
			total_num_words += num_words
	
			optimizer.zero_grad()
			loss.backward()
			optimizer.step()
		print("training loss: %f" % (total_train_loss / total_num_words))

		# evaluate every eval_epoch
		if (epoch+1) % args.eval_epoch == 0:
			

			print("start evaluating on dev...")
	
			correct_count, loss, num_words = eval(model, dev_data, args, crit)

			loss = loss / num_words
			acc = correct_count / num_words
			print("dev loss %s" % (loss) )
			print("dev accuracy %f" % (acc))
			print("dev total number of words %f" % (num_words))

			# save model if we have the best accuracy
			if acc >= best_acc:
				torch.save(model, args.model_file)
				best_acc = acc

				print("model saved...")
			else:
				learning_rate *= 0.5
				optimizer = getattr(optim, args.optimizer)(model.parameters(), lr=learning_rate)

			print("best dev accuracy: %f" % best_acc)
			print("#" * 60)

	# load test data
	test_en, test_cn = utils.load_data(args.test_file)
	args.num_test = len(test_en)
	test_en, test_cn = utils.encode(test_en, test_cn, en_dict, cn_dict)
	test_data = utils.gen_examples(test_en, test_cn, args.batch_size)

	# evaluate on test
	correct_count, loss, num_words = eval(model, test_data, args, crit)
	loss = loss / num_words
	acc = correct_count / num_words
	print("test loss %s" % (loss) )
	print("test accuracy %f" % (acc))
	print("test total number of words %f" % (num_words))

	# evaluate on train
	correct_count, loss, num_words = eval(model, train_data, args, crit)
	loss = loss / num_words
	acc = correct_count / num_words
	print("train loss %s" % (loss) )
	print("train accuracy %f" % (acc))
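
# A sketch (assuming a recent PyTorch) of replacing the manual learning-rate halving in the
# example above with torch.optim.lr_scheduler.ReduceLROnPlateau; model and metric are stand-ins.
import torch
import torch.optim as optim

model = torch.nn.Linear(10, 10)                      # stand-in for EncoderDecoderModel
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.5, patience=0)   # halve the LR whenever dev accuracy stalls

for epoch in range(3):
    dev_acc = 0.5                                    # placeholder for the real eval() accuracy
    scheduler.step(dev_acc)                          # pass the metric being maximized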
Exemple #31
0
from __future__ import print_function
from textblob import TextBlob
import utils

if __name__ == "__main__":
    result = utils.ReadDirsToStem("../20news-18828/alt.atheism")

    with open(file="../data/tf.txt", mode="w", encoding="ISO-8859-1") as f:
        f.write(str(result))

    with open(file="../data/tf.txt", mode="r", encoding="ISO-8859-1") as f:
        result_new = eval(f.read())
        utils.build_dict(result_new, frequency=50)
        print(len(utils.load_dict(filePath="../data/dict_50.txt")))
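
# A hedged alternative (not in the original script): round-tripping the term-frequency
# dictionary through json instead of str()/eval(), so the file stays plain data and is never
# executed. File names and contents below are illustrative assumptions.
import json

result = {"faith": 12, "atheism": 40}                # hypothetical term-frequency dict
with open("../data/tf.json", mode="w", encoding="ISO-8859-1") as f:
    json.dump(result, f)
with open("../data/tf.json", mode="r", encoding="ISO-8859-1") as f:
    result_new = json.load(f)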
def main(args):
    logging.info('-' * 50)
    logging.info('Load data files..')
    question_belong = []
    if args.debug:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file,
                                         100,
                                         relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file,
                                       100,
                                       relabeling=args.relabeling,
                                       question_belong=question_belong)
    else:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file,
                                         relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file,
                                       args.max_dev,
                                       relabeling=args.relabeling,
                                       question_belong=question_belong)

    args.num_train = len(train_examples[0])
    args.num_dev = len(dev_examples[0])

    logging.info('-' * 50)
    logging.info('Build dictionary..')
    word_dict = utils.build_dict(
        train_examples[0] + train_examples[1] + train_examples[2],
        args.max_vocab_size)

    logging.info('-' * 50)
    embeddings = utils.gen_embeddings(word_dict, args.embedding_size,
                                      args.embedding_file)
    (args.vocab_size, args.embedding_size) = embeddings.shape
    logging.info('Compile functions..')
    train_fn, test_fn, params, all_params = build_fn(args, embeddings)
    logging.info('Done.')
    logging.info('-' * 50)
    logging.info(args)

    logging.info('-' * 50)
    logging.info('Initial test..')
    dev_x1, dev_x2, dev_x3, dev_y = utils.vectorize(
        dev_examples,
        word_dict,
        sort_by_len=not args.test_only,
        concat=args.concat)
    word_dict_r = {}
    word_dict_r[0] = "unk"
    assert len(dev_x1) == args.num_dev
    all_dev = gen_examples(dev_x1, dev_x2, dev_x3, dev_y, args.batch_size,
                           args.concat)
    dev_acc, pred = eval_acc(test_fn, all_dev)
    logging.info('Dev accuracy: %.2f %%' % dev_acc)
    best_acc = dev_acc
    if args.test_only:
        return
    utils.save_params(args.model_file, all_params, epoch=0, n_updates=0)

    # Training
    logging.info('-' * 50)
    logging.info('Start training..')
    train_x1, train_x2, train_x3, train_y = utils.vectorize(train_examples,
                                                            word_dict,
                                                            concat=args.concat)
    assert len(train_x1) == args.num_train
    start_time = time.time()
    n_updates = 0

    all_train = gen_examples(train_x1, train_x2, train_x3, train_y,
                             args.batch_size, args.concat)
    for epoch in range(args.num_epoches):
        np.random.shuffle(all_train)
        for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_x3, mb_mask3,
                  mb_y) in enumerate(all_train):

            train_loss = train_fn(mb_x1, mb_mask1, mb_x2, mb_mask2, mb_x3,
                                  mb_mask3, mb_y)
            if idx % 100 == 0:
                logging.info('#Examples = %d, max_len = %d' %
                             (len(mb_x1), mb_x1.shape[1]))
                logging.info(
                    'Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)'
                    % (epoch, idx, len(all_train), train_loss,
                       time.time() - start_time))
            n_updates += 1

            if n_updates % args.eval_iter == 0:
                samples = sorted(
                    np.random.choice(args.num_train,
                                     min(args.num_train, args.num_dev),
                                     replace=False))
                sample_train = gen_examples(
                    [train_x1[k]
                     for k in samples], [train_x2[k] for k in samples],
                    [train_x3[k * 4 + o] for k in samples
                     for o in range(4)], [train_y[k] for k in samples],
                    args.batch_size, args.concat)
                acc, pred = eval_acc(test_fn, sample_train)
                logging.info('Train accuracy: %.2f %%' % acc)
                dev_acc, pred = eval_acc(test_fn, all_dev)
                logging.info('Dev accuracy: %.2f %%' % dev_acc)
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    logging.info(
                        'Best dev accuracy: epoch = %d, n_updates = %d, acc = %.2f %%'
                        % (epoch, n_updates, dev_acc))
                    utils.save_params(args.model_file,
                                      all_params,
                                      epoch=epoch,
                                      n_updates=n_updates)
                    type=str)
parser.add_argument('--test_date_path',
                    default='./dateset/cnn/questions/test',
                    type=str)
parser.add_argument('--glove_path',
                    default='/nfs/users/guanxin/cache/.vector_cache',
                    type=str)
config = parser.parse_args()

documents, questions, answers, doc_len, qus_len = utils.load_data(
    config.train_date_path, config.train_num, True)
test_documents, test_questions, test_answers, test_doc_len, test_qus_len = utils.load_data(
    config.test_date_path, 3000, True)

# build word dict
word_dict = utils.build_dict(documents + questions)
embedding = Parameter(utils.embedding_word(word_dict, config.glove_path))

# build entity dict (numbers of categories)
entity_markers = list(
    set([w for w in word_dict.keys() if w.startswith('@entity')] + answers))
entity_markers = ['<unk_entity>'] + entity_markers
entity_dict = {w: index for (index, w) in enumerate(entity_markers)}

doc_maxlen = max(map(len, (d.split(' ') for d in documents)))
query_maxlen = max(map(len, (q.split(' ') for q in questions)))

# data preprocessing, convert to one-hot
train_x1, train_x2, train_l, train_y = utils.vectorize(documents, questions,
                                                       answers, word_dict,
                                                       entity_dict, doc_maxlen,
Exemple #34
0
def train(topology,
          train_data_dir=None,
          test_data_dir=None,
          word_dict_path=None,
          label_dict_path=None,
          model_save_dir="models",
          use_cuda=False,
          window_size=5,
          learning_rate=0.001,
          batch_size=64,
          num_passes=10):
    """
    train window_net model or sentence_net model

    :params train_data_dir: path of the training data; if this parameter
        is not specified, the Brown Corpus will be used to run this example
    :type train_data_dir: str
    :params test_data_dir: path of the testing data; if this parameter
        is not specified, the Brown Corpus will be used to run this example
    :type test_data_dir: str
    :params word_dict_path: path of the word dictionary; if this parameter
        is not specified, a default dictionary file will be used to run this example
    :type word_dict_path: str
    :params label_dict_path: path of the label dictionary; if this parameter
        is not specified, a default dictionary file will be used to run this example
    :type label_dict_path: str
    :params use_cuda: whether to use CUDA
    :type use_cuda: bool
    :params window_size: width of the context window
    :type window_size: int
    :params num_passes: number of training passes
    :type num_passes: int
    """

    if not os.path.exists(model_save_dir):
        os.mkdir(model_save_dir)

    use_default_data = (train_data_dir is None)

    if use_default_data:
        logger.info(("No training data are provided, "
                     "use Brown corpus to train the model."))

        logger.info("downloading Brown corpus...")
        train_data_dir, test_data_dir, word_dict_path, label_dict_path = load_default_data(
        )

        logger.info("please wait to build the word dictionary ...")

    if word_dict_path is None or not os.path.exists(word_dict_path):
        logger.info(("word dictionary is not given, the dictionary "
                     "is automatically built from the training data."))

        # build the word dictionary to map the original string-typed
        # words into integer-typed index
        build_dict(data_dir=train_data_dir,
                   save_path=word_dict_path,
                   use_col=0,
                   cutoff_fre=1,
                   insert_extra_words=["<UNK>"])
    logger.info("the word dictionary path is %s" % word_dict_path)

    if not os.path.exists(label_dict_path):
        logger.info(("label dictionary is not given, the dictionary "
                     "is automatically built from the training data."))
        # build the label dictionary to map the original string-typed
        # label into integer-typed index
        build_dict(data_dir=train_data_dir,
                   save_path=label_dict_path,
                   use_col=1,
                   cutoff_fre=10,
                   insert_extra_words=["<UNK>"])
    logger.info("the label dictionary path is %s" % label_dict_path)

    # get index info
    word_dict = load_dict(word_dict_path)
    lbl_dict = load_dict(label_dict_path)
    class_num = len(lbl_dict)
    logger.info("class number is : %d." % (len(lbl_dict)))

    # get train data reader
    train_reader = paddle.batch(paddle.reader.shuffle(reader.train_reader(
        train_data_dir, word_dict, lbl_dict, window_size),
                                                      buf_size=51200),
                                batch_size=batch_size)

    # get test data reader
    if test_data_dir is not None:
        # here, because training and testing data share a same format,
        # we still use the reader.train_reader to read the testing data.
        test_reader = paddle.batch(reader.train_reader(test_data_dir,
                                                       word_dict, lbl_dict,
                                                       window_size),
                                   batch_size=batch_size)
    else:
        test_reader = None

    # get size of word dictionary
    dict_dim = len(word_dict) + 1
    logger.info("length of word dictionary is : %d." % (dict_dim))

    # define the input layers
    data = fluid.layers.data(name="words",
                             shape=[1],
                             dtype="int64",
                             lod_level=1)

    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    # return the network result
    cost, acc, prediction = topology(data,
                                     label,
                                     dict_dim,
                                     class_num=class_num)

    # create optimizer
    sgd_optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
    sgd_optimizer.minimize(cost)

    # create trainer
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)

    # initialize training network
    exe.run(fluid.default_startup_program())
    prog = fluid.default_main_program()

    # begin training network
    for pass_id in range(num_passes):

        ## running the train data
        data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
        for i, data_ in enumerate(train_reader()):
            avg_cost_np, avg_acc_np = exe.run(prog,
                                              feed=feeder.feed(data_),
                                              fetch_list=[cost, acc])
            data_size = len(data_)
            total_acc += data_size * avg_acc_np
            total_cost += data_size * avg_cost_np
            data_count += data_size

            if (i + 1) % 1000 == 0:
                logger.info(
                    "pass_id: %d, batch %d, avg_acc: %f, avg_cost: %f" %
                    (pass_id, i + 1, total_acc / data_count,
                     total_cost / data_count))

        avg_cost = total_cost / data_count
        avg_acc = total_acc / data_count
        logger.info("Train result -- pass_id: %d,  avg_acc: %f, avg_cost: %f" %
                    (pass_id, avg_acc, avg_cost))

        ## running the test data
        if test_reader is not None:
            data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
            for i, data in enumerate(test_reader()):
                avg_cost_np, avg_acc_np, prediction_np = exe.run(
                    prog,
                    feed=feeder.feed(data),
                    fetch_list=[cost, acc, prediction])
                data_size = len(data)
                total_acc += data_size * avg_acc_np
                total_cost += data_size * avg_cost_np
                data_count += data_size

            avg_cost = total_cost / data_count
            avg_acc = total_acc / data_count
            logger.info(
                "Test result -- pass_id: %d,  avg_acc: %f, avg_cost: %f" %
                (pass_id, avg_acc, avg_cost))

        ## save inference model
        epoch_model = model_save_dir + "/" + args.nn_type + "_epoch" + str(
            pass_id % 5)
        logger.info("Saving inference model at %s" % (epoch_model))

        ## `prediction` is the value returned by the topology;
        ## it is used as the inference output of the saved model
        fluid.io.save_inference_model(epoch_model, ["words"], prediction, exe)

    logger.info("Training has finished.")
import tensorflow as tf
import pickle
from model import Model
from utils import build_dict, build_dataset, batch_iter, build_deploy
import numpy as np
import time

t2 = time.time()

with open("args.pickle", "rb") as f:
    args = pickle.load(f)

str_from = 'five-time world champion michelle kwan withdrew from the #### us figure skating championships on wednesday , but will petition us skating officials for the chance to compete at the #### turin olympics .'

print("Loading dictionary...")
word_dict, reversed_dict, article_max_len, summary_max_len = build_dict(
    "valid", args.toy)
valid_x, valid_y = build_deploy(str_from, word_dict, article_max_len,
                                summary_max_len)
valid_x_len = list(map(lambda x: len([y for y in x if y != 0]), valid_x))

sess = tf.InteractiveSession()
print("Loading saved model...")
t1 = time.time()
model = Model(reversed_dict,
              article_max_len,
              summary_max_len,
              args,
              forward_only=True)
saver = tf.train.Saver(tf.global_variables())
ckpt = tf.train.get_checkpoint_state("./saved_model/")
saver.restore(sess, ckpt.model_checkpoint_path)
Exemple #36
0
print('Processing train files')
train_sentences, train_labels = utils.get_sen_and_labels(train_files)
print('Processing val files')
val_sentences, val_labels = utils.get_sen_and_labels(val_files)

train_size = len(train_sentences)
val_size = len(val_sentences)
print_after = train_size // (FLAGS.num_gpus * FLAGS.batch_size)
val_steps = val_size // (FLAGS.num_gpus * FLAGS.batch_size)
max_steps = FLAGS.num_epochs * print_after

sentences = train_sentences + val_sentences
labels = train_labels + val_labels

word2idx, idx2word = utils.build_dict(sentences, True)
label2idx, idx2label = utils.build_dict(labels, False)
vocabulary_size = len(word2idx)

train_gen = utils.batches_generator(train_size, train_sentences, train_labels,
                                    word2idx, label2idx)
val_gen = utils.batches_generator(val_size, val_sentences, val_labels,
                                  word2idx, label2idx)

X_train, y_train = next(train_gen)
X_val, y_val = next(val_gen)

#print(X_train, y_train)

assert X_train.shape[0] == y_train.shape[0], 'train vectors shape mismatch'
assert X_val.shape[0] == y_val.shape[0], 'val vectors shape mismatch'
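
# These snippets all rely on a project-specific utils.build_dict. A minimal sketch of such a
# helper, consistent with the (word2idx, idx2word) call sites above, is shown here; the second
# argument and whitespace tokenization are assumptions, not the real implementation.
from collections import Counter

def build_dict(sequences, add_unk=True):
    counts = Counter(tok for seq in sequences for tok in seq.split())
    vocab = ['<pad>'] + (['<unk>'] if add_unk else []) + [w for w, _ in counts.most_common()]
    tok2idx = {w: i for i, w in enumerate(vocab)}
    idx2tok = {i: w for w, i in tok2idx.items()}
    return tok2idx, idx2tok

word2idx, idx2word = build_dict(["the cat sat", "the dog ran"])
assert idx2word[word2idx['the']] == 'the'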