def extract_keywords(sentences, k=5):
    filtered_sentences = filter_sentences(sentences, lowercase=False, stem=False)
    word_to_ix, ix_to_word = build_vocabulary(filtered_sentences)
    S = build_coo_matrix(filtered_sentences, word_to_ix)
    ranks = pagerank(S)
    return get_topk_keywords(ranks, ix_to_word, k)
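# The helpers used above (build_coo_matrix, pagerank, get_topk_keywords) are not shown.
# Below is a minimal, hypothetical sketch of a power-iteration pagerank over the word
# co-occurrence matrix S; it treats S as a dense NumPy array for simplicity, while the
# original likely builds a scipy sparse COO matrix, and the damping/iteration settings
# are assumptions rather than the project's actual values.
import numpy as np

def pagerank(S, damping=0.85, n_iter=100, tol=1e-6):
    S = np.asarray(S, dtype=float)
    n = S.shape[0]
    col_sums = S.sum(axis=0)
    col_sums[col_sums == 0] = 1.0      # avoid dividing by zero for isolated words
    M = S / col_sums                   # column-stochastic transition matrix
    ranks = np.full(n, 1.0 / n)
    for _ in range(n_iter):
        new_ranks = (1 - damping) / n + damping * M.dot(ranks)
        if np.abs(new_ranks - ranks).sum() < tol:
            return new_ranks
        ranks = new_ranks
    return ranks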
def main(rnn_type="rnn"): from data import loop_data, build_vocabulary, batchify np.random.seed(11) batch_size = 32 n_steps = 20 lr = 0.01 lr_decay = 0.5 train_text, valid_text = loop_data() vocab, rev_vocab = build_vocabulary(train_text) vocab_size = len(vocab) print "vocab size:", vocab_size model = RNNModel(vocab_size, n_steps=n_steps, rnn_type=rnn_type) # TODO: sample decoded sentence with tf.Session() as sess: tf.initialize_all_variables().run() prev_epoch_cost = 9999999 # arbitarily large number for epoch in range(5): print "epoch", epoch print "learning rate", lr list_of_costs = [] model.assign_lr(sess, lr) for idx, (x, y) in tqdm( enumerate(batchify(train_text, vocab, batch_size, n_steps))): list_of_costs.append(model.step(sess, x, y, is_train=True)) if idx % 100 == 0: print "cost", 2**np.mean(list_of_costs) list_of_costs = [] epoch_cost = np.mean(list_of_costs) print "train cost", 2**epoch_cost list_of_costs = [] for idx, (x, y) in tqdm( enumerate(batchify(valid_text, vocab, batch_size, n_steps))): list_of_costs.append(model.step(sess, x, y, is_train=False)) epoch_cost = np.mean(list_of_costs) print "valid cost", 2**epoch_cost if epoch_cost > prev_epoch_cost: lr *= lr_decay prev_epoch_cost = epoch_cost
def main(rnn_type="rnn"): from data import loop_data, build_vocabulary, batchify np.random.seed(11) batch_size = 32 n_steps = 20 lr = 0.01 lr_decay = 0.5 train_text, valid_text = loop_data() vocab, rev_vocab = build_vocabulary(train_text) vocab_size = len(vocab) print "vocab size:", vocab_size model = RNNModel(vocab_size, n_steps=n_steps, rnn_type=rnn_type) # TODO: sample decoded sentence with tf.Session() as sess: tf.initialize_all_variables().run() prev_epoch_cost = 9999999 # arbitarily large number for epoch in range(5): print "epoch", epoch print "learning rate", lr list_of_costs = [] model.assign_lr(sess, lr) for idx, (x, y) in tqdm(enumerate(batchify(train_text, vocab, batch_size, n_steps))): list_of_costs.append(model.step(sess, x, y, is_train=True)) if idx % 100 == 0: print "cost", 2 ** np.mean(list_of_costs) list_of_costs = [] epoch_cost = np.mean(list_of_costs) print "train cost", 2 ** epoch_cost list_of_costs = [] for idx, (x, y) in tqdm(enumerate(batchify(valid_text, vocab, batch_size, n_steps))): list_of_costs.append(model.step(sess, x, y, is_train=False)) epoch_cost = np.mean(list_of_costs) print "valid cost", 2 ** epoch_cost if epoch_cost > prev_epoch_cost: lr *= lr_decay prev_epoch_cost = epoch_cost
print(f"Ignoring the model arguments and loading the " f"model from serialization_dir: {args.load_serialization_dir}") # Load Vocab vocab_path = os.path.join(args.load_serialization_dir, "vocab.txt") vocab_token_to_id, vocab_id_to_token = load_vocabulary(vocab_path) # Load Model classifier = load_pretrained_model(args.load_serialization_dir) else: # Build Vocabulary with open(GLOVE_COMMON_WORDS_PATH, encoding='utf8') as file: glove_common_words = [ line.strip() for line in file.readlines() if line.strip() ] vocab_token_to_id, vocab_id_to_token = build_vocabulary( train_instances, VOCAB_SIZE, glove_common_words) # Build Config and Model if args.model_name == "main": config = { "seq2vec_choice": args.seq2vec_choice, "vocab_size": min(VOCAB_SIZE, len(vocab_token_to_id)), "embedding_dim": args.embedding_dim, "num_layers": args.num_layers } classifier = MainClassifier(**config) config["type"] = "main" else: config = { "pretrained_model_path": args.base_model_dir, "layer_num": args.layer_num,
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch RNNs for Poetry Generation')
    # data arguments
    parser.add_argument('--datadir', default='data',
                        help='path to dataset', type=str)
    parser.add_argument('--rawdir', default=None,
                        help='path to raw dataset', type=str)
    parser.add_argument('--logdir', default='log',
                        help='path to log', type=str)
    parser.add_argument('--tag', default='tang',
                        help='poetry type for the project', type=str)
    parser.add_argument('--wordnum', default=5,
                        help='number of words in each poetry sentence', type=int)
    parser.add_argument('--sentnum', default=4,
                        help='number of poetry sentences', type=int)
    parser.add_argument('--max-len', default=20,
                        help='maximum length of poetry titles', type=int)
    parser.add_argument('--embedding-dim', default=300,
                        help='dimension of the embedding', type=int)
    parser.add_argument('--hidden-dim', default=150,
                        help='dimension of the hidden state', type=int)
    parser.add_argument('--num_layers', default=2,
                        help='number of RNN layers', type=int)
    parser.add_argument('--batch-size', default=30,
                        help='batch size of the dataset', type=int)
    parser.add_argument('--data-workers', type=int, default=5,
                        help='number of subprocesses for data loading')
    parser.add_argument('--epoches', default=50,
                        help='number of training epochs', type=int)
    parser.add_argument('--bidirectional', action='store_true',
                        help='whether to use bidirectional RNNs')
    parser.add_argument('--lr', default=0.001, type=float, metavar='LR',
                        help='initial learning rate')
    parser.add_argument('--seed', default=123, type=int,
                        help='random seed (default: 123)')
    cuda_parser = parser.add_mutually_exclusive_group(required=False)
    cuda_parser.add_argument('--cuda', dest='cuda', action='store_true')
    cuda_parser.add_argument('--no-cuda', dest='cuda', action='store_false')
    parser.set_defaults(cuda=True)
    args = parser.parse_args()

    # logging setup
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    model_name = time.strftime("%Y%m%d%H%M", time.localtime(time.time()))
    log_dir = os.path.join(os.getcwd(), args.logdir)
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)
    log_file = os.path.join(log_dir, model_name + ".log")
    fh = logging.FileHandler(log_file, mode="w")
    fh.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        "%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    logger.info(args)

    args.cuda = args.cuda and torch.cuda.is_available()
    device = torch.device("cuda:0" if args.cuda else "cpu")
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True

    assert args.rawdir is not None

    # preparing dataset
    poetry_path = os.path.join(
        args.datadir,
        "poet.%s._%d_%d.json" % (args.tag, args.sentnum, args.wordnum))
    if os.path.exists(poetry_path):
        logger.info("The poetry dataset has already been built at: %s" % poetry_path)
    else:
        logger.info("Preparing poetry...")
        processPoetry(args.rawdir, args.datadir,
                      sentNum=args.sentnum,
                      wordsNum=args.wordnum,
                      max_title_len=args.max_len,
                      tag=args.tag)
        logger.info("Poetry processed!")

    # preparing vocabulary
    vocab_path = os.path.join(args.datadir, "vocab.txt")
    if os.path.exists(vocab_path):
        logger.info("The vocabulary has already been built at: %s" % vocab_path)
    else:
        logger.info("Building vocabulary...")
        build_vocabulary(args.rawdir, args.datadir)
        logger.info("The vocabulary has been built.")

    VocabDataSet = Vocabulary(vocab_path)
    PoetryDataSet = Poetry(VocabDataSet, args.max_len, poetry_path)

    # preparing model
    model = LSTMPoetry(vocab_size=len(VocabDataSet),
                       embedding_dim=args.embedding_dim,
                       hidden_dim=args.hidden_dim,
                       sents_len=args.sentnum,
                       num_layers=args.num_layers,
                       name=model_name)
    criterion = torch.nn.CrossEntropyLoss()
    model.to(device)
    criterion.to(device)
    optimizer = torch.optim.Adam(model.parameters())

    # training process
    logger.info("Begin training model!")
    train(model, PoetryDataSet, criterion, optimizer, args, device)
    logger.info("End training model!")
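# Vocabulary, Poetry, LSTMPoetry, processPoetry and train come from other modules of this
# project. Purely as an illustration, a minimal sketch of what the Vocabulary wrapper might
# look like, assuming vocab.txt stores one token per line (the real class may also handle
# special tokens and padding):
class Vocabulary:
    def __init__(self, vocab_path):
        with open(vocab_path, encoding="utf-8") as f:
            tokens = [line.strip() for line in f if line.strip()]
        self.token_to_id = {tok: i for i, tok in enumerate(tokens)}
        self.id_to_token = {i: tok for tok, i in self.token_to_id.items()}

    def __len__(self):
        # len(VocabDataSet) is used above as the model's vocab_size
        return len(self.token_to_id)

    def __getitem__(self, token):
        return self.token_to_id[token]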
test_sourse = get_DataSet_on_numpy(subset="test")

# 1. determine how many documents each split contains
# 2. compare the number of training and test labels
#    to make sure the data was parsed correctly
EDA.print_count_texts_of_DS(train_sourse, test_sourse)

# tokenize the texts: letters and digits, tokens longer than 4 characters
# we get two matrices: rows (texts) x columns (features)
train_tokinized = tokenize_corpus(train_sourse['data'])
test_tokinized = tokenize_corpus(test_sourse['data'])

# print an example of one document
EDA.print_texr_example(train_tokinized[0])

# build the vocabulary: words -> numbers (enumerate the tokens)
MAX_DF = 0.8
MIN_COUNT = 5
UNIQUE_LABELS_N = len(set(train_sourse['target']))
vocabulary, word_doc_freq = build_vocabulary(
    train_tokinized, max_doc_freq=MAX_DF, min_count=MIN_COUNT)

# print the number of unique tokens and labels
EDA.print_unique_tokin(vocabulary)

# show the distribution of relative word frequencies
EDA.show_hist_word_frequency_dist(word_doc_freq)

# compare the label distributions in the training and test sets
EDA.show_hist_target_dist(train_sourse, test_sourse)

# estimate how often vocabulary words occur across the documents ("spamminess" of the text)
EDA.spamming_of_text(train_sourse, len(vocabulary))
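# build_vocabulary here filters tokens by document frequency. A hypothetical sketch of
# such a builder is shown below; the exact ordering, smoothing and return types of the
# real function may differ.
from collections import Counter
import numpy as np

def build_vocabulary(tokenized_texts, max_doc_freq=0.8, min_count=5):
    """Return a token->index map and the relative document frequency of each kept token."""
    doc_counts = Counter()
    for tokens in tokenized_texts:
        doc_counts.update(set(tokens))          # count each token once per document
    n_docs = len(tokenized_texts)
    kept = {tok: cnt for tok, cnt in doc_counts.items()
            if cnt >= min_count and cnt / n_docs <= max_doc_freq}
    sorted_tokens = sorted(kept, key=kept.get, reverse=True)   # frequent tokens first
    word_to_id = {tok: i for i, tok in enumerate(sorted_tokens)}
    word_doc_freq = np.array([kept[tok] / n_docs for tok in sorted_tokens])
    return word_to_id, word_doc_freq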