Example #1
def extract_keywords(sentences, k=5):
    # Clean and tokenize the sentences without lowercasing or stemming.
    filtered_sentences = filter_sentences(sentences, lowercase=False, stem=False)

    # Map every distinct token to an integer id (and back).
    word_to_ix, ix_to_word = build_vocabulary(filtered_sentences)

    # Build the sparse (COO) word co-occurrence matrix.
    S = build_coo_matrix(filtered_sentences, word_to_ix)

    # Score words with PageRank over the co-occurrence graph.
    ranks = pagerank(S)

    # Return the k highest-ranked words.
    return get_topk_keywords(ranks, ix_to_word, k)
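The helpers used above (filter_sentences, build_vocabulary, build_coo_matrix, pagerank, get_topk_keywords) come from the surrounding project and are not shown. A minimal sketch of a build_vocabulary compatible with this call, assuming filtered_sentences is a list of token lists (an illustration, not the original code):

def build_vocabulary(sentences):
    # Assign each distinct token an integer id in order of first appearance,
    # and return both directions of the mapping.
    word_to_ix = {}
    for sentence in sentences:
        for word in sentence:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
    ix_to_word = {ix: word for word, ix in word_to_ix.items()}
    return word_to_ix, ix_to_word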
Example #2
def main(rnn_type="rnn"):
    import numpy as np
    import tensorflow as tf
    from tqdm import tqdm

    from data import loop_data, build_vocabulary, batchify
    # RNNModel is assumed to be defined elsewhere in the project.

    np.random.seed(11)

    batch_size = 32
    n_steps = 20
    lr = 0.01
    lr_decay = 0.5

    train_text, valid_text = loop_data()

    vocab, rev_vocab = build_vocabulary(train_text)
    vocab_size = len(vocab)
    print("vocab size:", vocab_size)

    model = RNNModel(vocab_size, n_steps=n_steps, rnn_type=rnn_type)

    # TODO: sample decoded sentence
    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        prev_epoch_cost = float("inf")  # arbitrarily large starting value
        for epoch in range(5):
            print("epoch", epoch)
            print("learning rate", lr)

            epoch_costs = []
            window_costs = []
            model.assign_lr(sess, lr)

            for idx, (x, y) in tqdm(
                    enumerate(batchify(train_text, vocab, batch_size,
                                       n_steps))):
                cost = model.step(sess, x, y, is_train=True)
                epoch_costs.append(cost)
                window_costs.append(cost)
                if idx % 100 == 0:
                    # 2**cost is the perplexity, assuming cost is a base-2 cross-entropy.
                    print("cost", 2 ** np.mean(window_costs))
                    window_costs = []

            print("train cost", 2 ** np.mean(epoch_costs))

            valid_costs = []
            for idx, (x, y) in tqdm(
                    enumerate(batchify(valid_text, vocab, batch_size,
                                       n_steps))):
                valid_costs.append(model.step(sess, x, y, is_train=False))

            epoch_cost = np.mean(valid_costs)
            print("valid cost", 2 ** epoch_cost)

            # Decay the learning rate when the validation cost stops improving.
            if epoch_cost > prev_epoch_cost:
                lr *= lr_decay
            prev_epoch_cost = epoch_cost
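The data module imported at the top of main is not shown. The training loop only relies on batchify yielding (x, y) pairs of integer arrays of shape (batch_size, n_steps), with y one token ahead of x. A minimal sketch under those assumptions (the real implementation may differ):

import numpy as np

def batchify(text, vocab, batch_size, n_steps):
    # Assumed contract: `text` is a whitespace-separated token string and
    # `vocab` maps token -> id; both are assumptions, not the original code.
    ids = np.array([vocab[w] for w in text.split() if w in vocab])
    span = batch_size * n_steps
    n_batches = (len(ids) - 1) // span
    inputs = ids[:n_batches * span].reshape(batch_size, -1)
    targets = ids[1:n_batches * span + 1].reshape(batch_size, -1)
    # Walk the long rows n_steps columns at a time, so consecutive batches
    # continue the same sequences (truncated-BPTT style).
    for i in range(0, inputs.shape[1], n_steps):
        yield inputs[:, i:i + n_steps], targets[:, i:i + n_steps]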
Example #3
        print(f"Ignoring the model arguments and loading the "
              f"model from serialization_dir: {args.load_serialization_dir}")

        # Load Vocab
        vocab_path = os.path.join(args.load_serialization_dir, "vocab.txt")
        vocab_token_to_id, vocab_id_to_token = load_vocabulary(vocab_path)

        # Load Model
        classifier = load_pretrained_model(args.load_serialization_dir)
    else:
        # Build Vocabulary
        with open(GLOVE_COMMON_WORDS_PATH, encoding='utf8') as file:
            glove_common_words = [
                line.strip() for line in file.readlines() if line.strip()
            ]
        vocab_token_to_id, vocab_id_to_token = build_vocabulary(
            train_instances, VOCAB_SIZE, glove_common_words)

        # Build Config and Model
        if args.model_name == "main":
            config = {
                "seq2vec_choice": args.seq2vec_choice,
                "vocab_size": min(VOCAB_SIZE, len(vocab_token_to_id)),
                "embedding_dim": args.embedding_dim,
                "num_layers": args.num_layers
            }
            classifier = MainClassifier(**config)
            config["type"] = "main"
        else:
            config = {
                "pretrained_model_path": args.base_model_dir,
                "layer_num": args.layer_num,
Example #4
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch RNNs for Poetry Generation')
    # data arguments
    parser.add_argument('--datadir',
                        default='data',
                        help='path to dataset',
                        type=str)
    parser.add_argument('--rawdir',
                        default=None,
                        help='path to raw dataset',
                        type=str)
    parser.add_argument('--logdir',
                        default='log',
                        help='path to log',
                        type=str)
    parser.add_argument('--tag',
                        default='tang',
                        help='poetry type for the project.',
                        type=str)
    parser.add_argument('--wordnum',
                        default=5,
                        help='The number of words in each poetry sentence.',
                        type=int)
    parser.add_argument('--sentnum',
                        default=4,
                        help='The number of sentences in each poem.',
                        type=int)
    parser.add_argument('--max-len',
                        default=20,
                        help='The maximum length of poetry titles.',
                        type=int)
    parser.add_argument('--embedding-dim',
                        default=300,
                        help='The dimension of the word embeddings.',
                        type=int)
    parser.add_argument('--hidden-dim',
                        default=150,
                        help='The dimension of the hidden state.',
                        type=int)
    parser.add_argument('--num_layers',
                        default=2,
                        help='The number of RNN layers.',
                        type=int)
    parser.add_argument('--batch-size',
                        default=30,
                        help='The training batch size.',
                        type=int)
    parser.add_argument('--data-workers',
                        type=int,
                        default=5,
                        help='Number of subprocesses for data loading')
    parser.add_argument('--epoches',
                        default=50,
                        help='The number of training epochs.',
                        type=int)
    parser.add_argument('--bidirectional',
                        action='store_true',
                        help='Whether to use bidirectional RNNs')
    parser.add_argument('--lr',
                        default=0.001,
                        type=float,
                        metavar='LR',
                        help='initial learning rate')
    parser.add_argument('--seed',
                        default=123,
                        type=int,
                        help='random seed (default: 123)')
    cuda_parser = parser.add_mutually_exclusive_group(required=False)
    cuda_parser.add_argument('--cuda', dest='cuda', action='store_true')
    cuda_parser.add_argument('--no-cuda', dest='cuda', action='store_false')
    parser.set_defaults(cuda=True)
    args = parser.parse_args()
    # prepare logging
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    model_name = time.strftime("%Y%m%d%H%M", time.localtime(time.time()))
    log_dir = os.path.join(
        os.getcwd(),
        args.logdir,
    )
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)
    log_file = os.path.join(log_dir, model_name + ".log")
    fh = logging.FileHandler(log_file, mode="w")
    fh.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        "%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s"
    )
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    logger.info(args)
    args.cuda = args.cuda and torch.cuda.is_available()
    device = torch.device("cuda:0" if args.cuda else "cpu")
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True
    assert args.rawdir is not None, "--rawdir must point to the raw dataset"
    # preparing dataset
    poetry_path = os.path.join(
        args.datadir,
        "poet.%s._%d_%d.json" % (args.tag, args.sentnum, args.wordnum))
    if os.path.exists(poetry_path):
        logger.info("The poetry dataset has been built in path: %s" %
                    poetry_path)
    else:
        logger.info("Preparing poetry...")
        processPoetry(args.rawdir,
                      args.datadir,
                      sentNum=args.sentnum,
                      wordsNum=args.wordnum,
                      max_title_len=args.max_len,
                      tag=args.tag)
        logger.info("Poetry processed!")
    # preparing vocabulary
    vocab_path = os.path.join(args.datadir, "vocab.txt")
    if os.path.exists(vocab_path):
        logger.info("The vocabulary has been built in path: %s" % vocab_path)
    else:
        logger.info("Building vocabulary...")
        build_vocabulary(args.rawdir, args.datadir)
        logger.info("The vocabulary has been built.")
    VocabDataSet = Vocabulary(vocab_path)

    PoetryDataSet = Poetry(VocabDataSet, args.max_len, poetry_path)

    # preparing model

    model = LSTMPoetry(vocab_size=len(VocabDataSet),
                       embedding_dim=args.embedding_dim,
                       hidden_dim=args.hidden_dim,
                       sents_len=args.sentnum,
                       num_layers=args.num_layers,
                       name=model_name)
    criterion = torch.nn.CrossEntropyLoss()
    model.to(device)
    criterion.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    # training process
    logger.info("Begin training model!")
    train(model, PoetryDataSet, criterion, optimizer, args, device)
    logger.info("End training model!")
Example #5
test_sourse = get_DataSet_on_numpy(subset="test")

#    1. determine how many datasets we have,
#    2. compare the numbers of training and test labels
#       to make sure the data was parsed correctly

EDA.print_count_texts_of_DS(train_sourse, test_sourse)


#   split the texts into tokens: letters, and numbers longer than 4 characters;
#   we get two matrices: rows (texts) x columns (features)
train_tokinized = tokenize_corpus(train_sourse['data'])
test_tokinized = tokenize_corpus(test_sourse['data'])

#   print an example from one dataset
EDA.print_texr_example(train_tokinized[0])

# build the vocabulary: words -> numbers (index the tokens)
MAX_DF = 0.8
MIN_COUNT = 5
UNIQUE_LABELS_N = len(set(train_sourse['target']))
vocabulary, word_doc_freq = build_vocabulary(train_tokinized, max_doc_freq=MAX_DF, min_count=MIN_COUNT)

#   print the number of unique tokens and labels
EDA.print_unique_tokin(vocabulary)
#   plot the distribution of relative word frequencies
EDA.show_hist_word_frequency_dist(word_doc_freq)
#   compare the label distributions in the training and test sets
EDA.show_hist_target_dist(train_sourse, test_sourse)
#   estimate how often words occur across the sets ("spamminess" of the text)
EDA.spamming_of_text(train_sourse, len(vocabulary))
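Here build_vocabulary takes document-frequency cutoffs and returns the word -> index mapping together with per-word document frequencies. A minimal sketch consistent with that signature, assuming min_count is a minimum document count (the original implementation is not shown):

from collections import Counter

import numpy as np

def build_vocabulary(tokenized_texts, max_doc_freq=1.0, min_count=1):
    # Count each token once per document.
    doc_counts = Counter()
    for tokens in tokenized_texts:
        doc_counts.update(set(tokens))
    n_docs = len(tokenized_texts)
    # Drop rare tokens and tokens that occur in too large a fraction of documents.
    kept = [(word, count) for word, count in doc_counts.items()
            if count >= min_count and count / n_docs <= max_doc_freq]
    kept.sort(key=lambda item: -item[1])  # most frequent words get the smallest ids
    word2id = {word: i for i, (word, _) in enumerate(kept)}
    word_doc_freq = np.array([count / n_docs for _, count in kept])
    return word2id, word_doc_freq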