Example #1
    def setup(self, stage: Optional[str] = None):
        self.src_vocab = Vocab()
        self.src_vocab.load(str(self.data_dir / "src_vocab.json"))
        self.src_vocab_size = len(self.src_vocab)

        self.trg_vocab = Vocab()
        self.trg_vocab.load(str(self.data_dir / "trg_vocab.json"))
        self.trg_vocab_size = len(self.trg_vocab)

        self.train_dataset = AncientPairDataset(
            str(self.data_dir / "train.tsv"),
            128,
            self.src_vocab,
            self.trg_vocab,
        )
        self.valid_dataset = AncientPairDataset(
            str(self.data_dir / "valid.tsv"),
            128,
            self.src_vocab,
            self.trg_vocab,
        )
        self.test_dataset = AncientPairDataset(
            str(self.data_dir / "test.tsv"),
            128,
            self.src_vocab,
            self.trg_vocab,
        )

        logger.info(
            f"数据集信息:\n\t"
            f"训练集: {len(self.train_dataset)}, "
            f"验证集: {len(self.valid_dataset)}, "
            f"测试集: {len(self.test_dataset)}", )
Example #2
    def __init__(self,
                 train_domain_A_path,
                 test_domain_A_path,
                 train_domain_B_path,
                 test_domain_B_path,
                 name='ShakespeareModern',
                 mode='train'):
        self.train_domain_A_path = train_domain_A_path
        self.test_domain_A_path = test_domain_A_path

        self.train_domain_B_path = train_domain_B_path
        self.test_domain_B_path = test_domain_B_path

        self.vocab = Vocab(name)
        self.mode = mode

        self.domain_A_max_len = 0
        self.domain_B_max_len = 0

        self.train_domain_A_data = self.load_and_preprocess_data(
            self.train_domain_A_path, domain='A')
        self.test_domain_A_data = self.load_and_preprocess_data(
            self.test_domain_A_path, domain='A')

        self.train_domain_B_data = self.load_and_preprocess_data(
            self.train_domain_B_path, domain='B')
        self.test_domain_B_data = self.load_and_preprocess_data(
            self.test_domain_B_path, domain='B')
Example #3
def build_vocab(self, vocab_size, min_freq, specials):
    counter = Counter()
    for t in self.dataset:
        tokens = self.tokenize(t)
        counter.update(tokens)
    vocab = Vocab.from_counter(counter=counter,
                               vocab_size=vocab_size,
                               min_freq=min_freq,
                               specials=specials)
    return vocab
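
A minimal usage sketch of the method above, assuming it is the TextDataset.build_vocab invoked in Example #10 below; the path, the length limit, and the special-token constants are illustrative assumptions, not part of the original snippet.

# Hypothetical usage sketch; load_dataset/TextDataset and the constants mirror Example #10.
src_txt, tgt_txt = load_dataset("data/train.tsv")        # illustrative path
src_train = TextDataset(src_txt, 400)                    # 400 = assumed max source length
vocab = src_train.build_vocab(
    vocab_size=50000,                                    # cap the vocabulary at 50k entries
    min_freq=2,                                          # drop tokens seen fewer than twice
    specials=[PAD_TOKEN, UNK_TOKEN, START_DECODING, STOP_DECODING],
)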
Example #4
def load_word_vectors(embeddings_path):
    if os.path.isfile(embeddings_path + '.pth') and \
            os.path.isfile(embeddings_path + '.vocab'):
        print('==> File found, loading to memory')
        vectors = torch.load(embeddings_path + '.pth')
        vocab = Vocab(filename=embeddings_path + '.vocab')
        return vocab, vectors
    if os.path.isfile(embeddings_path + '.model'):
        model = KeyedVectors.load(embeddings_path + ".model")
    if os.path.isfile(embeddings_path + '.vec'):
        model = FastText.load_word2vec_format(embeddings_path + '.vec')
    list_of_tokens = model.vocab.keys()
    vectors = torch.zeros(len(list_of_tokens), model.vector_size)
    with open(embeddings_path + '.vocab', 'w', encoding='utf-8') as f:
        for token in list_of_tokens:
            f.write(token+'\n')
    vocab = Vocab(filename=embeddings_path + '.vocab')
    for index, word in enumerate(list_of_tokens):
        vectors[index, :] = torch.from_numpy(model[word])
    return vocab, vectors
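
A brief sketch of how the loader above might be called; the path prefix is an assumption (any prefix with a matching .model or .vec file next to it), and the second call only illustrates that the cached .pth/.vocab pair is reused.

# Hypothetical usage sketch: "embeddings/wiki.zh" is an assumed prefix, not from the snippet.
vocab, vectors = load_word_vectors('embeddings/wiki.zh')   # builds and caches on first call
print(vectors.size())                                      # one row per token, model.vector_size wide
vocab, vectors = load_word_vectors('embeddings/wiki.zh')   # now hits the .pth/.vocab cache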
Example #5
def create_full_dataset(args):
    train_dir = 'training-treebank'
    vocab_file = 'tmp/vocab.txt'
    build_vocab(
        [
            'training-treebank/rev_sentence.txt',
            'training-treebank/sklad_sentence.txt',
            'test/polevaltest_sentence.txt',
            args.emb_dir + args.emb_file + '.vec'  #full vocabulary in model
        ],
        'tmp/vocab.txt')
    vocab = Vocab(filename=vocab_file)
    full_dataset = SSTDataset(train_dir, vocab, args.num_classes)
    return vocab, full_dataset
Example #6
def get_vocab(args):
    vocab = Vocab()
    if args.model in ["bert", "mmbt", "concatbert"]:
        bert_tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                                       do_lower_case=True)
        vocab.stoi = bert_tokenizer.vocab
        vocab.itos = bert_tokenizer.ids_to_tokens
        vocab.vocab_sz = len(vocab.itos)

    else:
        word_list = get_glove_words(args.glove_path)
        vocab.add(word_list)

    return vocab
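
A sketch of calling get_vocab with a minimal args object; the Namespace fields mirror the attributes read above, and the checkpoint name and GloVe path are illustrative assumptions.

# Hypothetical usage sketch: only the attributes read by get_vocab are populated.
from argparse import Namespace

args = Namespace(model="bert",
                 bert_model="bert-base-uncased",            # any HF BERT checkpoint name
                 glove_path="glove/glove.840B.300d.txt")    # unused in the "bert" branch
vocab = get_vocab(args)
print(vocab.vocab_sz)                                       # set in the BERT branch above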
Example #7
def get_attn_inputs(FLAGS, review, review_len, raw_attn_scores):
    """
    Return the inputs needed to
    plot the attn scores. These include
    input_sentence and attn_scores.
    Args:
        FLAGS: parameters
        review: list of ids
        review_len: len of the relevant review
    Return:
        input_sentence: inputs as tokens (words) of len <review_len>
        plot_attn_scores: (1, review_len) shaped scores
    """
    # Data paths
    vocab_path = os.path.join(basedir, '../data/vocab.txt')
    vocab = Vocab(vocab_path)

    review = review[:review_len]
    attn_scores = raw_attn_scores[:review_len]  # keep only scores for the actual tokens

    # Process input_sentence
    _input_sentence = vc.ids_to_tokens(review, vocab)
    _input_sentence += ['.'] * (max_length - len(_input_sentence))
    input_sentence = ''.join(item for item in _input_sentence)

    print("plot ...........", input_sentence)
    print("plot ...........", attn_scores)
    # Process attn scores (normalize scores between [0,1])
    min_attn_score = min(attn_scores)
    max_attn_score = max(attn_scores)
    normalized_attn_scores = ((attn_scores - min_attn_score) / \
        (max_attn_score - min_attn_score))

    # Reshape attn scores for plotting
    plot_attn_scores = np.zeros((1, max_length))
    for i, score in enumerate(normalized_attn_scores):
        plot_attn_scores[0, i] = score

    #print(plot_attn_scores)
    return input_sentence, plot_attn_scores
Example #8
def infer(FLAGS):
    """
    Infer a previous or new model.
    """
    scores = collections.defaultdict(list)
    # Data paths
    vocab_path = os.path.join(basedir, 'data/vocab.txt')
    infer_data_path = os.path.join(basedir, 'data/infer.p')
    vocab = Vocab(vocab_path)

    # Load embeddings (if using GloVe)
    embeddings = np.zeros((len(vocab), FLAGS.emb_size))
    FLAGS.vocab_size = len(vocab)

    with tf.Session() as sess:
        # Create|reload model
        imdb_model = train.create_model(sess, FLAGS, len(vocab))

        for  infer_index, data in \
            enumerate(infer_data(
                infer_data_path, FLAGS.batch_size)):

            comments, skuid = data[0]
            review_lens = data[1]

            logits, prob, label = imdb_model.infer(
                sess=sess,
                batch_reviews=comments,
                batch_review_lens=review_lens,
                embeddings=embeddings,
                keep_prob=1.0,  # no dropout for val|test
            )
            logger.info("[INFER]:  [SKUID] : %s | %s | %s", skuid, label, prob)
            scores[skuid[0]].append(label[0])

    for k, v in scores.items():
        counts = Counter(v)
        db.update_scores(k, int(counts.most_common(1)[0][0]) + 5)
        logger.info("[INFER]:  [SKUID] : %s | %s |%s ", k, v,
                    counts.most_common(1))
Example #9
def sample_data(data_path):
    """
    Sample format of the processed
    data from data.py
    Args:
        data_path: path for train.p|valid.p
    """
    with open(data_path, 'rb') as f:
        entries = pickle.load(f)

    vocab_file = os.path.join(basedir, 'data/vocab.txt')
    vocab = Vocab(vocab_file, verbose=False)

    for k, v in entries.items():
        rand_index = random.randint(0, len(v[0]) - 1)  # randint is inclusive on both ends
        print("==> Processed Review:", v[0][rand_index])
        print("==> Review Len:", v[1][rand_index])
        print("==> Label:", k)
        print("==> See if processed review makes sense:",
              vc.ids_to_tokens(
                  v[0][rand_index],
                  vocab=vocab,
              ))
Example #10
def build_dataset(data_path, config, is_train, vocab=None, load_vocab=None):
    args = config.data
    if is_train:
        src_txt, tgt_txt = load_dataset(data_path)
        src_train = TextDataset(src_txt, args.src_max_train)
        tgt_train = TextDataset(tgt_txt, args.tgt_max_train)
        if load_vocab is not None:
            vocab = Vocab.from_json(load_vocab)
        else:
            vocab = src_train.build_vocab(
                vocab_size=args.vocab_size,
                min_freq=args.vocab_min_freq,
                specials=[PAD_TOKEN, UNK_TOKEN, START_DECODING, STOP_DECODING])
        dataset = SummDataset(src=src_train, tgt=tgt_train, vocab=vocab)
        return dataset, vocab

    else:
        assert vocab is not None
        src_txt, tgt_txt = load_dataset(data_path)
        src_test = TextDataset(src_txt, args.src_max_test)
        tgt_test = TextDataset(tgt_txt, args.tgt_max_test)
        dataset = SummDataset(src=src_test, tgt=tgt_test, vocab=vocab)
        return dataset
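
A sketch of driving build_dataset for both splits; the paths are illustrative and the config object is assumed to carry the data fields read above, while the call pattern (train builds the vocab, test reuses it) follows the function itself.

# Hypothetical usage sketch: `config.data` fields are assumed to match those read above.
train_set, vocab = build_dataset("data/train.tsv", config, is_train=True)
test_set = build_dataset("data/test.tsv", config, is_train=False, vocab=vocab)
print(len(train_set), len(test_set))   # assumes SummDataset defines __len__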
Example #11
def infer(FLAGS):
    """
    Infer a previous or new model.
    """
    # Data paths
    vocab_path = os.path.join(basedir, 'data/vocab.txt')
    validation_data_path = os.path.join(basedir, 'data/infer.p')
    vocab = Vocab(vocab_path)

    # Load embeddings (if using GloVe)
    embeddings = np.zeros((len(vocab), FLAGS.emb_size))
    FLAGS.vocab_size = len(vocab)

    with tf.Session() as sess:
        # Create|reload model
        imdb_model = create_model(sess, FLAGS, len(vocab))

        for valid_batch_num, (valid_batch_features, valid_batch_seq_lens) in \
            enumerate(infer_data(
                validation_data_path, FLAGS.batch_size)):

            valid_batch_reviews, valid_batch_labels = valid_batch_features
            valid_batch_review_lens = valid_batch_seq_lens

            valid_logits, valid_loss, valid_acc, prob = imdb_model.infer(
                sess=sess,
                batch_reviews=valid_batch_reviews,
                batch_labels=valid_batch_labels,
                batch_review_lens=valid_batch_review_lens,
                embeddings=embeddings,
                keep_prob=1.0,  # no dropout for val|test
            )
            logger.info(
                "[VALID]: %i| [ACC]: %.3f | [LOSS]: %.6f,| [LABELS] : %i |%s",
                valid_batch_num, valid_acc, valid_loss, valid_batch_labels[0],
                prob)
Example #12
class AncientPairDataModule(pl.LightningDataModule):
    def __init__(self, batch_size: int, data_dir: str, workers: int):
        super().__init__()

        self.data_dir = Path(data_dir)
        self.batch_size = batch_size
        self.workers = workers

        if not self.data_dir.exists():
            raise ValueError("Directory or file doesn't exist")
        if not self.data_dir.is_dir():
            raise ValueError("`data_dir` must be a path to directory")

    @classmethod
    def add_data_args(cls, parent_parser: argparse.ArgumentParser):
        parser = parent_parser.add_argument_group("data")

        parser.add_argument("--data_dir",
                            type=str,
                            default="./data",
                            help="数据存储路径")
        parser.add_argument("--batch_size",
                            type=int,
                            default=128,
                            help="一个batch的大小")
        parser.add_argument("--workers",
                            type=int,
                            default=0,
                            help="读取dataset的worker数")

        cls.parser = parser

        return parent_parser

    def prepare_data(self):
        """数据已提前准备完成"""

    def setup(self, stage: Optional[str] = None):
        self.src_vocab = Vocab()
        self.src_vocab.load(str(self.data_dir / "src_vocab.json"))
        self.src_vocab_size = len(self.src_vocab)

        self.trg_vocab = Vocab()
        self.trg_vocab.load(str(self.data_dir / "trg_vocab.json"))
        self.trg_vocab_size = len(self.trg_vocab)

        self.train_dataset = AncientPairDataset(
            str(self.data_dir / "train.tsv"),
            128,
            self.src_vocab,
            self.trg_vocab,
        )
        self.valid_dataset = AncientPairDataset(
            str(self.data_dir / "valid.tsv"),
            128,
            self.src_vocab,
            self.trg_vocab,
        )
        self.test_dataset = AncientPairDataset(
            str(self.data_dir / "test.tsv"),
            128,
            self.src_vocab,
            self.trg_vocab,
        )

        logger.info(
            f"数据集信息:\n\t"
            f"训练集: {len(self.train_dataset)}, "
            f"验证集: {len(self.valid_dataset)}, "
            f"测试集: {len(self.test_dataset)}", )

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            num_workers=self.workers,
        )

    def val_dataloader(self):
        return DataLoader(
            self.valid_dataset,
            batch_size=self.batch_size,
            num_workers=self.workers,
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=self.workers,
        )
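
A sketch of the standard Lightning flow with the DataModule above; the argument values are illustrative, and prepare_data()/setup() are called manually here only because there is no Trainer in the sketch.

# Hypothetical usage sketch: inside pl.Trainer.fit() the prepare_data()/setup() calls
# happen automatically; values below are assumptions for illustration.
dm = AncientPairDataModule(batch_size=64, data_dir="./data", workers=2)
dm.prepare_data()
dm.setup()
print(dm.src_vocab_size, dm.trg_vocab_size)
train_loader = dm.train_dataloader()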
Example #13
app = Flask(  # opening call inferred; the snippet is truncated above this line
    __name__,
    template_folder="templates",
    static_folder="./",
    static_url_path="",
)

if "MODEL_DIR" not in os.environ:
    print("MODEL_DIR must be speicified before launching server")
    exit(1)

model_dir = os.environ["MODEL_DIR"]

src_tokenizer = CharTokenizer()
src_tokenizer.load_vocab(os.path.join(model_dir, "src_vocab.json"))

trg_vocab = Vocab()
trg_vocab.load(os.path.join(model_dir, "trg_vocab.json"))

model = ModelInterface.load_from_checkpoint(
    os.path.join(model_dir, "checkpoint.pt"),
    src_vocab=src_tokenizer.vocab,
    trg_vocab=trg_vocab,
    model_name="transformer",
).to("cuda" if torch.cuda.is_available() else "cpu")

model = model.eval()


@app.route("/", methods=["GET"])
def index():
    return render_template("index.html")
Example #14
    parser.add_argument('--output', help='output dir file')

    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger("brc")
    logger.setLevel(logging.INFO)
    brc_data = DatasetReader(
        test_file=args.input,
        bert_dir='/home/wujindou/chinese_L-12_H-768_A-12',  #
        prefix='bert_meizhuang'  #test_file = None,
    )
    from data.vocab import Vocab
    vocab = Vocab(lower=True)
    import sys
    for word in brc_data.word_iter(None):
        vocab.add(word)
        for char in word:
            vocab.add_char(char)
    logger.info(' char size {}'.format(vocab.get_char_vocab_size()))
    logger.info(' vocab size {} '.format(vocab.get_word_vocab()))
    #
    unfiltered_vocab_size = vocab.size()
    unfiltered_char_size = vocab.get_char_vocab_size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    vocab.filter_chars_by_cnt(min_cnt=2)

    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))
Example #15
# Brief:
#####################################################

from flask import Flask, request, jsonify
from flask import render_template

from tensorflow.python.keras.backend import set_session
import requests
import sys
sys.path.append('../')
import os
app = Flask(__name__)
from data.vocab import Vocab
os.environ["CUDA_VISIBLE_DEVICES"] = " "
vocab_file = '../examples/politic_vocab5.txt'  # vocab.load_from_file('vocab_bool.txt')
vocab = Vocab(lower=True)
from data.data_reader_new import DatasetReader
from model.text_cnn import TextCNN
if os.path.exists(vocab_file): vocab.load_from_file(vocab_file)
print(vocab.get_word_vocab())


@app.route('/')
def search_index():
    return render_template('index.html')


model = TextCNN(vocab,
                num_class=2,
                pretrained_word_embedding=vocab.embeddings,
                word_embedding_size=300)
Example #16
brc_data = DatasetReader(
    # train_file='/home/wujindou/dataset/0905/train_baihuo_category_0905.csv',
    dev_file='/home/wujindou/dataset/0908/baihuoshipin/dev_third.csv',
    test_file='/home/wujindou/dataset/0908/baihuoshipin/dev_third.csv',
    #test_file='/home/wujindou/dataset/test/food_100_category.csv',
    #test_file='/home/wujindou/dataset/test/food_100_category.csv',
    # test_file='/home/wujindou/.jupyter/test_single.csv',
    #test_file='/home/wujindou/dataset/0908/baihuoshipin/dev_third.csv',
    #test_file='/home/wujindou/dataset/0905/test_baihuo_category_0905.csv',
    #test_file='/home/wujindou/dataset/0905/test_baihuo_category_0905.csv',
    #test_file='/home/wujindou/dataset/test_product_category_0827.csv',
    use_pos_feature=False,
    prefix='third',
    use_bert=False)
from data.vocab import Vocab

do_inference = True
vocab = Vocab(lower=True, prefix='third_level_baihuo_')
if not do_inference:
    for word in brc_data.word_iter(None):
        vocab.add(word)
        for char in word:
            vocab.add_char(char)
    logger.info(' char size {}'.format(vocab.get_char_vocab_size()))
    logger.info(' vocab size {} '.format(vocab.get_word_vocab()))

    unfiltered_vocab_size = vocab.size()
    unfiltered_char_size = vocab.get_char_vocab_size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    vocab.filter_chars_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info(
        'After filter {} tokens, the final vocab size is {}'.format(
            filtered_num, vocab.size()))
Example #17
def train(FLAGS):
    """
    Train a previous or new model.
    """
    # Data paths
    vocab_path = os.path.join(basedir, 'data/vocab.txt')
    train_data_path = os.path.join(basedir, 'data/train.p')
    validation_data_path = os.path.join(basedir, 'data/validation.p')
    vocab = Vocab(vocab_path)
    #FLAGS.num_classes = 5

    # Load embeddings (if using GloVe)
    if FLAGS.embedding == 'glove':
        with open(os.path.join(basedir, 'data/embeddings.p'), 'rb') as f:
            embeddings = pickle.load(f)
        FLAGS.vocab_size = len(embeddings)
    embeddings = np.zeros((len(vocab), FLAGS.emb_size))
    FLAGS.vocab_size = len(vocab)

    with tf.Session() as sess:

        # Create|reload model
        imdb_model = create_model(sess, FLAGS, len(vocab))

        summaries = tf.summary.merge_all()
        writer = tf.summary.FileWriter(
            os.path.join("log", time.strftime("%Y-%m-%d-%H-%M-%S")),
            sess.graph)
        #tf.initialize_all_variables().run()

        # Store attention score history for few samples
        #content = {"review": None, "label": None, "review_len": None, "attn_scores": None}
        attn_history_word = {
            "sample_%i" % i: {
                "review": None,
                "label": None,
                "review_len": None,
                "attn_scores": None
            }
            for i in range(FLAGS.batch_size)
        }

        #print(attn_history_word)
        # Start training
        for  train_epoch_num, train_epoch in \
            enumerate(generate_epoch(
                train_data_path, FLAGS.num_epochs, FLAGS.batch_size)):

            logger.info("==> EPOCH: %s ", train_epoch_num)
            train_acc_count = 0
            train_batch_total = 0
            valid_acc_count = 0
            valid_batch_total = 0

            for  train_batch_num,  (total_batch, batch_features, batch_seq_lens) in \
                enumerate(train_epoch):
                #sys.exit()
                batch_reviews, batch_labels = batch_features
                batch_review_lens = batch_seq_lens

                # Display shapes once
                #for v in batch_reviews:
                #    print("TRAIN EPOCH:", train_epoch_num,"LABEL",  batch_labels, ''.join(vc.ids_to_tokens(v,vocab=vocab)))
                if (train_epoch_num == 0 and train_batch_num == 0):
                    logger.info("Reviews: :%s", np.shape(batch_reviews))
                    logger.info("Labels: %s", np.shape(batch_labels))
                    logger.info("Review lens: %s", np.shape(batch_review_lens))

                _, train_logits, train_loss, train_acc,lr, attn_word_scores,attn_cmt_scores, logits,outputs,prob, distance = \
                imdb_model.train(
                        sess=sess,
                        batch_reviews=batch_reviews,
                        batch_labels=batch_labels,
                        batch_review_lens=batch_review_lens,
                        embeddings=embeddings,
                        keep_prob=FLAGS.keep_prob,
                        )
                logger.info(
                    "[TRAIN]: %i/%i|[ACC]: %.3f|[LOSS]: %.3f|[LABELS] : %i| %s|%s",
                    total_batch, train_batch_num, train_acc, train_loss,
                    batch_labels[0], distance, prob)

                train_batch_total += 1
                if train_acc > 0.99:
                    train_acc_count += 1
                if batch_labels[0] == 3:
                    for i in range(FLAGS.batch_size):
                        sample = "sample_%i" % i
                        attn_history_word[sample]["review"] = batch_reviews[i]
                        attn_history_word[sample]["label"] = batch_labels
                        attn_history_word[sample][
                            "review_len"] = batch_review_lens[i]
                        attn_history_word[sample][
                            "attn_scores"] = attn_word_scores[i]
                    attn_history_comment = attn_cmt_scores

            for valid_epoch_num, valid_epoch in \
                enumerate(generate_epoch(
                    data_path=validation_data_path,
                    num_epochs=1,
                    batch_size=FLAGS.batch_size,
                    )):

                for  valid_batch_num, (total_batch, valid_batch_features, valid_batch_seq_lens) in \
                    enumerate(valid_epoch):

                    valid_batch_reviews, valid_batch_labels = valid_batch_features
                    valid_batch_review_lens = valid_batch_seq_lens

                    #for v in valid_batch_reviews:
                    #    print("VALID EPOCH:", train_epoch_num,"LABEL",  valid_batch_labels, ''.join(vc.ids_to_tokens(v,vocab=vocab)))

                    valid_logits, valid_loss, valid_acc, prob = imdb_model.eval(
                        sess=sess,
                        batch_reviews=valid_batch_reviews,
                        batch_labels=valid_batch_labels,
                        batch_review_lens=valid_batch_review_lens,
                        embeddings=embeddings,
                        keep_prob=1.0,  # no dropout for val|test
                    )
                    logger.info(
                        "[VALID]: %i| [ACC]: %.3f | [LOSS]: %.6f,| [LABELS] : %i |%s",
                        valid_batch_num, valid_acc, valid_loss,
                        valid_batch_labels[0], prob)
                    valid_batch_total += 1
                    if valid_acc > 0.99:
                        valid_acc_count += 1

            logger.info ("[EPOCH]: %i, [LR]: %.6e, [TRAIN ACC]: %.3f, [VALID ACC]: %.3f " \
                    "[TRAIN LOSS]: %.6f, [VALID LOSS]: %.6f " ,
                    train_epoch_num, lr, train_acc_count / train_batch_total, valid_acc_count/valid_batch_total, train_loss, valid_loss)

            # Save the model (maybe)
            if ((train_epoch_num == (FLAGS.num_epochs - 1))
                    or ((train_epoch_num % FLAGS.save_every == 0) and
                        (train_epoch_num > 0))):

                # Make parents ckpt dir if it does not exist
                if not os.path.isdir(
                        os.path.join(basedir, FLAGS.data_dir, 'ckpt')):
                    os.makedirs(os.path.join(basedir, FLAGS.data_dir, 'ckpt'))

                # Make child ckpt dir for this specific model
                if not os.path.isdir(os.path.join(basedir, FLAGS.ckpt_dir)):
                    os.makedirs(os.path.join(basedir, FLAGS.ckpt_dir))

                checkpoint_path = \
                    os.path.join(
                        basedir, FLAGS.ckpt_dir, "%s.ckpt" % FLAGS.model_name)

                logger.info("==> Saving the model.")
                imdb_model.saver.save(sess,
                                      checkpoint_path,
                                      global_step=imdb_model.global_step)

                attn_word_file = os.path.join(basedir, FLAGS.ckpt_dir,
                                              'attn_word_history.p')
                with open(attn_word_file, 'wb') as f:
                    pickle.dump(attn_history_word, f)

                attn_comment_file = os.path.join(basedir, FLAGS.ckpt_dir,
                                                 'attn_cmt_history.p')
                with open(attn_comment_file, 'wb') as f:
                    pickle.dump(attn_history_comment, f)
Example #18
class ShakespeareModern(Dataset):
    def __init__(self,
                 train_domain_A_path,
                 test_domain_A_path,
                 train_domain_B_path,
                 test_domain_B_path,
                 name='ShakespeareModern',
                 mode='train'):
        self.train_domain_A_path = train_domain_A_path
        self.test_domain_A_path = test_domain_A_path

        self.train_domain_B_path = train_domain_B_path
        self.test_domain_B_path = test_domain_B_path

        self.vocab = Vocab(name)
        self.mode = mode

        self.domain_A_max_len = 0
        self.domain_B_max_len = 0

        self.train_domain_A_data = self.load_and_preprocess_data(
            self.train_domain_A_path, domain='A')
        self.test_domain_A_data = self.load_and_preprocess_data(
            self.test_domain_A_path, domain='A')

        self.train_domain_B_data = self.load_and_preprocess_data(
            self.train_domain_B_path, domain='B')
        self.test_domain_B_data = self.load_and_preprocess_data(
            self.test_domain_B_path, domain='B')

        # self.max_len = 0

    def load_and_preprocess_data(self, path, domain):
        with open(path) as f:
            data = f.readlines()

        for idx, sentence in enumerate(data):
            sentence = normalize_string(sentence)
            self.vocab.add_sentence(sentence, domain)
            data[idx] = get_idx_sentence(self.vocab, sentence)

        max_len = 0
        for sentence in data:
            max_len = max(max_len, len(sentence))

        if (domain == 'A'):
            self.domain_A_max_len = max(self.domain_A_max_len, max_len)
        else:
            self.domain_B_max_len = max(self.domain_B_max_len, max_len)

        self.max_len = max(self.domain_A_max_len, self.domain_B_max_len)

        # padded_sequences = np.ndarray((self.max_len, len(data), 1))
        sentence_tensors = []
        for idx, sentence in enumerate(data):
            sentence_tensors.append(
                torch.Tensor(sentence).type(torch.LongTensor))

        return sentence_tensors  #torch.from_numpy(padded_sequences.astype(np.int64))

    def get_addn_feats(self, sentence):
        net_score = 0
        domain_A_count = 0
        domain_B_count = 0
        sent_len = 0
        for word in sentence:
            word = word.item()
            if word not in self.vocab.tokens:
                sent_len += 1
                word = self.vocab.idx2wrd[word]
                if word in self.vocab.domain_A_vocab and word in self.vocab.domain_B_vocab:
                    net_score += self.vocab.domain_A_vocab[
                        word] - self.vocab.domain_B_vocab[word]
                elif word in self.vocab.domain_A_vocab:
                    net_score += self.vocab.domain_A_vocab[word]
                    domain_A_count += 1
                elif word in self.vocab.domain_B_vocab:
                    net_score -= self.vocab.domain_B_vocab[word]
                    domain_B_count += 1

        return torch.Tensor([net_score, domain_A_count, domain_B_count
                             ]) / sent_len

    def __getitem__(self, index):
        if self.mode == 'test':
            return self.test_domain_A_data[index], self.get_addn_feats(
                self.test_domain_A_data[index]
            ), self.test_domain_B_data[index], self.get_addn_feats(
                self.test_domain_B_data[index])
        else:
            return self.train_domain_A_data[index], self.get_addn_feats(
                self.train_domain_A_data[index]
            ), self.train_domain_B_data[index], self.get_addn_feats(
                self.train_domain_B_data[index])

    def __len__(self):
        if self.mode == 'test':
            return max(len(self.test_domain_A_data),
                       len(self.test_domain_B_data))
        else:
            return max(len(self.train_domain_A_data),
                       len(self.train_domain_B_data))


# train_domain_A_path = '../dataset/train.original.nltktok'
# test_domain_A_path = '../dataset/test.original.nltktok'
# train_domain_B_path = '../dataset/train.modern.nltktok'
# test_domain_B_path = '../dataset/test.modern.nltktok'
# sm = ShakespeareModern(train_domain_A_path, test_domain_A_path, train_domain_B_path, test_domain_B_path)
Example #19
                for token in sample['tokens']:
                    yield token


if __name__ == '__main__':
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger("brc")
    logger.setLevel(logging.INFO)
    brc_data = DatasetReader(
        '/Users/apple/Downloads/news_qa/news_data_0827/news_data_0827_1w.csv')
    sys.exit(1)
    from data.vocab import Vocab

    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    # vocab.filter_tokens_by_cnt(min_cnt=2)

    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))
    brc_data.convert_to_ids(vocab)
    train_batches = brc_data.gen_mini_batches('train', batch_size=16)
    for batch in train_batches:
        print(batch['in'])

        sys.exit(1)
Example #20
def main(args):
    print("Main is running!")

    squad_path = os.path.join(os.getcwd(), 'data', 'squad', args.version)
    dev_eval_dict_path_from_tokens = os.path.join(
        squad_path, 'dev_eval_dict_from_tokens.json')
    if 'dev_eval_dict_from_tokens.json' not in os.listdir(squad_path):
        print("Generating valuation dictionary... ", end="")
        make_eval_dict_tokens(args.dev_data_filepath,
                              dev_eval_dict_path_from_tokens)
        print("Done")

    # set device
    if args.use_gpu:
        device_id = args.device_id
        device = torch.device("cuda:{}".format(args.device_id)
                              if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device("cpu")

    n_gpu = torch.cuda.device_count()
    if torch.cuda.is_available():
        print("device is cuda, # cuda is: ", n_gpu)
    else:
        print("device is cpu")

    # Dataset
    train_json = load_json(args.train_data_filepath)
    eval_json = load_json(args.dev_data_filepath)
    train_data = pd.DataFrame(parse_data(train_json))
    eval_data = pd.DataFrame(parse_data(eval_json))
    header = list(train_data.columns)

    torch.manual_seed(12)
    common_vocab = Vocab(args.language, args.common_embeddings_filepath,
                         args.emb_size)
    vocab = Vocab(args.language,
                  args.word_embeddings_filepath,
                  args.emb_size,
                  base=common_vocab)
    train_dataloader = DataLoader(MultilingualDataset(train_data, vocab),
                                  shuffle=True,
                                  batch_size=args.batch_size,
                                  collate_fn=generate_batch)
    val_dataloader = DataLoader(MultilingualDataset(eval_data, vocab),
                                shuffle=True,
                                batch_size=args.batch_size,
                                collate_fn=generate_batch)

    # get model
    model = QANet(device, args.emb_size, args.d_model, args.context_limit,
                  args.question_limit, args.p_dropout)

    # exponential moving average
    ema = EMA(args.decay)
    if args.use_ema:
        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.register(name, param.data)

    model = model.to(device)

    # optimizer & scheduler
    lr = args.lr
    base_lr = 1.0
    warm_up = args.lr_warm_up_num
    params = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(lr=base_lr,
                                 betas=(args.beta1, args.beta2),
                                 eps=1e-7,
                                 weight_decay=3e-7,
                                 params=params)
    cr = lr / math.log(warm_up)
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log(ee + 1) if ee < warm_up else lr)

    # set loss
    criterion = nn.NLLLoss(reduction='mean')  # LogSoftmax applied in Pointer

    # checkpoint identifier
    identifier = type(model).__name__ + '_'

    # training and evaluation
    trainer = Trainer(args, device, model, optimizer, scheduler, criterion,
                      train_dataloader, val_dataloader, ema,
                      dev_eval_dict_path_from_tokens, identifier)
    trainer.train()
Example #21
    parser.add_argument("--token_type", type=str, default="char", choices=["char", "token"])
    parser.add_argument("--src_vocab_path", type=str, required=True, help="白话文词表路径")
    parser.add_argument("--trg_vocab_path", type=str, required=True, help="文言文词表路径")

    parser = ModelInterface.add_trainer_args(parser)

    args = parser.parse_args()

    if args.token_type == "char":
        src_tokenizer = CharTokenizer()
    elif args.token_type == "token":
        src_tokenizer = VernacularTokenTokenizer()

    src_tokenizer.load_vocab(args.src_vocab_path)

    trg_vocab = Vocab()
    trg_vocab.load(args.trg_vocab_path)

    model = ModelInterface.load_from_checkpoint(
        args.checkpoint_path,
        src_vocab=src_tokenizer.vocab,
        trg_vocab=trg_vocab,
    )

    model = model.eval()
    while True:
        sent = input("原始白话文:")

        input_token_list = src_tokenizer.tokenize(sent, map_to_id=True)
        res_sent = model.inference(
            torch.LongTensor([input_token_list]),
Example #22
if __name__ == '__main__':
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger("brc")
    logger.setLevel(logging.INFO)
    # brc_data = DatasetReader(train_file='./dataset/seg_train_data_20w.txt',
    #                          dev_file='./dataset/seg_dev_data_20w.txt',
    #                          # test_file ='./dataset/test_data'
    #                          )
    brc_data = DatasetReader(train_file='../dataset/train_yes_no_8k.txt',
                             dev_file='../dataset/dev_yes_no_8k.txt')
    from data.vocab import Vocab

    vocab = Vocab(lower=True)
    import sys

    for word in brc_data.word_iter(None):
        vocab.add(word)
        for char in word:
            vocab.add_char(char)
    logger.info(' char size {}'.format(vocab.get_char_vocab_size()))
    logger.info(' vocab size {} '.format(vocab.get_word_vocab()))

    unfiltered_vocab_size = vocab.size()
    unfiltered_char_size = vocab.get_char_vocab_size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    vocab.filter_chars_by_cnt(min_cnt=2)
    brc_data.convert_to_ids(vocab)