def build_vocab(self, vocab_size, min_freq, specials):
    counter = Counter()
    for t in self.dataset:
        tokens = self.tokenize(t)
        counter.update(tokens)
    vocab = Vocab.from_counter(counter=counter,
                               vocab_size=vocab_size,
                               min_freq=min_freq,
                               specials=specials)
    return vocab
def load_word_vectors(embeddings_path):
    # Reuse the cached tensor/vocab pair if it has already been materialised.
    if os.path.isfile(embeddings_path + '.pth') and \
            os.path.isfile(embeddings_path + '.vocab'):
        print('==> File found, loading to memory')
        vectors = torch.load(embeddings_path + '.pth')
        vocab = Vocab(filename=embeddings_path + '.vocab')
        return vocab, vectors

    # Otherwise build them from a gensim model (.model) or a word2vec text file (.vec).
    if os.path.isfile(embeddings_path + '.model'):
        model = KeyedVectors.load(embeddings_path + ".model")
    if os.path.isfile(embeddings_path + '.vec'):
        model = FastText.load_word2vec_format(embeddings_path + '.vec')
    list_of_tokens = model.vocab.keys()
    vectors = torch.zeros(len(list_of_tokens), model.vector_size)
    with open(embeddings_path + '.vocab', 'w', encoding='utf-8') as f:
        for token in list_of_tokens:
            f.write(token + '\n')
    vocab = Vocab(filename=embeddings_path + '.vocab')
    for index, word in enumerate(list_of_tokens):
        vectors[index, :] = torch.from_numpy(model[word])
    return vocab, vectors
def create_full_dataset(args):
    train_dir = 'training-treebank'
    vocab_file = 'tmp/vocab.txt'
    build_vocab(
        [
            'training-treebank/rev_sentence.txt',
            'training-treebank/sklad_sentence.txt',
            'test/polevaltest_sentence.txt',
            args.emb_dir + args.emb_file + '.vec'  # full vocabulary in model
        ],
        'tmp/vocab.txt')
    vocab = Vocab(filename=vocab_file)
    full_dataset = SSTDataset(train_dir, vocab, args.num_classes)
    return vocab, full_dataset
def get_vocab(args):
    vocab = Vocab()
    if args.model in ["bert", "mmbt", "concatbert"]:
        bert_tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                                       do_lower_case=True)
        vocab.stoi = bert_tokenizer.vocab
        vocab.itos = bert_tokenizer.ids_to_tokens
        vocab.vocab_sz = len(vocab.itos)
    else:
        word_list = get_glove_words(args.glove_path)
        vocab.add(word_list)
    return vocab
def get_attn_inputs(FLAGS, review, review_len, raw_attn_scores):
    """ Return the inputs needed to plot the attn scores.
    These include input_sentence and attn_scores.

    Args:
        FLAGS: parameters
        review: list of ids
        review_len: len of the relevant review
    Return:
        input_sentence: inputs as tokens (words) of len <review_len>
        plot_attn_scores: (1, review_len) shaped scores
    """
    # Data paths
    vocab_path = os.path.join(basedir, '../data/vocab.txt')
    vocab = Vocab(vocab_path)

    # Keep only the relevant part of the review and its attention scores.
    review = review[:review_len]
    attn_scores = raw_attn_scores[:review_len]

    # Process input_sentence
    _input_sentence = vc.ids_to_tokens(review, vocab)
    _input_sentence += ['.'] * (max_length - len(_input_sentence))
    input_sentence = ''.join(item for item in _input_sentence)
    print("plot ...........", input_sentence)
    print("plot ...........", attn_scores)

    # Process attn scores (normalize scores between [0,1])
    min_attn_score = min(attn_scores)
    max_attn_score = max(attn_scores)
    normalized_attn_scores = ((attn_scores - min_attn_score) /
                              (max_attn_score - min_attn_score))

    # Reshape attn scores for plotting
    plot_attn_scores = np.zeros((1, max_length))
    for i, score in enumerate(normalized_attn_scores):
        plot_attn_scores[0, i] = score
    # print(plot_attn_scores)
    return input_sentence, plot_attn_scores
def infer(FLAGS):
    """ Infer a previous or new model. """
    scores = collections.defaultdict(list)

    # Data paths
    vocab_path = os.path.join(basedir, 'data/vocab.txt')
    infer_data_path = os.path.join(basedir, 'data/infer.p')
    vocab = Vocab(vocab_path)

    # Load embeddings (if using GloVe)
    embeddings = np.zeros((len(vocab), FLAGS.emb_size))
    FLAGS.vocab_size = len(vocab)

    with tf.Session() as sess:
        # Create|reload model
        imdb_model = train.create_model(sess, FLAGS, len(vocab))

        for infer_index, data in \
                enumerate(infer_data(infer_data_path, FLAGS.batch_size)):
            comments, skuid = data[0]
            review_lens = data[1]
            logits, prob, label = imdb_model.infer(
                sess=sess,
                batch_reviews=comments,
                batch_review_lens=review_lens,
                embeddings=embeddings,
                keep_prob=1.0,  # no dropout for val|test
            )
            logger.info("[INFER]: [SKUID] : %s | %s | %s", skuid, label, prob)
            scores[skuid[0]].append(label[0])

    for k, v in scores.items():
        counts = Counter(v)
        db.update_scores(k, int(counts.most_common(1)[0][0]) + 5)
        logger.info("[INFER]: [SKUID] : %s | %s | %s ", k, v,
                    counts.most_common(1))
def sample_data(data_path):
    """ Sample format of the processed data from data.py

    Args:
        data_path: path for train.p|valid.p
    """
    with open(data_path, 'rb') as f:
        entries = pickle.load(f)

    vocab_file = os.path.join(basedir, 'data/vocab.txt')
    vocab = Vocab(vocab_file, verbose=False)

    for k, v in entries.items():
        # randint is inclusive on both ends, so stop at len - 1 to stay in range
        rand_index = random.randint(0, len(v[0]) - 1)
        print("==> Processed Review:", v[0][rand_index])
        print("==> Review Len:", v[1][rand_index])
        print("==> Label:", k)
        print("==> See if processed review makes sense:",
              vc.ids_to_tokens(v[0][rand_index], vocab=vocab))
def build_dataset(data_path, config, is_train, vocab=None, load_vocab=None):
    args = config.data
    if is_train:
        src_txt, tgt_txt = load_dataset(data_path)
        src_train = TextDataset(src_txt, args.src_max_train)
        tgt_train = TextDataset(tgt_txt, args.tgt_max_train)
        if load_vocab is not None:
            vocab = Vocab.from_json(load_vocab)
        else:
            vocab = src_train.build_vocab(
                vocab_size=args.vocab_size,
                min_freq=args.vocab_min_freq,
                specials=[PAD_TOKEN, UNK_TOKEN, START_DECODING, STOP_DECODING])
        dataset = SummDataset(src=src_train, tgt=tgt_train, vocab=vocab)
        return dataset, vocab
    else:
        assert vocab is not None
        src_txt, tgt_txt = load_dataset(data_path)
        src_test = TextDataset(src_txt, args.src_max_test)
        tgt_test = TextDataset(tgt_txt, args.tgt_max_test)
        dataset = SummDataset(src=src_test, tgt=tgt_test, vocab=vocab)
        return dataset
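# Usage sketch (illustrative, not part of the original source): build the
# training split first to obtain the vocab, then pass that same vocab when
# building the evaluation split. The `demo_config` object and the file paths
# are hypothetical placeholders covering only the fields build_dataset reads.
from types import SimpleNamespace

demo_config = SimpleNamespace(data=SimpleNamespace(
    src_max_train=400, tgt_max_train=100,
    src_max_test=400, tgt_max_test=100,
    vocab_size=50000, vocab_min_freq=2))

# train_set, vocab = build_dataset("data/train.txt", demo_config, is_train=True)
# test_set = build_dataset("data/test.txt", demo_config, is_train=False, vocab=vocab)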
def infer(FLAGS): """ Infer a previous or new model. """ # Data paths vocab_path = os.path.join(basedir, 'data/vocab.txt') validation_data_path = os.path.join(basedir, 'data/infer.p') vocab = Vocab(vocab_path) # Load embeddings (if using GloVe) embeddings = np.zeros((len(vocab), FLAGS.emb_size)) FLAGS.vocab_size = len(vocab) with tf.Session() as sess: # Create|reload model imdb_model = create_model(sess, FLAGS, len(vocab)) for infer_index, infer_data in \ enumerate(infer_data( infer_data_path, FLAGS.batch_size)): comments, skuid = valid_batch_features review_lens = valid_batch_seq_lens valid_logits, valid_loss, valid_acc, prob = imdb_model.infer( sess=sess, batch_reviews=valid_batch_reviews, batch_labels=valid_batch_labels, batch_review_lens=valid_batch_review_lens, embeddings=embeddings, keep_prob=1.0, # no dropout for val|test ) logger.info( "[VALID]: %i| [ACC]: %.3f | [LOSS]: %.6f,| [LABELS] : %i |%s", valid_batch_num, valid_acc, valid_loss, valid_batch_labels[0], prob)
class AncientPairDataModule(pl.LightningDataModule):
    def __init__(self, batch_size: int, data_dir: str, workers: int):
        super().__init__()
        self.data_dir = Path(data_dir)
        self.batch_size = batch_size
        self.workers = workers
        if not self.data_dir.exists():
            raise ValueError("Directory or file doesn't exist")
        if not self.data_dir.is_dir():
            raise ValueError("`data_dir` must be a path to directory")

    @classmethod
    def add_data_args(cls, parent_parser: argparse.ArgumentParser):
        parser = parent_parser.add_argument_group("data")
        parser.add_argument("--data_dir", type=str, default="./data",
                            help="Path where the data is stored")
        parser.add_argument("--batch_size", type=int, default=128,
                            help="Size of a single batch")
        parser.add_argument("--workers", type=int, default=0,
                            help="Number of workers for reading the dataset")
        cls.parser = parser
        return parent_parser

    def prepare_data(self):
        """The data has already been prepared ahead of time."""

    def setup(self, stage: Optional[str] = None):
        self.src_vocab = Vocab()
        self.src_vocab.load(str(self.data_dir / "src_vocab.json"))
        self.src_vocab_size = len(self.src_vocab)
        self.trg_vocab = Vocab()
        self.trg_vocab.load(str(self.data_dir / "trg_vocab.json"))
        self.trg_vocab_size = len(self.trg_vocab)

        self.train_dataset = AncientPairDataset(
            str(self.data_dir / "train.tsv"), 128, self.src_vocab,
            self.trg_vocab,
        )
        self.valid_dataset = AncientPairDataset(
            str(self.data_dir / "valid.tsv"), 128, self.src_vocab,
            self.trg_vocab,
        )
        self.test_dataset = AncientPairDataset(
            str(self.data_dir / "test.tsv"), 128, self.src_vocab,
            self.trg_vocab,
        )
        logger.info(
            f"Dataset info:\n\t"
            f"train: {len(self.train_dataset)}, "
            f"valid: {len(self.valid_dataset)}, "
            f"test: {len(self.test_dataset)}",
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            num_workers=self.workers,
        )

    def val_dataloader(self):
        return DataLoader(
            self.valid_dataset,
            batch_size=self.batch_size,
            num_workers=self.workers,
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=self.workers,
        )
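# Usage sketch (illustrative, not part of the original module): wire the data
# arguments registered above into a DataModule instance. The final Trainer.fit
# call is commented out because it needs a LightningModule defined elsewhere.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser = AncientPairDataModule.add_data_args(parser)
    args = parser.parse_args()

    dm = AncientPairDataModule(batch_size=args.batch_size,
                               data_dir=args.data_dir,
                               workers=args.workers)
    # pl.Trainer(max_epochs=10).fit(model, datamodule=dm)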
app = Flask(
    __name__,
    template_folder="templates",
    static_folder="./",
    static_url_path="",
)

if "MODEL_DIR" not in os.environ:
    print("MODEL_DIR must be specified before launching server")
    exit(1)

model_dir = os.environ["MODEL_DIR"]
src_tokenizer = CharTokenizer()
src_tokenizer.load_vocab(os.path.join(model_dir, "src_vocab.json"))
trg_vocab = Vocab()
trg_vocab.load(os.path.join(model_dir, "trg_vocab.json"))

model = ModelInterface.load_from_checkpoint(
    os.path.join(model_dir, "checkpoint.pt"),
    src_vocab=src_tokenizer.vocab,
    trg_vocab=trg_vocab,
    model_name="transformer",
).to("cuda" if torch.cuda.is_available() else "cpu")
model = model.eval()


@app.route("/", methods=["GET"])
def index():
    return render_template("index.html")
parser.add_argument('--output', help='output dir file')
args = parser.parse_args()

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("brc")
logger.setLevel(logging.INFO)

brc_data = DatasetReader(
    test_file=args.input,
    bert_dir='/home/wujindou/chinese_L-12_H-768_A-12',
    # prefix='bert_meizhuang'
    # test_file=None,
)

from data.vocab import Vocab
vocab = Vocab(lower=True)
import sys
for word in brc_data.word_iter(None):
    vocab.add(word)
    for char in word:
        vocab.add_char(char)

logger.info(' char size {}'.format(vocab.get_char_vocab_size()))
logger.info(' vocab size {} '.format(vocab.get_word_vocab()))

unfiltered_vocab_size = vocab.size()
unfiltered_char_size = vocab.get_char_vocab_size()
vocab.filter_tokens_by_cnt(min_cnt=2)
vocab.filter_chars_by_cnt(min_cnt=2)
filtered_num = unfiltered_vocab_size - vocab.size()
logger.info('After filter {} tokens, the final vocab size is {}'.format(
    filtered_num, vocab.size()))
# Brief:
#####################################################
from flask import Flask, request, jsonify
from flask import render_template
from tensorflow.python.keras.backend import set_session
import requests
import sys
sys.path.append('../')
import os

app = Flask(__name__)

from data.vocab import Vocab

os.environ["CUDA_VISIBLE_DEVICES"] = " "
vocab_file = '../examples/politic_vocab5.txt'
# vocab.load_from_file('vocab_bool.txt')
vocab = Vocab(lower=True)

from data.data_reader_new import DatasetReader
from model.text_cnn import TextCNN

if os.path.exists(vocab_file):
    vocab.load_from_file(vocab_file)
    print(vocab.get_word_vocab())


@app.route('/')
def search_index():
    return render_template('index.html')


model = TextCNN(vocab,
                num_class=2,
                pretrained_word_embedding=vocab.embeddings,
                word_embedding_size=300)
# brc_data = DatasetReader(
#     train_file='/home/wujindou/dataset/0905/train_baihuo_category_0905.csv',
#     dev_file='/home/wujindou/dataset/0908/baihuoshipin/dev_third.csv',
#     test_file='/home/wujindou/dataset/0908/baihuoshipin/dev_third.csv',
#     # test_file='/home/wujindou/dataset/test/food_100_category.csv',
#     # test_file='/home/wujindou/.jupyter/test_single.csv',
#     # test_file='/home/wujindou/dataset/0905/test_baihuo_category_0905.csv',
#     # test_file='/home/wujindou/dataset/test_product_category_0827.csv',
#     use_pos_feature=False, prefix='third', use_bert=False)

from data.vocab import Vocab

do_inference = True
vocab = Vocab(lower=True, prefix='third_level_baihuo_')
if not do_inference:
    for word in brc_data.word_iter(None):
        vocab.add(word)
        for char in word:
            vocab.add_char(char)
    logger.info(' char size {}'.format(vocab.get_char_vocab_size()))
    logger.info(' vocab size {} '.format(vocab.get_word_vocab()))
    unfiltered_vocab_size = vocab.size()
    unfiltered_char_size = vocab.get_char_vocab_size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    vocab.filter_chars_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))
def train(FLAGS):
    """ Train a previous or new model. """
    # Data paths
    vocab_path = os.path.join(basedir, 'data/vocab.txt')
    train_data_path = os.path.join(basedir, 'data/train.p')
    validation_data_path = os.path.join(basedir, 'data/validation.p')
    vocab = Vocab(vocab_path)
    # FLAGS.num_classes = 5

    # Load embeddings (if using GloVe); otherwise feed zeros and let the model learn them.
    if FLAGS.embedding == 'glove':
        with open(os.path.join(basedir, 'data/embeddings.p'), 'rb') as f:
            embeddings = pickle.load(f)
        FLAGS.vocab_size = len(embeddings)
    else:
        embeddings = np.zeros((len(vocab), FLAGS.emb_size))
        FLAGS.vocab_size = len(vocab)

    with tf.Session() as sess:
        # Create|reload model
        imdb_model = create_model(sess, FLAGS, len(vocab))
        summaries = tf.summary.merge_all()
        writer = tf.summary.FileWriter(
            os.path.join("log", time.strftime("%Y-%m-%d-%H-%M-%S")),
            sess.graph)
        # tf.initialize_all_variables().run()

        # Store attention score history for a few samples
        # content = {"review": None, "label": None, "review_len": None, "attn_scores": None}
        attn_history_word = {
            "sample_%i" % i: {
                "review": None,
                "label": None,
                "review_len": None,
                "attn_scores": None
            }
            for i in range(FLAGS.batch_size)
        }
        # print(attn_history_word)

        # Start training
        for train_epoch_num, train_epoch in \
                enumerate(generate_epoch(
                    train_data_path, FLAGS.num_epochs, FLAGS.batch_size)):
            logger.info("==> EPOCH: %s ", train_epoch_num)
            train_acc_count = 0
            train_batch_total = 0
            valid_acc_count = 0
            valid_batch_total = 0

            for train_batch_num, (total_batch, batch_features, batch_seq_lens) in \
                    enumerate(train_epoch):
                # sys.exit()
                batch_reviews, batch_labels = batch_features
                batch_review_lens = batch_seq_lens

                # Display shapes once
                # for v in batch_reviews:
                #     print("TRAIN EPOCH:", train_epoch_num, "LABEL", batch_labels,
                #           ''.join(vc.ids_to_tokens(v, vocab=vocab)))
                if (train_epoch_num == 0 and train_batch_num == 0):
                    logger.info("Reviews: :%s", np.shape(batch_reviews))
                    logger.info("Labels: %s", np.shape(batch_labels))
                    logger.info("Review lens: %s", np.shape(batch_review_lens))

                _, train_logits, train_loss, train_acc, lr, attn_word_scores, \
                    attn_cmt_scores, logits, outputs, prob, distance = \
                    imdb_model.train(
                        sess=sess,
                        batch_reviews=batch_reviews,
                        batch_labels=batch_labels,
                        batch_review_lens=batch_review_lens,
                        embeddings=embeddings,
                        keep_prob=FLAGS.keep_prob,
                    )

                logger.info(
                    "[TRAIN]: %i/%i|[ACC]: %.3f|[LOSS]: %.3f|[LABELS] : %i| %s|%s",
                    total_batch, train_batch_num, train_acc, train_loss,
                    batch_labels[0], distance, prob)
                train_batch_total += 1
                if train_acc > 0.99:
                    train_acc_count += 1

                if batch_labels[0] == 3:
                    for i in range(FLAGS.batch_size):
                        sample = "sample_%i" % i
                        attn_history_word[sample]["review"] = batch_reviews[i]
                        attn_history_word[sample]["label"] = batch_labels
                        attn_history_word[sample]["review_len"] = \
                            batch_review_lens[i]
                        attn_history_word[sample]["attn_scores"] = \
                            attn_word_scores[i]
                    attn_history_comment = attn_cmt_scores

            for valid_epoch_num, valid_epoch in \
                    enumerate(generate_epoch(
                        data_path=validation_data_path,
                        num_epochs=1,
                        batch_size=FLAGS.batch_size,
                    )):
                for valid_batch_num, (total_batch, valid_batch_features, valid_batch_seq_lens) in \
                        enumerate(valid_epoch):
                    valid_batch_reviews, valid_batch_labels = valid_batch_features
                    valid_batch_review_lens = valid_batch_seq_lens

                    # for v in valid_batch_reviews:
                    #     print("VALID EPOCH:", train_epoch_num, "LABEL", valid_batch_labels,
                    #           ''.join(vc.ids_to_tokens(v, vocab=vocab)))
                    valid_logits, valid_loss, valid_acc, prob = imdb_model.eval(
                        sess=sess,
                        batch_reviews=valid_batch_reviews,
                        batch_labels=valid_batch_labels,
                        batch_review_lens=valid_batch_review_lens,
                        embeddings=embeddings,
                        keep_prob=1.0,  # no dropout for val|test
                    )
                    logger.info(
                        "[VALID]: %i| [ACC]: %.3f | [LOSS]: %.6f,| [LABELS] : %i |%s",
                        valid_batch_num, valid_acc, valid_loss,
                        valid_batch_labels[0], prob)
                    valid_batch_total += 1
                    if valid_acc > 0.99:
                        valid_acc_count += 1

            logger.info(
                "[EPOCH]: %i, [LR]: %.6e, [TRAIN ACC]: %.3f, [VALID ACC]: %.3f "
                "[TRAIN LOSS]: %.6f, [VALID LOSS]: %.6f ",
                train_epoch_num, lr, train_acc_count / train_batch_total,
                valid_acc_count / valid_batch_total, train_loss, valid_loss)

            # Save the model (maybe)
            if ((train_epoch_num == (FLAGS.num_epochs - 1)) or
                    ((train_epoch_num % FLAGS.save_every == 0) and
                     (train_epoch_num > 0))):
                # Make parent ckpt dir if it does not exist
                if not os.path.isdir(
                        os.path.join(basedir, FLAGS.data_dir, 'ckpt')):
                    os.makedirs(os.path.join(basedir, FLAGS.data_dir, 'ckpt'))
                # Make child ckpt dir for this specific model
                if not os.path.isdir(os.path.join(basedir, FLAGS.ckpt_dir)):
                    os.makedirs(os.path.join(basedir, FLAGS.ckpt_dir))
                checkpoint_path = os.path.join(
                    basedir, FLAGS.ckpt_dir, "%s.ckpt" % FLAGS.model_name)

                logger.info("==> Saving the model.")
                imdb_model.saver.save(sess, checkpoint_path,
                                      global_step=imdb_model.global_step)

                attn_word_file = os.path.join(basedir, FLAGS.ckpt_dir,
                                              'attn_word_history.p')
                with open(attn_word_file, 'wb') as f:
                    pickle.dump(attn_history_word, f)
                attn_comment_file = os.path.join(basedir, FLAGS.ckpt_dir,
                                                 'attn_cmt_history.p')
                with open(attn_comment_file, 'wb') as f:
                    pickle.dump(attn_history_comment, f)
class ShakespeareModern(Dataset):
    def __init__(self, train_domain_A_path, test_domain_A_path,
                 train_domain_B_path, test_domain_B_path,
                 name='ShakespeareModern', mode='train'):
        self.train_domain_A_path = train_domain_A_path
        self.test_domain_A_path = test_domain_A_path
        self.train_domain_B_path = train_domain_B_path
        self.test_domain_B_path = test_domain_B_path

        self.vocab = Vocab(name)
        self.mode = mode

        self.domain_A_max_len = 0
        self.domain_B_max_len = 0

        self.train_domain_A_data = self.load_and_preprocess_data(
            self.train_domain_A_path, domain='A')
        self.test_domain_A_data = self.load_and_preprocess_data(
            self.test_domain_A_path, domain='A')
        self.train_domain_B_data = self.load_and_preprocess_data(
            self.train_domain_B_path, domain='B')
        self.test_domain_B_data = self.load_and_preprocess_data(
            self.test_domain_B_path, domain='B')
        # self.max_len = 0

    def load_and_preprocess_data(self, path, domain):
        with open(path) as f:
            data = f.readlines()

        for idx, sentence in enumerate(data):
            sentence = normalize_string(sentence)
            self.vocab.add_sentence(sentence, domain)
            data[idx] = get_idx_sentence(self.vocab, sentence)

        max_len = 0
        for sentence in data:
            max_len = max(max_len, len(sentence))
        if (domain == 'A'):
            self.domain_A_max_len = max(self.domain_A_max_len, max_len)
        else:
            self.domain_B_max_len = max(self.domain_B_max_len, max_len)
        self.max_len = max(self.domain_A_max_len, self.domain_B_max_len)

        # padded_sequences = np.ndarray((self.max_len, len(data), 1))
        sentence_tensors = []
        for idx, sentence in enumerate(data):
            sentence_tensors.append(
                torch.Tensor(sentence).type(torch.LongTensor))
        return sentence_tensors  # torch.from_numpy(padded_sequences.astype(np.int64))

    def get_addn_feats(self, sentence):
        net_score = 0
        domain_A_count = 0
        domain_B_count = 0
        sent_len = 0
        for word in sentence:
            word = word.item()
            if word not in self.vocab.tokens:
                sent_len += 1
                word = self.vocab.idx2wrd[word]
                if word in self.vocab.domain_A_vocab and word in self.vocab.domain_B_vocab:
                    net_score += self.vocab.domain_A_vocab[word] - \
                        self.vocab.domain_B_vocab[word]
                elif word in self.vocab.domain_A_vocab:
                    net_score += self.vocab.domain_A_vocab[word]
                    domain_A_count += 1
                elif word in self.vocab.domain_B_vocab:
                    net_score -= self.vocab.domain_B_vocab[word]
                    domain_B_count += 1
        return torch.Tensor([net_score, domain_A_count,
                             domain_B_count]) / sent_len

    def __getitem__(self, index):
        if self.mode == 'test':
            return self.test_domain_A_data[index], self.get_addn_feats(
                self.test_domain_A_data[index]
            ), self.test_domain_B_data[index], self.get_addn_feats(
                self.test_domain_B_data[index])
        else:
            return self.train_domain_A_data[index], self.get_addn_feats(
                self.train_domain_A_data[index]
            ), self.train_domain_B_data[index], self.get_addn_feats(
                self.train_domain_B_data[index])

    def __len__(self):
        if self.mode == 'test':
            return max(len(self.test_domain_A_data),
                       len(self.test_domain_B_data))
        else:
            return max(len(self.train_domain_A_data),
                       len(self.train_domain_B_data))


# train_domain_A_path = '../dataset/train.original.nltktok'
# test_domain_A_path = '../dataset/test.original.nltktok'
# train_domain_B_path = '../dataset/train.modern.nltktok'
# test_domain_B_path = '../dataset/test.modern.nltktok'
# sm = ShakespeareModern(train_domain_A_path, test_domain_A_path, train_domain_B_path, test_domain_B_path)
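# Usage sketch (illustrative, not from the original file): the dataset can be
# wrapped in a standard PyTorch DataLoader. The .nltktok paths mirror the
# commented-out example above and are assumed, not verified; batch_size=1 is
# used because the sentence tensors have variable lengths.
#
# from torch.utils.data import DataLoader
# sm = ShakespeareModern('../dataset/train.original.nltktok',
#                        '../dataset/test.original.nltktok',
#                        '../dataset/train.modern.nltktok',
#                        '../dataset/test.modern.nltktok',
#                        mode='train')
# loader = DataLoader(sm, batch_size=1, shuffle=True)
# domain_A, feats_A, domain_B, feats_B = next(iter(loader))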
    for token in sample['tokens']:
        yield token


if __name__ == '__main__':
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger("brc")
    logger.setLevel(logging.INFO)

    brc_data = DatasetReader(
        '/Users/apple/Downloads/news_qa/news_data_0827/news_data_0827_1w.csv')
    sys.exit(1)

    from data.vocab import Vocab
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    unfiltered_vocab_size = vocab.size()
    # vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    brc_data.convert_to_ids(vocab)
    train_batches = brc_data.gen_mini_batches('train', batch_size=16)
    for batch in train_batches:
        print(batch['in'])
        sys.exit(1)
def main(args):
    print("Main is running!")

    squad_path = os.path.join(os.getcwd(), 'data', 'squad', args.version)
    dev_eval_dict_path_from_tokens = os.path.join(
        squad_path, 'dev_eval_dict_from_tokens.json')
    if 'dev_eval_dict_from_tokens.json' not in os.listdir(squad_path):
        print("Generating evaluation dictionary... ", end="")
        make_eval_dict_tokens(args.dev_data_filepath,
                              dev_eval_dict_path_from_tokens)
        print("Done")

    # set device
    if args.use_gpu:
        device_id = args.device_id
        device = torch.device("cuda:{}".format(args.device_id)
                              if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device("cpu")
    n_gpu = torch.cuda.device_count()
    if torch.cuda.is_available():
        print("device is cuda, # cuda is: ", n_gpu)
    else:
        print("device is cpu")

    # Dataset
    train_json = load_json(args.train_data_filepath)
    eval_json = load_json(args.dev_data_filepath)
    train_data = pd.DataFrame(parse_data(train_json))
    eval_data = pd.DataFrame(parse_data(eval_json))
    header = list(train_data.columns)

    torch.manual_seed(12)
    common_vocab = Vocab(args.language, args.common_embeddings_filepath,
                         args.emb_size)
    vocab = Vocab(args.language, args.word_embeddings_filepath, args.emb_size,
                  base=common_vocab)

    train_dataloader = DataLoader(MultilingualDataset(train_data, vocab),
                                  shuffle=True,
                                  batch_size=args.batch_size,
                                  collate_fn=generate_batch)
    val_dataloader = DataLoader(MultilingualDataset(eval_data, vocab),
                                shuffle=True,
                                batch_size=args.batch_size,
                                collate_fn=generate_batch)

    # get model
    model = QANet(device, args.emb_size, args.d_model, args.context_limit,
                  args.question_limit, args.p_dropout)

    # exponential moving average
    ema = EMA(args.decay)
    if args.use_ema:
        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.register(name, param.data)
    model = model.to(device)

    # optimizer & scheduler
    lr = args.lr
    base_lr = 1.0
    warm_up = args.lr_warm_up_num
    params = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(lr=base_lr,
                                 betas=(args.beta1, args.beta2),
                                 eps=1e-7,
                                 weight_decay=3e-7,
                                 params=params)
    cr = lr / math.log(warm_up)
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log(ee + 1) if ee < warm_up else lr)

    # set loss
    criterion = nn.NLLLoss(reduction='mean')  # LogSoftmax applied in Pointer

    # checkpoint identifier
    identifier = type(model).__name__ + '_'

    # training and evaluation
    trainer = Trainer(args, device, model, optimizer, scheduler, criterion,
                      train_dataloader, val_dataloader, ema,
                      dev_eval_dict_path_from_tokens, identifier)
    trainer.train()
parser.add_argument("--token_type", type=str, default="char", choices=["char", "token"]) parser.add_argument("--src_vocab_path", type=str, required=True, help="白话文词表路径") parser.add_argument("--trg_vocab_path", type=str, required=True, help="文言文词表路径") parser = ModelInterface.add_trainer_args(parser) args = parser.parse_args() if args.token_type == "char": src_tokenizer = CharTokenizer() elif args.token_type == "token": src_tokenizer = VernacularTokenTokenizer() src_tokenizer.load_vocab(args.src_vocab_path) trg_vocab = Vocab() trg_vocab.load(args.trg_vocab_path) model = ModelInterface.load_from_checkpoint( args.checkpoint_path, src_vocab=src_tokenizer.vocab, trg_vocab=trg_vocab, ) model = model.eval() while True: sent = input("原始白话文:") input_token_list = src_tokenizer.tokenize(sent, map_to_id=True) res_sent = model.inference( torch.LongTensor([input_token_list]),
if __name__ == '__main__':
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger("brc")
    logger.setLevel(logging.INFO)

    # brc_data = DatasetReader(train_file='./dataset/seg_train_data_20w.txt',
    #                          dev_file='./dataset/seg_dev_data_20w.txt',
    #                          # test_file='./dataset/test_data'
    #                          )
    brc_data = DatasetReader(train_file='../dataset/train_yes_no_8k.txt',
                             dev_file='../dataset/dev_yes_no_8k.txt')

    from data.vocab import Vocab
    vocab = Vocab(lower=True)
    import sys
    for word in brc_data.word_iter(None):
        vocab.add(word)
        for char in word:
            vocab.add_char(char)

    logger.info(' char size {}'.format(vocab.get_char_vocab_size()))
    logger.info(' vocab size {} '.format(vocab.get_word_vocab()))

    unfiltered_vocab_size = vocab.size()
    unfiltered_char_size = vocab.get_char_vocab_size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    vocab.filter_chars_by_cnt(min_cnt=2)

    brc_data.convert_to_ids(vocab)