def build_word_dict(args, examples):
    """Return a word dictionary from question and document words in
    provided examples.
    """
    word_dict = Dictionary()
    for w in load_words(args, examples):
        word_dict.add(w)
    return word_dict
def build_char_dict(args, examples):
    """Return a char dictionary from question and document words in
    provided examples.
    """
    char_dict = Dictionary()
    for c in load_chars(args, examples):
        char_dict.add(c)
    return char_dict
def _insertfull(iterable):
    for w in iterable:
        w = Dictionary.normalize(w)
        for c in w:
            c = Dictionary.normalize(c)
            if valid_chars and c not in valid_chars:
                continue
            chars.add(c)
def _insert(iterable):
    for cs in iterable:
        for c in cs:
            c = Dictionary.normalize(c)
            if valid_chars and c not in valid_chars:
                continue
            chars.add(c)
def index_embedding_words(embedding_file):
    """Put all the words in embedding_file into a set."""
    words = set()
    with open(embedding_file) as f:
        for line in f:
            w = Dictionary.normalize(line.rstrip().split(' ')[0])
            words.add(w)
    return words
def index_embedding_chars(char_embedding_file):
    """Put all the chars in char_embedding_file into a set."""
    chars = set()
    with open(char_embedding_file) as f:
        for line in f:
            c = Dictionary.normalize(line.rstrip().split(' ')[0])
            chars.add(c)
    return chars
def top_question_words(args, examples, word_dict):
    """Count and return the most common question words in provided examples."""
    word_count = Counter()
    for ex in examples:
        for w in ex['question']:
            w = Dictionary.normalize(w)
            if w in word_dict:
                word_count.update([w])
    return word_count.most_common(args.tune_partial)
def opts2params(opts, dictionary: data.Dictionary):
    """Convert command line options to a dictionary to construct a model."""
    params = {
        "rnn_type": opts.rnn_type,
        "direction": opts.direction,
        "tok_len": dictionary.tok_len(),
        "tok_emb": opts.tok_emb,
        "tok_hid": opts.tok_hid,
        "char_len": dictionary.char_len(),
        "char_emb": opts.char_emb,
        "char_hid": opts.char_hid,
        "char_kmin": opts.char_kmin,
        "char_kmax": opts.char_kmax,
        "wo_char": opts.wo_char,
        "wo_tok": opts.wo_tok,
        "nlayers": opts.nlayers,
        "dropout": opts.dropout,
        "init_range": opts.init_range,
        "tied": opts.tied
    }
    return params
def __set_corpus(self):
    pre_dict = Dictionary()
    for lines in self.text_list:
        for line in lines:
            if len(line) > 0:
                words = line.split()
                # tokens += len(words)
                for word in words:
                    pre_dict.add_word(word)
    # keep only words that appear more than 10 times
    pro_dict = Dictionary()
    for key in pre_dict.count:
        if pre_dict.count[key] > 10:
            pro_dict.add_word(key)
    self.corpus = pro_dict
def index_embedding_words(embedding_file):
    """Put all the words in embedding_file into a set."""
    words = set()
    counter = 0
    try:
        with open(embedding_file, encoding="utf-8") as f:
            for line in f:
                counter += 1
                w = Dictionary.normalize(line.rstrip().split(' ')[0])
                words.add(w)
    except Exception:
        print("An exception occurred on word " + str(counter))
    return words
def load(filename, new_args=None, normalize=True):
    logger.info('Loading model %s' % filename)
    saved_params = torch.load(filename,
                              map_location=lambda storage, loc: storage)
    word_dict = saved_params['word_dict']
    try:
        char_dict = saved_params['char_dict']
    except KeyError:
        char_dict = Dictionary()
    feature_dict = saved_params['feature_dict']
    state_dict = saved_params['state_dict']
    args = saved_params['args']
    if new_args:
        args = override_model_args(args, new_args)
    return DocReader(args, word_dict, char_dict, feature_dict, state_dict, normalize)
def init_vocab(self, path, language):
    """Initialize an instance of Dictionary, which creates (or retrieves)
    the dictionary associated with the data used.
    """
    self.vocab = Dictionary(path, language)
def forward(self, data):
    embed = map(self.embed, data)
    if self.encoder == 'BOTH':
        c_post, c_cmnt, c_neg = map(self.cnn, embed)
        r_post, r_cmnt, r_neg = map(self.rnn, embed)
        post_enc = torch.cat((c_post, r_post), 1)
        cmnt_enc = torch.cat((c_cmnt, r_cmnt), 1)
        neg_enc = torch.cat((c_neg, r_neg), 1)
    else:
        post_enc, cmnt_enc, neg_enc = map(self.encoder, embed)
    return map(normalize, (post_enc, cmnt_enc, neg_enc))


if __name__ == '__main__':
    dic = Dictionary('./full_dataset/train.vocab')
    batch_size = 10
    seq_len = 30
    cuda = False
    data_iter = DataIter(corpus_path='./full_dataset/tmp.txt',
                         batch_size=batch_size,
                         seq_len=seq_len,
                         dictionary=dic,
                         cuda=cuda)
    ntokens = len(dic)
    enc = DSSM(ntokens=ntokens,
               nemb=300,
               sent_len=seq_len,
               dropout=0.5,
               pre_embed=None,
               encoder='CNN',
def main():
    parser = argparse.ArgumentParser(
        description='Train a neural machine translation model')

    # Training corpus
    corpora_group = parser.add_argument_group(
        'training corpora',
        'Corpora related arguments; specify either monolingual or parallel training corpora (or both)')
    corpora_group.add_argument('--src_path', help='the source language monolingual corpus')
    corpora_group.add_argument('--trg_path', help='the target language monolingual corpus')
    corpora_group.add_argument(
        '--max_sentence_length', type=int, default=90,
        help='the maximum sentence length for training (defaults to 90)')

    # Embeddings/vocabulary
    embedding_group = parser.add_argument_group(
        'embeddings',
        'Embedding related arguments; either give pre-trained cross-lingual embeddings, '
        'or a vocabulary and embedding dimensionality to randomly initialize them')
    embedding_group.add_argument('--src_vocabulary', help='the source language vocabulary')
    embedding_group.add_argument('--trg_vocabulary', help='the target language vocabulary')
    embedding_group.add_argument('--embedding_size', type=int, default=0,
                                 help='the word embedding size')

    # Architecture
    architecture_group = parser.add_argument_group(
        'architecture', 'Architecture related arguments')
    architecture_group.add_argument(
        '--layers', type=int, default=2,
        help='the number of encoder/decoder layers (defaults to 2)')
    architecture_group.add_argument(
        '--enc_hid_dim', type=int, default=512,
        help='the number of dimensions for the encoder hidden layer (defaults to 512)')
    architecture_group.add_argument(
        '--dec_hid_dim', type=int, default=512,
        help='the number of dimensions for the decoder hidden layer (defaults to 512)')

    # Optimization
    optimization_group = parser.add_argument_group(
        'optimization', 'Optimization related arguments')
    optimization_group.add_argument('--batch_size', type=int, default=128,
                                    help='the batch size (defaults to 128)')
    optimization_group.add_argument(
        '--learning_rate', type=float, default=0.0002,
        help='the global learning rate (defaults to 0.0002)')
    optimization_group.add_argument(
        '--dropout', metavar='PROB', type=float, default=0.4,
        help='dropout probability for the encoder/decoder (defaults to 0.4)')
    optimization_group.add_argument(
        '--param_init', metavar='RANGE', type=float, default=0.1,
        help='uniform initialization in the specified range '
             '(defaults to 0.1, 0 for module specific default initialization)')
    optimization_group.add_argument(
        '--iterations', type=int, default=50,
        help='the number of training iterations (defaults to 50)')

    # Model saving
    saving_group = parser.add_argument_group(
        'model saving', 'Arguments for saving the trained model')
    saving_group.add_argument('--save_path', metavar='PREFIX',
                              help='save models with the given prefix')
    saving_group.add_argument('--save_interval', type=int, default=0,
                              help='save intermediate models at this interval')
    saving_group.add_argument('--model_init_path', help='model init path')

    # Logging/validation
    logging_group = parser.add_argument_group(
        'logging', 'Logging and validation arguments')
    logging_group.add_argument('--log_interval', type=int, default=1000,
                               help='log at this interval (defaults to 1000)')
    logging_group.add_argument('--validate_batch_size', type=int, default=1,
                               help='the validation batch size (defaults to 1)')
    corpora_group.add_argument('--inference_output',
                               help='the file to write inference output to')
    corpora_group.add_argument('--validation_src_path',
                               help='the source language validation corpus')
    corpora_group.add_argument('--validation_trg_path',
                               help='the target language validation corpus')

    # Other
    parser.add_argument(
        '--encoding', default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--cuda', default=False, action='store_true', help='use cuda')
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--type", type=str, default='train',
                        help="type: train/inference/debug")
    args = parser.parse_args()
    print(args)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    src_dictionary = Dictionary(
        [word.strip() for word in open(args.src_vocabulary).readlines()])
    trg_dictionary = Dictionary(
        [word.strip() for word in open(args.trg_vocabulary).readlines()])

    def init_weights(m):
        for name, param in m.named_parameters():
            if 'weight' in name:
                nn.init.normal_(param.data, mean=0, std=0.01)
            else:
                nn.init.constant_(param.data, 0)

    if not args.model_init_path:
        attn = Attention(args.enc_hid_dim, args.dec_hid_dim)
        enc = Encoder(src_dictionary.size(), args.embedding_size, args.enc_hid_dim,
                      args.dec_hid_dim, args.dropout, src_dictionary.PAD)
        dec = Decoder(trg_dictionary.size(), args.embedding_size, args.enc_hid_dim,
                      args.dec_hid_dim, args.dropout, attn)
        s2s = Seq2Seq(enc, dec, src_dictionary.PAD, device)
        parallel_model = Parser(src_dictionary, trg_dictionary, s2s, device)
        parallel_model.apply(init_weights)
    else:
        print(f"load init model from {args.model_init_path}")
        parallel_model = torch.load(args.model_init_path)
    parallel_model = parallel_model.to(device)

    if args.type == TEST:
        test_dataset = treeDataset(args.validation_src_path, args.validation_trg_path)
        test_dataloader = DataLoader(test_dataset, shuffle=False,
                                     batch_size=args.validate_batch_size,
                                     collate_fn=collate_fn)
        hit, total, acc = evaluate_iter_loss2(parallel_model, test_dataloader,
                                              src_dictionary, trg_dictionary, device)
        print(f'hit: {hit: d} | total: {total: d} | acc: {acc: f}', flush=True)
    elif args.type == INFERENCE:
        test_dataset = customDataset(args.validation_src_path, args.validation_trg_path)
        test_dataloader = DataLoader(test_dataset, shuffle=False,
                                     batch_size=args.validate_batch_size)
        hit, total, acc = evaluate_iter_acc(parallel_model, test_dataloader,
                                            src_dictionary, trg_dictionary, device,
                                            args.inference_output)
        print(f'hit: {hit: d} | total: {total: d} | acc: {acc: f}', flush=True)
    elif args.type == DEBUG:
        test_dataset = treeDataset(args.validation_src_path, args.validation_trg_path)
        test_dataloader = DataLoader(test_dataset, shuffle=False,
                                     batch_size=args.validate_batch_size,
                                     collate_fn=collate_fn)
        hit, total, acc = debug_iter(parallel_model, test_dataloader,
                                     src_dictionary, trg_dictionary, device)
        print(f'hit: {hit: d} | total: {total: d} | acc: {acc: f}', flush=True)
    else:
        train_dataset = treeDataset(args.src_path, args.trg_path)
        train_dataloader = DataLoader(train_dataset, shuffle=True,
                                      batch_size=args.batch_size,
                                      collate_fn=collate_fn)
        test_dataset = treeDataset(args.validation_src_path, args.validation_trg_path)
        test_dataloader = DataLoader(test_dataset, shuffle=False,
                                     batch_size=args.validate_batch_size,
                                     collate_fn=collate_fn)
        train(src_dictionary, trg_dictionary, train_dataloader, test_dataloader,
              parallel_model, device, args)
def test_load(self):
    Dictionary.load(dict_file)
torch.manual_seed(args.seed)
random.seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

# Config to run
config = Config()
if os.path.isfile(args.save):
    checkpoint = torch.load(args.save)
    if 'config' in checkpoint:
        print("Loading saved config")
        config = checkpoint['config']
print(config)

# Dictionary and corpus
dictionary = Dictionary()
training_corpus = Corpus(args.data + "/train.txt",
                         dictionary,
                         create_dict=True,
                         use_cuda=args.cuda,
                         n_gram=config.n_gram,
                         context_mode=config.context_mode)
validation_corpus = Corpus(args.data + "/valid.txt",
                           dictionary,
                           create_dict=True,
                           use_cuda=args.cuda,
                           n_gram=config.n_gram,
                           context_mode=config.context_mode)

# TensorboardX object
writer = SummaryWriter("saved_runs/" + args.save)
def main(args: argparse.Namespace):
    # Load input data
    with open(args.train_metadata, 'r') as f:
        train_posts = json.load(f)
    with open(args.val_metadata, 'r') as f:
        val_posts = json.load(f)

    # Load labels
    labels = {}
    with open(args.label_intent, 'r') as f:
        intent_labels = json.load(f)
        labels['intent'] = {}
        for label in intent_labels:
            labels['intent'][label] = len(labels['intent'])
    with open(args.label_semiotic, 'r') as f:
        semiotic_labels = json.load(f)
        labels['semiotic'] = {}
        for label in semiotic_labels:
            labels['semiotic'][label] = len(labels['semiotic'])
    with open(args.label_contextual, 'r') as f:
        contextual_labels = json.load(f)
        labels['contextual'] = {}
        for label in contextual_labels:
            labels['contextual'][label] = len(labels['contextual'])

    # Build dictionary from training set
    train_captions = []
    for post in train_posts:
        train_captions.append(post['orig_caption'])
    dictionary = Dictionary(tokenizer_method="TreebankWordTokenizer")
    dictionary.build_dictionary_from_captions(train_captions)

    # Set up torch device
    if 'cuda' in args.device and torch.cuda.is_available():
        device = torch.device(args.device)
        kwargs = {'pin_memory': True}
    else:
        device = torch.device('cpu')
        kwargs = {}

    # Set up number of workers
    num_workers = min(multiprocessing.cpu_count(), args.num_workers)

    # Set up data loaders differently based on the task
    # TODO: Extend to ELMo + word2vec etc.
    if args.type == 'image_only':
        train_dataset = ImageOnlyDataset(train_posts, labels)
        val_dataset = ImageOnlyDataset(val_posts, labels)
        train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                        batch_size=args.batch_size,
                                                        shuffle=args.shuffle,
                                                        num_workers=num_workers,
                                                        collate_fn=collate_fn_pad_image_only,
                                                        **kwargs)
        val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                      batch_size=1,
                                                      num_workers=num_workers,
                                                      collate_fn=collate_fn_pad_image_only,
                                                      **kwargs)
    elif args.type == 'image_text':
        train_dataset = ImageTextDataset(train_posts, labels, dictionary)
        val_dataset = ImageTextDataset(val_posts, labels, dictionary)
        train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                        batch_size=args.batch_size,
                                                        shuffle=args.shuffle,
                                                        num_workers=num_workers,
                                                        collate_fn=collate_fn_pad_image_text,
                                                        **kwargs)
        val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                      batch_size=1,
                                                      num_workers=num_workers,
                                                      collate_fn=collate_fn_pad_image_text,
                                                      **kwargs)
    elif args.type == 'text_only':
        train_dataset = TextOnlyDataset(train_posts, labels, dictionary)
        val_dataset = TextOnlyDataset(val_posts, labels, dictionary)
        train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                        batch_size=args.batch_size,
                                                        shuffle=args.shuffle,
                                                        num_workers=num_workers,
                                                        collate_fn=collate_fn_pad_text_only,
                                                        **kwargs)
        val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                      batch_size=1,
                                                      num_workers=num_workers,
                                                      collate_fn=collate_fn_pad_text_only,
                                                      **kwargs)

    # Set up the model
    model = Model(vocab_size=dictionary.size()).to(device)

    # Set up an optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer,
        step_size=args.lr_scheduler_step_size,
        gamma=args.lr_scheduler_gamma)  # decay by 0.1 every 15 epochs

    # Set up loss function
    loss_fn = torch.nn.CrossEntropyLoss()

    # Setup tensorboard
    if args.tensorboard:
        writer = tensorboard.SummaryWriter(log_dir=args.log_dir + "/" + args.name,
                                           flush_secs=1)
    else:
        writer = None

    # Training loop
    if args.classification == 'intent':
        keys = ['intent']
    elif args.classification == 'semiotic':
        keys = ['semiotic']
    elif args.classification == 'contextual':
        keys = ['contextual']
    elif args.classification == 'all':
        keys = ['intent', 'semiotic', 'contextual']
    else:
        raise ValueError("args.classification doesn't exist.")

    best_auc_ovr = 0.0
    best_auc_ovo = 0.0
    best_acc = 0.0
    best_model = None
    best_optimizer = None
    best_scheduler = None
    for epoch in range(args.epochs):
        for mode in ["train", "eval"]:
            # Set up a progress bar
            if mode == "train":
                pbar = tqdm.tqdm(enumerate(train_data_loader),
                                 total=len(train_data_loader))
                model.train()
            else:
                pbar = tqdm.tqdm(enumerate(val_data_loader),
                                 total=len(val_data_loader))
                model.eval()

            total_loss = 0
            label = dict.fromkeys(keys, np.array([], dtype=int))
            pred = dict.fromkeys(keys, None)
            for _, batch in pbar:
                if 'caption' not in batch:
                    caption_data = None
                else:
                    caption_data = batch['caption'].to(device)
                if 'image' not in batch:
                    image_data = None
                else:
                    image_data = batch['image'].to(device)
                label_batch = {}
                for key in keys:
                    label_batch[key] = batch['label'][key].to(device)

                if mode == "train":
                    model.zero_grad()

                pred_batch = model(image_data, caption_data)
                for key in keys:
                    label[key] = np.concatenate((label[key],
                                                 batch['label'][key].cpu().numpy()))
                    x = pred_batch[key].detach().cpu().numpy()
                    x_max = np.max(x, axis=1).reshape(-1, 1)
                    z = np.exp(x - x_max)
                    prediction_scores = z / np.sum(z, axis=1).reshape(-1, 1)
                    if pred[key] is not None:
                        pred[key] = np.vstack((pred[key], prediction_scores))
                    else:
                        pred[key] = prediction_scores

                loss_batch = {}
                loss = None
                for key in keys:
                    loss_batch[key] = loss_fn(pred_batch[key], label_batch[key])
                    if loss is None:
                        loss = loss_batch[key]
                    else:
                        loss += loss_batch[key]
                total_loss += loss.item()

                if mode == "train":
                    loss.backward()
                    optimizer.step()

            # Terminate the progress bar
            pbar.close()

            # Update lr scheduler
            if mode == "train":
                scheduler.step()

            for key in keys:
                auc_score_ovr = roc_auc_score(label[key], pred[key], multi_class='ovr')  # pylint: disable-all
                auc_score_ovo = roc_auc_score(label[key], pred[key], multi_class='ovo')  # pylint: disable-all
                accuracy = accuracy_score(label[key], np.argmax(pred[key], axis=1))
                print("[{} - {}] [AUC-OVR={:.3f}, AUC-OVO={:.3f}, ACC={:.3f}]".format(
                    mode, key, auc_score_ovr, auc_score_ovo, accuracy))
                if mode == "eval":
                    best_auc_ovr = max(best_auc_ovr, auc_score_ovr)
                    best_auc_ovo = max(best_auc_ovo, auc_score_ovo)
                    best_acc = max(best_acc, accuracy)
                    best_model = model
                    best_optimizer = optimizer
                    best_scheduler = scheduler
                if writer:
                    writer.add_scalar('AUC-OVR/{}-{}'.format(mode, key), auc_score_ovr, epoch)
                    writer.add_scalar('AUC-OVO/{}-{}'.format(mode, key), auc_score_ovo, epoch)
                    writer.add_scalar('ACC/{}-{}'.format(mode, key), accuracy, epoch)
                    writer.flush()

            if writer:
                writer.add_scalar('Loss/{}'.format(mode), total_loss, epoch)
                writer.flush()
            print("[{}] Epoch {}: Loss = {}".format(mode, epoch, total_loss))

    hparam_dict = {
        'train_split': args.train_metadata,
        'val_split': args.val_metadata,
        'lr': args.lr,
        'epochs': args.epochs,
        'batch_size': args.batch_size,
        'num_workers': args.num_workers,
        'shuffle': args.shuffle,
        'lr_scheduler_gamma': args.lr_scheduler_gamma,
        'lr_scheduler_step_size': args.lr_scheduler_step_size,
    }
    metric_dict = {
        'AUC-OVR': best_auc_ovr,
        'AUC-OVO': best_auc_ovo,
        'ACC': best_acc
    }
    if writer:
        writer.add_hparams(hparam_dict=hparam_dict, metric_dict=metric_dict)
        writer.flush()

    Path(args.output_dir).mkdir(exist_ok=True)
    torch.save({
        'hparam_dict': hparam_dict,
        'metric_dict': metric_dict,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
    }, Path(args.output_dir) / '{}.pt'.format(args.name))
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset, DataLoader
from data import Dictionary
from custom_embedder_recurrent import CustomEmbedder
from optimizer import RAdam
import tqdm
import transformers

tokenizer = transformers.GPT2Tokenizer.from_pretrained("gpt2")
gpt2 = transformers.GPT2Model.from_pretrained("gpt2")
embedding = gpt2.wte
vocab = tokenizer.decoder

dictionary = Dictionary()
dictionary.word2idx = {v: int(k) for k, v in vocab.items()}
dictionary.idx2word = {int(k): v for k, v in vocab.items()}

model = CustomEmbedder(dictionary, 768)
embedding.weight.requires_grad = False
model = model.cuda()
optimizer = RAdam(model.parameters(), lr=0.001)
writer = SummaryWriter()


class EDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
}

ROOT = 'C:\\Users\\lenovo\\.qqbot-tmp\\plugins\\'

import configparser
config = configparser.ConfigParser()
config.read(ROOT + 'app.conf', encoding='utf8')
dict_path = config.get('DICTIONARY', 'dict_path')
model_save_path = config.get('MODEL', 'save_path')
embedding_dim = int(config.get('MODEL', 'embedding_dim'))
hidden_dim = int(config.get('MODEL', 'hidden_dim'))
num_layers = int(config.get('MODEL', 'num_layers'))

jieba.load_userdict(dict_path)
_dict = Dictionary([])
model_dict_path = os.path.join(model_save_path, 'freq1.dict')
_dict.load(model_dict_path)
rep = Replier(model_save_path, _dict, num_layers, embedding_dim, hidden_dim)
rep.load('freq1_(3.562772035598755)')

REPLY_TIME = {}
REPLY_TIME['ME'] = 0


def get_errmsg(key):
    limit = len(MSG[key]) - 1
    idx = random.randint(0, limit)
    return MSG[key][idx]
from data import Dictionary, Word

dict = Dictionary()
words = [
    Word('a', []),
    Word('b', [{
        'tag': 'Noun',
        'defs': [{
            'def': 'abcdf',
            'examples': ['1', '2']
        }]
    }])
]
for word in words:
    dict.add(word)
dict.save('test_dict.yaml')
class SememeDictionary(object):
    def __init__(self, path=None):
        if path is None:
            path = 'data/HowNet.txt'
        self.word2idx = {}
        self.idx2word = []
        self.idx2freq = []
        self.idx2senses = []
        self.threshold = -1
        self.sememe_dict = Dictionary()
        self.threshold = 0
        file = open(path)
        phase = 0
        re_chn = re.compile(u'[^\u4e00-\u9fa5]')
        cur_word = ''
        # add sememe for special tokens
        self.add_word('<unk>', ['<unk>'])
        self.add_word('<eos>', ['<eos>'])
        self.add_word('<N>', ['基数'])
        self.add_word('<year>', ['时间', '年', '特定'])
        self.add_word('<date>', ['时间', '月', '特定'])
        self.add_word('<hour>', ['时间', '时', '特定'])
        self.add_word('(', ['标点'])
        self.add_word('『', ['标点'])
        self.add_word('……', ['标点'])
        self.add_word('●', ['标点'])
        self.add_word('《', ['标点'])
        self.add_word('—', ['标点'])
        self.add_word('———', ['标点'])
        self.add_word('』', ['标点'])
        self.add_word('》', ['标点'])
        self.add_word('△', ['标点'])
        self.add_word('、', ['标点'])
        self.add_word(')', ['标点'])
        self.add_word('℃', ['标点'])
        self.add_word('▲', ['标点'])
        for line in file.readlines():
            if line[0:3] == 'NO.':
                phase = 1
                continue
            # new word
            if phase == 1 and line[0:3] == 'W_C':
                phase = 2
                word = line[4:-1]
                if word == '':
                    phase = 0
                else:
                    cur_word = word
                continue
            if phase == 2 and line[0:3] == 'DEF':
                phase = 3
                content = line[4:-1]
                sememes = re_chn.split(content)
                sememe_bag = []
                for sememe in sememes:
                    if sememe != '':
                        sememe_bag += [sememe]
                if cur_word != '':
                    self.add_word(cur_word, sememe_bag)
        self.sememe_dict.idx2freq = [0] * len(self.sememe_dict)

    def senses_belong(self, sememes_bag, senses_bag):
        for i in range(len(senses_bag)):
            if len(set(sememes_bag + senses_bag[i])) == len(sememes_bag)\
                    and len(sememes_bag) == len(senses_bag[i]):
                return True
        return False

    def add_word(self, word, sememes_bag):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.idx2senses.append([])
            self.idx2freq.append(0)
            self.word2idx[word] = len(self.idx2word) - 1
        idx = self.word2idx[word]
        sememe_bag_idx = []
        for sememe in sememes_bag:
            sememe_bag_idx.append(self.sememe_dict.add_word(sememe))
        sememe_bag_idx = list(set(sememe_bag_idx))
        if not self.senses_belong(sememe_bag_idx, self.idx2senses[idx]):
            self.idx2senses[idx].append(sememe_bag_idx)
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)

    def summary(self, print_sememes=False):
        print('=' * 69)
        print('-' * 31 + 'SUMMARY' + '-' * 31)
        print('Number of Sememes: {}'.format(len(self.sememe_dict)))
        print('Number of Words: {}'.format(len(self.idx2word)))
        tot_senses = 0
        tot_sememes = 0
        for i in range(len(self.idx2word)):
            tot_senses += len(self.idx2senses[i])
            for j in range(len(self.idx2senses[i])):
                tot_sememes += len(self.idx2senses[i][j])
        ws_ratio = (tot_senses + 0.0) / len(self.idx2word)
        ss_ratio = (tot_sememes + 0.0) / tot_senses
        print('Mean Senses per Word: {}'.format(ws_ratio))
        print('Mean Sememes per Sense: {}'.format(ss_ratio))
        print('=' * 69)
        if print_sememes:
            print(','.join(self.sememe_dict.idx2word))

    def exist(self, word):
        return word in self.word2idx

    def add_word_f(self, word):
        if word not in self.word2idx:
            raise ValueError("Word doesn't exist")
        idx = self.word2idx[word]
        for sense in self.idx2senses[idx]:
            for sememe in sense:
                self.sememe_dict.idx2freq[sememe] += 1
        self.idx2freq[self.word2idx[word]] += 1

    def query_count(self, word):
        if word not in self.word2idx:
            raise ValueError("Word doesn't exist")
        return self.idx2freq[self.word2idx[word]]

    def freq_le(self, k):
        tot = 0
        for idx in range(len(self.idx2word)):
            if self.idx2freq[idx] < k:
                tot += 1
        return tot

    def freq_ge(self, k):
        tot = 0
        for idx in range(len(self.idx2word)):
            if self.idx2freq[idx] >= k:
                tot += 1
        return tot

    def set_threshold(self, threshold):
        self.threshold = threshold

    def sememe_word_visit(self, word_dict):
        sememe_word = []
        sememe_sense = []
        for i in range(len(self.sememe_dict)):
            sememe_word.append([])
            sememe_sense.append([])
        maximum_senses = 0
        tot_senses = 0
        for word_id in range(len(self.word2idx)):
            if self.idx2freq[word_id] >= self.threshold:
                maximum_senses = max(maximum_senses, len(self.idx2senses[word_id]))
                for sense in self.idx2senses[word_id]:
                    for sememe in sense:
                        sememe_word[sememe].append(word_id)
                        sememe_sense[sememe].append(tot_senses)
                    tot_senses += 1
        tot = 0
        tot_sememes = 0
        max_words = 0
        a = []
        sememe_word_pair = [[], []]
        sememe_sense_pair = [[], []]
        sememe_idx = []
        word_sense = []
        for i in range(len(word_dict)):
            word_sense.append([])
        for i in range(len(self.sememe_dict)):
            cur_str = self.sememe_dict.idx2word[i]
            cur_str += ': '
            words = []
            for j in range(len(sememe_word[i])):
                word_id = sememe_word[i][j]
                sense_id = sememe_sense[i][j]
                words.append(self.idx2word[word_id])
                sememe_word_pair[0].append(tot_sememes)
                sememe_word_pair[1].append(word_dict[self.idx2word[word_id]])
                sememe_sense_pair[0].append(tot_sememes)
                sememe_sense_pair[1].append(sense_id)
                word_sense[word_dict[self.idx2word[word_id]]].append(sense_id)
            tot += len(sememe_word[i])
            max_words = max(max_words, len(sememe_word[i]))
            a += sememe_word[i]
            cur_str += ','.join(words)
            if len(set(sememe_word[i])) > 0:
                sememe_idx.append(tot_sememes)
            else:
                sememe_idx.append(-1)
            tot_sememes += (len(sememe_word[i]) > 0)
        for i in range(len(word_dict)):
            word_sense[i] = list(set(word_sense[i]))
        print('Total words: {}'.format(len(set(a))))
        print('Maximum words per sememe: {}'.format(max_words))
        print('Maximum sense per word: {}'.format(maximum_senses))
        print('Total respective sememes: {}'.format(tot_sememes))
        print('Total sememe-word pairs: {}'.format(tot))
        return sememe_word_pair, sememe_idx, sememe_sense_pair, word_sense

    def visit(self, word, mode='full'):
        if word not in self.word2idx:
            raise ValueError('No word!')
        idx = self.word2idx[word]
        if mode == 'sbag':
            sememes = []
            for sense in self.idx2senses[idx]:
                for sememe in sense:
                    sememes.append(sememe)
            sememes = set(sememes)
            sememes_str = []
            for sememe in sememes:
                sememes_str.append(self.sememe_dict.idx2word[sememe])
            print(word + ':' + ','.join(sememes_str))
        if mode == 'full':
            print('Word: ' + word + ', total {} means'.format(len(self.idx2senses[idx])))
            for i in range(len(self.idx2senses[idx])):
                sememes_list = []
                for j in range(len(self.idx2senses[idx][i])):
                    sememes_list.append(
                        self.sememe_dict.idx2word[self.idx2senses[idx][i][j]])
                sememes = ','.join(sememes_list)
                print('Sense #{}: '.format(i + 1) + sememes)
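# A minimal usage sketch for the SememeDictionary class above, not part of the
# original source. It assumes 'data/HowNet.txt' is present and uses '苹果' as a
# placeholder example word that may or may not appear in that file.
sememe_dict = SememeDictionary('data/HowNet.txt')
sememe_dict.summary()                      # print corpus-level statistics
if sememe_dict.exist('苹果'):
    sememe_dict.add_word_f('苹果')          # bump word and sememe frequencies
    print(sememe_dict.query_count('苹果'))   # frequency seen so far (here: 1)
    sememe_dict.visit('苹果', mode='sbag')   # print the word's sememe bag
sememe_dict.set_threshold(1)               # only words with freq >= 1 are kept by sememe_word_visit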
def _insert(iterable):
    for w in iterable:
        w = Dictionary.normalize(w)
        if valid_words and w not in valid_words:
            continue
        words.add(w)
train_labelized = labelize_reviews(train_sentences, 'TRAIN')
test_labelized = labelize_reviews(test, 'TEST')

size = 400
epoch_num = 30
make_pred = False

if False:
    model_dm, model_dbow = doc2vec_train(train_labelized, test_labelized, size, epoch_num)
else:
    model_dm, model_dbow = doc2vec_load(size)
train_doc2vecs, test_doc2vecs = get_vectors(model_dm, model_dbow,
                                            train_labelized, test_labelized, size)

dictionary = Dictionary()
for sentence in train_sentences + test:
    for word in sentence:
        dictionary.add_word(word)
dictionary.refactor(1)
print('vocab size = %d' % len(dictionary))
voc_len = len(dictionary)
voc_len += 1

train_vecs = np.zeros((len(train_sentences), voc_len))
test_vecs = np.zeros((len(test), voc_len))
for i in range(len(train_sentences)):
    sentence = train_sentences[i]
    for word in sentence:
        train_vecs[i, dictionary.word2idx[word]] += 1
    train_vecs[i, voc_len - 1] += 1
for i in range(len(test)):
if __name__ == '__main__':
    print('{:=^30}'.format('all args'))
    for arg in vars(args):
        print(' '.join(map(str, (arg, getattr(args, arg)))))

    # Set the random seed manually for reproducibility.
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print("WARNING: You have a CUDA device, so you should probably run with --cuda")
        else:
            torch.cuda.manual_seed(args.seed)

    corpus_path = args.data + '/'
    dictionary = Dictionary(corpus_path + 'train.vocab')

    train_iter = DataIter(
        corpus_path + 'train.txt',
        args.batch_size,
        args.seq_len,
        dictionary=dictionary,
        cuda=args.cuda,
    )
    valid_iter = DataIter(
        corpus_path + 'valid.txt',
        args.batch_size,
        args.seq_len,
        dictionary=dictionary,
        cuda=args.cuda,
    )
print("Arguments: \n ", args) print("Device:", device) query_files = [ os.path.join(args.data, "train.query.txt"), os.path.join(args.data, "valid.query.txt"), os.path.join(args.data, "test.query.txt") ] if os.path.exists("./saved/dictionary.pkl"): print("Loading previously saved dictionary...") with open("./saved/dictionary.pkl", "rb") as f: dictionary = pickle.load(f) else: print("Creating dictionary...") dictionary = Dictionary(query_files) with open("./saved/dictionary.pkl", "wb") as f: pickle.dump(dictionary, f) nchar = len(dictionary) max_seq_len = dictionary.max_seq_len lr = args.lr clip = args.clip batch_size = args.batch_size eval_batch_size = 10 best_val_loss = None if args.model == 'LSTM': model = LSTMModel(nchar, args.nhid, args.nlayers, max_seq_len, args.dropout) if args.load_latest:
from os.path import dirname, join

from data import Dictionary

CORPUS_FOLDER = dirname(dirname(__file__))
HND_FOLDER = join(CORPUS_FOLDER, "data", "dictionaries", "hongocduc")
with open(join(HND_FOLDER, "words.txt")) as f:
    lines = f.read().splitlines()
    for line in lines:
        open(line)
Dictionary.hi()
if __name__ == '__main__':
    # Set the random seed manually for reproducibility.
    args = arg_parse()
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print("WARNING: You have a CUDA device, so you should probably run with --cuda")
        else:
            torch.cuda.manual_seed(args.seed)

    corpus_path = args.data + '/'
    dictionary = Dictionary(corpus_path + 'vocab.c.txt')
    eval_batch_size = 10

    train_iter = DataIter(
        corpus_path + 'train.txt',
        args.batch_size,
        dictionary=dictionary,
        cuda=args.cuda,
        training=True,
    )
    valid_iter = DataIter(
        corpus_path + 'valid.txt',
        eval_batch_size,
        dictionary=dictionary,
        cuda=args.cuda,
    heads = mst(S)

    # Predict labels
    select = torch.LongTensor(heads).unsqueeze(0).expand(S_lab.size(0), -1)
    select = Variable(select)
    selected = torch.gather(S_lab, 1, select.unsqueeze(1)).squeeze(1)
    _, labels = selected.max(dim=0)
    labels = labels.data.numpy()
    return heads, labels


if __name__ == '__main__':
    data_path = '../../stanford-ptb'
    vocab_path = 'vocab/train'
    model_path = 'models/model.pt'

    dictionary = Dictionary(vocab_path)
    corpus = Corpus(data_path=data_path, vocab_path=vocab_path)
    model = torch.load(model_path)

    batches = corpus.train.batches(1)
    words, tags, heads, labels = next(batches)

    S_arc, S_lab = model(words, tags)
    plot(S_arc, heads)
    words = tags = [1, 2, 3, 4]
    heads_pred, labels_pred = predict(model, words, tags)
    print(heads_pred, '\n', heads[0].data.numpy())
    print(labels_pred, '\n', labels[0].data.numpy())
    'C': 'CCONJ',
    'I': 'INTJ',
    'E': 'ADP',
    'M': 'NOUN',  # numeral
    'n': 'NOUN',
    'S': 'NOUN'  # compound/block
}

TEMP_IGNORE_POS = set([
    'R',  # Vietnamese adverb
    'X',  # unclassified
    'Z',  # word-formation element
    'D',  # no definition (e.g. "chút ít")
    'O',  # e.g. "úi chà"
])

logger.info("Start loading")
dict = Dictionary()
pos_count = {}
data = joblib.load(UTS_DICT_DATA)
count = 0
logger.info("End loading")
for key in data:
    # count += 1
    # if count > 30:
    #     break
    defs = []
    pos_tags = {}
    text = key
    for definition in data[key]:
        pos_tag = definition['pos']
        if pos_tag not in pos_tags: