def train(args):
    with open(args.param_file, 'w') as f:
        param = vars(args)
        del param['handler']
        json.dump(param, f, indent=4)

    feature_vocab = Vocabulary.load('feature.dict')
    category_vocab = Vocabulary.load('category.dict')
    data = torch.load('train.pt')
    pad = feature_vocab.get_index('<pad>')

    model = net.Classifier(feature_vocab, category_vocab,
                           embedding_size=args.embedding_size,
                           embedding_path=args.embedding_path,
                           freeze_embedding=args.freeze_embedding,
                           hidden_size=args.hidden_size,
                           num_layers=args.num_layers,
                           weight_dropout=args.weight_dropout)
    if args.gpu >= 0:
        model.cuda(args.gpu)
    print(model)

    optimizer = torch.optim.AdamW(model.parameters())
    print(optimizer)

    model.train()
    optimizer.zero_grad()
    for epoch in range(args.max_epochs):
        loss_epoch = 0.
        step = 0
        for batch in torch.utils.data.DataLoader(
                data,
                batch_size=args.batch_size,
                shuffle=True,
                collate_fn=collate_fn(pad),
        ):
            optimizer.zero_grad()
            if args.gpu >= 0:
                batch = move_to_cuda(batch, args.gpu)
            loss = net.loss_fn(model, batch)
            loss.backward()
            loss_epoch += loss.item()
            del loss
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            optimizer.step()
            step += 1
        print(f'epoch:{epoch+1}: loss:{loss_epoch:.5f}')

    torch.save(model.state_dict(), args.model)
    evaluate(args)
class SelfVocab():
    def __init__(self, dataset):
        self.dataset = dataset
        self.vocab = Vocabulary()
        self.vocab.fromSentances(dataset.X)
        self.getXY()
        self.getEmbeddingsMatrix()

    def getXY(self):
        seqs_with_idx = self.vocab.docs_to_indices()
        self.X = array(seqs_with_idx, dtype=object)
        self.X = pad_sequences(self.X, maxlen=100)
        Y = [0 if y == 0 else 1 for y in self.dataset.Y]
        self.Y = Y  # array(Y)

    def getEmbeddingsMatrix(self):
        self.Wvec = self.vocab.getWord2VecMatrix()
def main():
    args = get_arguments()
    SETTING = Dict(yaml.safe_load(
        open(os.path.join('arguments', args.arg + '.yaml'), encoding='utf8')))
    print(args)
    args.device = list(map(str, args.device))
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(args.device)

    # image transformer
    transform = transforms.Compose([
        transforms.Resize((SETTING.imsize, SETTING.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    if args.dataset == 'coco':
        val_dset = CocoDset(root=SETTING.root_path,
                            img_dir='val2017',
                            ann_dir='annotations/captions_val2017.json',
                            transform=transform)
        val_loader = DataLoader(val_dset,
                                batch_size=SETTING.batch_size,
                                shuffle=False,
                                num_workers=SETTING.n_cpu,
                                collate_fn=collater)
    vocab = Vocabulary(max_len=SETTING.max_len)
    vocab.load_vocab(args.vocab_path)
    imenc = ImageEncoder(SETTING.out_size, SETTING.cnn_type)
    capenc = CaptionEncoder(len(vocab), SETTING.emb_size, SETTING.out_size,
                            SETTING.rnn_type)
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    imenc = imenc.to(device)
    capenc = capenc.to(device)

    assert args.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(
        args.checkpoint), flush=True)
    ckpt = torch.load(args.checkpoint, map_location=device)
    imenc.load_state_dict(ckpt["encoder_state"])
    capenc.load_state_dict(ckpt["decoder_state"])

    begin = time.time()
    dset = EmbedDset(val_loader, imenc, capenc, vocab, args)
    print("database created | {} ".format(sec2str(time.time() - begin)),
          flush=True)

    savedir = os.path.join("out", args.config_name)
    if not os.path.exists(savedir):
        os.makedirs(savedir, 0o777)

    image = dset.embedded["image"]
    caption = dset.embedded["caption"]
    n_i = image.shape[0]
    n_c = caption.shape[0]
    # avoid shadowing the built-in all()
    all_emb = np.concatenate([image, caption], axis=0)
    emb_file = os.path.join(savedir, "embedding_{}.npy".format(n_i))
    save_file = os.path.join(savedir, "{}.npy".format(SETTING.method))
    vis_file = os.path.join(savedir, "{}.png".format(SETTING.method))
    np.save(emb_file, all_emb)
    print("saved embeddings to {}".format(emb_file), flush=True)
    dimension_reduction(emb_file, save_file, method=SETTING.method)
    plot_embeddings(save_file, n_i, vis_file, method=SETTING.method)
def main():
    args = get_arguments()
    SETTING = Dict(yaml.safe_load(
        open(os.path.join('arguments', args.arg + '.yaml'), encoding='utf8')))
    print(args)
    args.device = list(map(str, args.device))
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(args.device)

    transform = transforms.Compose([
        transforms.Resize((SETTING.imsize, SETTING.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    if args.dataset == 'coco':
        val_dset = CocoDset(root=SETTING.root_path,
                            img_dir='val2017',
                            ann_dir='annotations/captions_val2017.json',
                            transform=transform)
        val_loader = DataLoader(val_dset,
                                batch_size=SETTING.batch_size,
                                shuffle=False,
                                num_workers=SETTING.n_cpu,
                                collate_fn=collater)
    vocab = Vocabulary(max_len=SETTING.max_len)
    vocab.load_vocab(args.vocab_path)
    imenc = ImageEncoder(SETTING.out_size, SETTING.cnn_type)
    capenc = CaptionEncoder(len(vocab), SETTING.emb_size, SETTING.out_size,
                            SETTING.rnn_type)
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    imenc = imenc.to(device)
    capenc = capenc.to(device)

    assert SETTING.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(
        SETTING.checkpoint), flush=True)
    ckpt = torch.load(SETTING.checkpoint)
    imenc.load_state_dict(ckpt["encoder_state"])
    capenc.load_state_dict(ckpt["decoder_state"])

    begin = time.time()
    dset = EmbedDset(val_loader, imenc, capenc, vocab, args)
    print("database created | {} ".format(sec2str(time.time() - begin)),
          flush=True)

    retrieve_i2c(dset, val_dset, args.image_path, imenc, transform)
    retrieve_c2i(dset, val_dset, args.output_dir, args.caption, capenc, vocab)
def prepare_data(args):
    '''
    Do all the work of preparing the data.
    :param args:
    :return:
    '''
    trainset = REDataset(args.trainset_path,
                         double_data=args.is_double_training_data)
    testset = REDataset(args.testset_path)

    # make vocab
    vocab = Vocabulary(word_num=args.vocab_word_num)
    corpus = []
    for example in trainset:
        corpus += example[0]
    if args.vocab_include_testset:
        for example in testset:
            corpus += example[0]
    vocab.add_from_corpus(corpus)

    # make label encoder
    all_labels = []
    for example in trainset:
        all_labels.append(example[1])
    label_encoder = LabelEncoder(all_labels)

    batch_maker = BatchMaker(vocab, label_encoder, max_length=args.max_length)
    traindata_loader = DataLoader(trainset,
                                  batch_size=args.batch_size,
                                  shuffle=args.shuffle,
                                  num_workers=args.num_workers,
                                  collate_fn=batch_maker.batch_packer)
    testdata_loader = DataLoader(testset,
                                 batch_size=args.test_batch_size,
                                 shuffle=args.test_shuffle,
                                 num_workers=args.num_workers,
                                 collate_fn=batch_maker.batch_packer)

    logger.info('trainset length: %d' % len(trainset))
    logger.info('testset length: %d' % len(testset))
    logger.info('vocabulary length: %d' % len(vocab))
    logger.info('labels num: %d' % len(label_encoder))

    return (traindata_loader, testdata_loader, trainset, testset, vocab,
            label_encoder, batch_maker)

# dataset = REDataset(TRAINSET_PATH)
# corpus = []
# for example in dataset:
#     corpus += example[0]
#
# vocab = Vocabulary(word_num=3000)
# vocab.add_from_corpus(corpus)
# sent = dataset[0][0]
# print(sent)
# print(vocab.encode(sent))
# print(vocab.decode(vocab.encode(sent)))
# print(dataset[0][-1])
def __init__(self, data_dir, mode, vocab_size):
    self.df = pd.read_csv(os.path.join(data_dir, mode + '.csv'))
    self.sentences = self.df['text'].values
    self.labels = self.df['label'].values

    # Initialize dataset Vocabulary objects and build our vocabularies
    self.sentences_vocab = Vocabulary(vocab_size)
    self.labels_vocab = Vocabulary(vocab_size)
    self.sentences_vocab.build_vocabulary(self.sentences)
    self.labels_vocab.build_vocabulary(self.labels, add_unk=False)
def evaluate(args):
    feature_vocab = Vocabulary.load('feature.dict')
    category_vocab = Vocabulary.load('category.dict')
    with open(args.param_file, 'r') as f:
        params = json.load(f)

    model = net.Classifier(feature_vocab, category_vocab, **params)
    model.load_state_dict(torch.load(args.model))
    if args.gpu >= 0:
        model = model.cuda(args.gpu)

    test_data = torch.load('test.pt')
    predictions = []
    targets = []
    model.eval()
    pad = feature_vocab.get_index('<pad>')
    match = 0
    with torch.no_grad():
        for batch in torch.utils.data.DataLoader(
                test_data,
                batch_size=args.batch_size,
                shuffle=False,
                collate_fn=collate_fn(pad),
        ):
            if args.gpu >= 0:
                batch = move_to_cuda(batch, args.gpu)
            pred = torch.argmax(model(batch), dim=-1)
            target = batch['label']
            match += (pred == target).sum().item()
            predictions.extend(pred.tolist())
            targets.extend(target.tolist())

    acc = match / len(targets)
    # precision_recall_fscore_support expects (y_true, y_pred)
    prec, rec, fscore, _ = precision_recall_fscore_support(targets, predictions)
    print('Acc', acc)
    print('===')
    print('Category', 'Precision', 'Recall', 'Fscore', sep='\t')
    for idx in range(len(category_vocab)):
        print(f'{category_vocab.get_item(idx)}\t'
              f'{prec[idx]:.2f}\t{rec[idx]:.2f}\t{fscore[idx]:.2f}')
    prec, rec, fscore, _ = precision_recall_fscore_support(targets,
                                                           predictions,
                                                           average='micro')
    print(f'Total\t{prec:.2f}\t{rec:.2f}\t{fscore:.2f}')
def build_vocab(datafile, threshold):
    counter = Counter()
    with open(datafile, 'r') as f:
        data = json.load(f)
    for caption in tqdm(list(map(lambda x: x['caption'], data))):
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)
    tokens = [token for token, count in counter.items() if count >= threshold]
    vocab = Vocabulary()
    vocab.add_tokens(tokens)
    return vocab
def main():
    args = _parse_args()
    assert not os.path.exists(args.model), f"specified file already exists: {args.model}"

    with io.open(args.corpus, mode="r") as corpus:
        v = Vocabulary(table_size=int(2E7))
        v.create(corpus, [(args.n_vocab, args.n_min_freq, args.n_min_freq)])
    print(f"finished. saving models: {args.model}")
    v.save(args.model)

    # sanity check
    print("done. now execute sanity check...")
    print(f"n_vocab: {len(v)}, total_freq:{sum(v.counts)}")
    s = "Knox County Health Department is following national Centers for Disease Control and Prevention Protocol to contain infection."
    print(f"sentence: {s}")
    s_tokenized = "/".join(v.tokenize(s, remove_oov=False))
    print(f"tokenized: {s_tokenized}")

    print("random sampling...")
    n_sample = 100
    x = v.random_ids(n_sample)
    w, f = np.unique(list(map(v.id2word, x)), return_counts=True)
    for idx in np.argsort(f)[::-1]:
        print(f"{w[idx]} -> {f[idx]}")
    print("finished. good-bye.")
def main():
    args = parse_args()

    transform = transforms.Compose([
        transforms.Resize(args.imsize_pre),
        transforms.CenterCrop(args.imsize),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    if args.dataset == "coco":
        val_dset = CocoDataset(
            root=args.root_path,
            split="val",
            transform=transform,
        )
        val_loader = DataLoader(
            val_dset,
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=args.n_cpu,
            collate_fn=collater,
        )
    vocab = Vocabulary(max_len=args.max_len)
    vocab.load_vocab(args.vocab_path)
    model = SPVSE(
        len(vocab),
        args.emb_size,
        args.out_size,
        args.max_len,
        args.cnn_type,
        args.rnn_type,
        pad_idx=vocab.padidx,
        bos_idx=vocab.bosidx,
    )
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    model = model.to(device)

    assert args.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(
        args.checkpoint), flush=True)
    ckpt = torch.load(args.checkpoint, map_location=device)
    model.load_state_dict(ckpt["model_state"])

    _ = validate(1000, val_loader, model, vocab, args)
def main():
    args = parse_args()
    if args.dynet_seed:
        random.seed(args.dynet_seed)
        np.random.seed(args.dynet_seed)

    src_vocab = Vocabulary('<unk>', eos_symbol='</s>')
    tgt_vocab = Vocabulary('<unk>', sos_symbol='<s>', eos_symbol='</s>')
    train = list(read_bitext(src_vocab, tgt_vocab, args.train_src, args.train_tgt))
    src_vocab.freeze()
    tgt_vocab.freeze()
    dev = list(read_bitext(src_vocab, tgt_vocab, args.dev_src, args.dev_tgt))

    # init model
    model = Seq2SeqAtt(src_vocab, tgt_vocab, args.src_embed_dim,
                       args.tgt_embed_dim, args.enc_nlayers,
                       args.enc_hidden_dim, args.dec_nlayers,
                       args.dec_hidden_dim, args.attention_dim,
                       args.label_smoothing)
    if args.saved_model:
        model.load_model(args.saved_model)

    if args.only_decode:
        print("Reading test data...")
        test = list(read_bitext(src_vocab, tgt_vocab, args.test_src, args.test_tgt))
        model.translate(test, args.beam_size, args.max_output_len,
                        args.length_norm, args.output_file, args.relative,
                        args.absolute, args.local, args.candidate)
        print("Done")
    else:
        training_procedure = BasicTrainingProcedure(
            model, dy.SimpleSGDTrainer(model.pc))
        training_procedure.train(args.epochs, train, dev, args.batch_size,
                                 args.batch_size, args.max_output_len)
def __init__(self, embed_size: int, src_vocab: Vocabulary, dst_vocab: Vocabulary):
    super(ModelEmbeddings, self).__init__()
    self.embed_size = embed_size

    # default padding indices, taken from each vocabulary's pad token
    src_padding_idx = src_vocab.word2idx[Vocabulary.pad_token()]
    dst_padding_idx = dst_vocab.word2idx[Vocabulary.pad_token()]

    self.src_embedding = nn.Embedding(len(src_vocab), embed_size,
                                      padding_idx=src_padding_idx)
    self.dst_embedding = nn.Embedding(len(dst_vocab), embed_size,
                                      padding_idx=dst_padding_idx)
def main():
    args = parse_args()

    transform = transforms.Compose([
        transforms.Resize((args.imsize, args.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    if args.dataset == 'coco':
        val_dset = CocoDataset(root=args.root_path,
                               imgdir='val2017',
                               jsonfile='annotations/captions_val2017.json',
                               transform=transform,
                               mode='all')
        val_loader = DataLoader(val_dset,
                                batch_size=args.batch_size,
                                shuffle=False,
                                num_workers=args.n_cpu,
                                collate_fn=collater_eval)
    vocab = Vocabulary(max_len=args.max_len)
    vocab.load_vocab(args.vocab_path)
    imenc = ImageEncoder(args.out_size, args.cnn_type)
    capenc = CaptionEncoder(len(vocab), args.emb_size, args.out_size,
                            args.rnn_type)
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    imenc = imenc.to(device)
    capenc = capenc.to(device)

    assert args.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(
        args.checkpoint), flush=True)
    ckpt = torch.load(args.checkpoint)
    imenc.load_state_dict(ckpt["encoder_state"])
    capenc.load_state_dict(ckpt["decoder_state"])

    begin = time.time()
    dset = EmbedDataset(val_loader, imenc, capenc, vocab, args)
    print("database created | {} ".format(sec2str(time.time() - begin)),
          flush=True)

    retrieve_i2c(dset, val_dset, imenc, vocab, args)
    retrieve_c2i(dset, val_dset, capenc, vocab, args)
def train():
    with open('train_config.json') as train_config_file:
        train_config = json.load(train_config_file)

    train_data_path = train_config['train_data_path']
    test_data_path = train_config['test_data_path']
    vocab_path = train_config['vocab_path']

    train_input_data, train_input_label = load_corpus(
        file_path=train_data_path, make_vocab=True, vocab_path=vocab_path)
    val_input_data, val_input_label = load_corpus(file_path=test_data_path,
                                                  make_vocab=False)
    vocab = Vocabulary(vocab_path)

    model = Spacing(vocab_len=len(vocab))
    print(model)

    trainer = Trainer(model=model,
                      vocab=vocab,
                      train_data=train_input_data,
                      train_label=train_input_label,
                      val_data=val_input_data,
                      val_label=val_input_label,
                      config=train_config)
    trainer.train(total_epoch=10, validation_epoch=1)
def toShakespeare(self):
    """Given a line of text, return that text in the indicated style.

    Args:
        modern_text: (string) The input.

    Returns:
        string: The translated text, if generated.
    """
    args = load_arguments()
    vocab = Vocabulary(self.vocab_path, args.embedding, args.dim_emb)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = Model(args, vocab)
        model.saver.restore(sess, args.model)

        if args.beam > 1:
            decoder = beam_search.Decoder(sess, args, vocab, model)
        else:
            decoder = greedy_decoding.Decoder(sess, args, vocab, model)

        batch = get_batch([self.modern_text], [1], vocab.word2id)
        ori, tsf = decoder.rewrite(batch)
        out = ' '.join(w for w in tsf[0])
    return out
def run_evaluation(corpus_dir, save_dir, datafile, config_file):
    config = Config.from_json_file(config_file)
    vocab = Vocabulary("words")

    # set checkpoint to load from; set to None if starting from scratch
    load_filename = os.path.join(
        save_dir, config.model_name, config.corpus_name,
        '{}-{}_{}'.format(config.encoder_n_layers, config.decoder_n_layers,
                          config.hidden_size),
        'last_checkpoint.tar')
    # if loading on the same machine the model was trained on
    checkpoint = torch.load(load_filename)
    # if loading a model trained on GPU to CPU
    # checkpoint = torch.load(load_filename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint["en"]
    decoder_sd = checkpoint["de"]
    encoder_optimizer_sd = checkpoint["en_opt"]
    decoder_optimizer_sd = checkpoint["de_opt"]
    embedding_sd = checkpoint["embedding"]
    vocab.__dict__ = checkpoint["voc_dict"]

    print("Building encoder and decoder ...")
    # initialize word embeddings
    embedding = nn.Embedding(vocab.num_words, config.hidden_size)
    embedding.load_state_dict(embedding_sd)
    # initialize encoder and decoder models
    encoder = EncoderRNN(config.hidden_size, embedding,
                         config.encoder_n_layers, config.dropout)
    decoder = LuongAttnDecoderRNN(config.attn_model, embedding,
                                  config.hidden_size, vocab.num_words,
                                  config.decoder_n_layers, config.dropout)
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
    # set dropout layers to eval mode
    encoder.eval()
    decoder.eval()

    # initialize the search module and begin chatting
    searcher = GreedySearchDecoder(encoder, decoder)
    evaluate_input(encoder, decoder, searcher, vocab)
def read_vocabs(self, datafile, corpus_name):
    lines = open(datafile, encoding="utf-8").read().strip().split('\n')
    pairs = [[self.normalize_string(s) for s in line.split('\t')]
             for line in lines]
    vocab = Vocabulary(corpus_name)
    return vocab, pairs
def __init__(self, data_path, vocab=Vocabulary(), predict=False):
    """
    Creates an object that gets data from a file.
    """
    super(Data, self).__init__(data_path, vocab)
    if not predict:
        self._train_test_split()
def load_word_data(questions_df, image_captions, exclude_word_list):
    vocab = Vocabulary()
    answers = Vocabulary(first_word="RELEVANT")
    question_seq_length = 1
    caption_seq_length = 1

    print("Generating vocabulary and answer indices...")
    new_questions = []
    for _, row in questions_df.iterrows():
        question_words = row['question'].split(' ')
        if len(question_words) > question_seq_length:
            question_seq_length = len(question_words)
        all_words = question_words

        image_file = row['image_file']
        if image_file in image_captions:
            caption = image_captions[image_file]
            caption_words = caption.split(' ')
            if len(caption_words) > caption_seq_length:
                caption_seq_length = len(caption_words)
            all_words += caption_words

        for word in all_words:
            if len(word) > 0 and word not in exclude_word_list:
                vocab.add_word(word)

        # if row['relevant'] == 0:
        answers.add_word(row['answer'])

    print('\tVocab count: [%d]' % len(vocab))
    print('\tAnswers count: [%d]' % len(answers))
    print('\tQuestion sequence length: [%d]' % question_seq_length)
    print('\tCaption sequence length: [%d]' % caption_seq_length)

    print("Loading word vectors...")
    word_to_vector = load_word_vectors(word_vectors_file, vocab)

    print('Creating embedding matrix...')
    embedding_matrix = np.zeros((len(vocab), embedding_dim))
    words_not_found = []
    for word, i in vocab.word_index.items():
        if word not in word_to_vector:
            words_not_found.append(word)
            continue
        embedding_vector = word_to_vector[word]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    if len(words_not_found) > 0:
        print("Words not found:", "\n\t", words_not_found)
    for word in words_not_found:
        del vocab.index_word[vocab.word_index[word]]

    return (vocab, answers, embedding_matrix, word_to_vector,
            question_seq_length, caption_seq_length)
def from_serializable(cls, contents):
    token_vocab = TokenVocabulary.from_serializable(contents["token_vocab"])
    tag_vocab = Vocabulary.from_serializable(contents["tag_vocab"])
    return cls(token_vocab=token_vocab,
               tag_vocab=tag_vocab,
               max_seq_len=contents["max_seq_len"])
def create_vocab(qas, threshold=4):
    counter = Counter()
    for qa in qas:
        question = qa['question'].encode('utf-8')
        answer = qa['answer'].encode('utf-8')
        qtokens = nltk.tokenize.word_tokenize(question.lower())
        atokens = nltk.tokenize.word_tokenize(answer.lower())
        counter.update(qtokens)
        counter.update(atokens)

    # If a word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Add the words to the vocabulary.
    vocab = Vocabulary()
    for word in words:
        vocab.add_word(word)
    return vocab
def __init__(self, dataset):
    self.dataset = dataset
    self.vocab = Vocabulary()
    self.vocab.fromSentances(dataset.X)
    self.getXY()
    self.getEmbeddingsMatrix()
def __init__(self, data_path, vocab=Vocabulary()):
    self.vocab = vocab
    data = get_requests_from_file(data_path)
    print("Downloaded {} samples".format(len(data)))

    # materialize the map so it can be iterated twice (map() is lazy in Python 3)
    map_result = list(map(self._process_request, data))
    self.data = [x[0] for x in map_result]
    self.lengths = [x[1] for x in map_result]
    assert len(self.data) == len(self.lengths)
def main():
    args = _parse_args()
    assert not os.path.exists(args.model), f"specified file already exists: {args.model}"
    pprint(args.__dict__)

    vocab_params = {
        "power": 0.75
    }
    vocab = Vocabulary.load(args.vocab, **vocab_params)
    n_vocab = len(vocab)
    print(f"vocabulary size: {n_vocab}")

    kwargs = {} if args.kwargs is None else json.loads(args.kwargs)
    pprint(kwargs)

    init_params = {
        'mu0': 0.1,
        'sigma_mean0': 1.0,
        'sigma_std0': 0.01
    }
    model_params = {
        "mu_max": 1.0,
        "sigma_min": 0.1,
        "sigma_max": 10.0,
        "eta": 0.01,
        "Closs": 4.0
    }

    print("start training...")
    model = GaussianEmbedding(n_vocab, args.n_dim,
                              covariance_type=args.cov_type,
                              energy_type="KL",
                              init_params=init_params,
                              **model_params)
    with io.open(args.corpus, mode="r") as corpus:
        it = iter_pairs(corpus, vocab, batch_size=20, nsamples=20, window=5)
        model.train(it, n_workers=args.n_thread)
    print(f"finished. saving models: {args.model}")
    model.save(args.model)

    # sanity check
    print("done. now execute sanity check...")

    def ln_det_sigma(word):
        vec_sigma = model.sigma[vocab.word2id(word)]
        return np.sum(np.log(vec_sigma))

    w = "food"
    print(f"word: {w}")
    lst_result = model.nearest_neighbors(w, vocab=vocab, sort_order="sigma", num=100)
    df_result = pd.DataFrame(lst_result)
    df_result["sigma_ln_det"] = df_result["word"].map(ln_det_sigma)
    print(df_result.sort_values(by="sigma_ln_det", ascending=False).head(n=10))
    print("finished. good-bye.")
def convert_to_str(
    tensor: np.ndarray,
    vocab: Vocabulary,
) -> List[List[str]]:
    output = []
    for batch in range(len(tensor)):
        curr = []
        for idx in range(len(tensor[batch])):
            curr.append(vocab.idx2word(tensor[batch, idx]))
        output.append(curr)
    return output
def load_or_create_vocab(trainDataset=None, testDataset=None):
    Texts = list(trainDataset.anns.values()) + list(testDataset.anns.values())
    if os.path.exists(VOCAB_FILE):
        print("loading vocab")
        vocab = torch.load(VOCAB_FILE)
        print("vocab loaded")
        return vocab
    else:
        vocab = Vocabulary()
        vocab.create_from_texts(Texts)
        return vocab
def __init__(self, csv_path, image_path, transform=None, batch_size=4):
    self.captionsfile = pd.read_csv(csv_path)
    self.image_path = image_path
    self.transform = transform
    self.vocab = Vocabulary(vocab_threshold=2)
    self.batch_size = batch_size

    all_tokens = [
        nltk.tokenize.word_tokenize(
            str(self.captionsfile.iloc[index, 2]).lower())
        for index in range(len(self.captionsfile))
    ]
    self.caption_lengths = [len(token) for token in all_tokens]
class TextDataset(Dataset):
    def __init__(self, data_dir, mode, vocab_size):
        self.df = pd.read_csv(os.path.join(data_dir, mode + '.csv'))
        self.sentences = self.df['text'].values
        self.labels = self.df['label'].values

        # Initialize dataset Vocabulary objects and build our vocabularies
        self.sentences_vocab = Vocabulary(vocab_size)
        self.labels_vocab = Vocabulary(vocab_size)
        self.sentences_vocab.build_vocabulary(self.sentences)
        self.labels_vocab.build_vocabulary(self.labels, add_unk=False)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.labels[idx]
        # numericalize the sentence, e.g. ['cat', 'in', 'a', 'bag'] -> [2, 3, 9, 24, 22]
        numeric_sentence = self.sentences_vocab.sentence_to_numeric(sentence)
        numeric_label = self.labels_vocab.sentence_to_numeric(label)
        return torch.tensor(numeric_sentence), torch.tensor(numeric_label)
def build_vocab(words):
    '''
    Build vocabulary and use it to format labels.
    '''
    vocab = Vocabulary(words)

    # Map each word to a one-hot label vector.
    output_vector = []
    for word in words:
        zeros = np.zeros(len(vocab), dtype=np.float32)
        zeros[vocab[word]] = 1.0
        output_vector.append(zeros)
    return vocab, output_vector
def inference():
    with open('train_config.json') as train_config_file:
        train_config = json.load(train_config_file)
    vocab_path = train_config['vocab_path']
    model_save_path = train_config['model_save_path']

    epoch = None
    with open(os.path.join(model_save_path, 'checkpoint.txt')) as f:
        epoch = f.readlines()[0].split(':')[1]
    print(f'Weight is loaded from best checkpoint epoch {epoch}')

    vocab = Vocabulary(vocab_path)
    model = Spacing(vocab_len=len(vocab)).eval()
    trainer = Trainer(model=model, vocab=vocab, config=train_config)
    trainer.load(epoch)

    while True:
        text = input('Enter input text : ')
        words = text.split()
        data = []
        for word in words:
            chars = [char for char in word]
            data.append(chars)
        sorted_data = sorted(data, key=lambda e: len(e), reverse=True)
        idx = sorted(range(len(data)), key=lambda e: len(data[e]), reverse=True)

        batch_data, batch_label, lengths = trainer.make_input_tensor(sorted_data, None)
        outputs, _ = trainer.model.forward(batch_data, lengths)
        outputs = torch.round(outputs)

        results = []
        for output, data in zip(outputs, sorted_data):
            result = ''
            for output_char, char in zip(output, data):
                if output_char == 1:
                    result += (char + ' ')
                else:
                    result += char
            results.append(result)

        sorted_result = ''
        for i in range(len(idx)):
            sorted_result += results[idx.index(i)]
        print(sorted_result)
def ngrams(prefix):
    """
    Find n-grams and make a vocabulary from the parsed corpus
    """
    with BZ2File(prefix + 'corpus.bz2', 'r') as corpus:
        vocab = Vocabulary(build_table=False)
        vocab.create(corpus, [(75000, 350), (25000, 350), (10000, 350)])
        vocab.save(prefix + 'vocab.gz')
def __init__(self, data_path, train=False, longest_sequence_length=None):
    data0 = load_sent(data_path + '.0')
    data1 = load_sent(data_path + '.1')
    print(f'\n------------------------ Building a Dataset ------------------------')
    print(f'#sents of {data_path}.0 file 0: {len(data0)}')  # list of lists of tokenized words
    print(f'#sents of {data_path}.1 file 1: {len(data1)}')  # list of lists of tokenized words

    self.data_all = data0 + data1
    self.style_list = [0 for i in data0] + [1 for i in data1]  # data0 is all neg, data1 is all pos

    # sort all the data by sequence length, in descending order
    zip_item = zip(self.data_all, self.style_list)
    sorted_item = sorted(zip_item, key=lambda p: len(p[0]), reverse=True)
    tuple_item = zip(*sorted_item)
    self.data_all, self.style_list = [list(t) for t in tuple_item]
    print(f'len(self.data_all) : {len(self.data_all)}')
    print(f'len(self.style_list): {len(self.style_list)}')

    if train:
        print('\ntrain: True')
        if not os.path.isfile(cfg.vocab):
            print(f'{cfg.vocab} does not exist')
            print('Building Vocab...')
            build_vocab(data0 + data1, cfg.vocab)
        else:
            print(f'{cfg.vocab} already exists')

    self.vocab = Vocabulary(cfg.vocab, cfg.embedding_file, cfg.embed_dim)
    print('\nvocabulary size:', self.vocab.size)
    print(f'vocabulary embedding matrix shape: {self.vocab.embedding.shape}')
    # print(type(self.vocab.embedding))  # np array

    self.longest_sequence_length = longest_sequence_length
    if longest_sequence_length is None:
        self.update_the_max_length()
    print(f'self.longest_sequence_length: {self.longest_sequence_length}')
    print(f'--------------------------------------------------------------------')
def count(prefix):
    """
    Count the number of tokens in the corpus
    """
    vocab = Vocabulary.load(prefix + 'vocab.gz', build_table=False)
    total = 0
    ndocs = 0
    with BZ2File(prefix + 'corpus.bz2', 'r') as corpus:
        for doc in corpus:
            tokens = vocab.tokenize(doc)
            total += len(tokens)
            ndocs += 1
            if ndocs % 10000 == 0:
                logger.info("Processed %s docs." % ndocs)
    logger.info("Total of %s tokens in corpus" % total)
input_sentence = 'the dog ran'.split()

rules = [
    ('S', ['NP', 'VP']),
    ('VP', ['V']),
    ('NP', ['DET', 'N']),
]
words = {
    'N': 'man dog'.split(),
    'DET': 'a the'.split(),
    'V': 'ran saw'.split(),
}

vocab = Vocabulary(D)
for category, items in words.items():
    for item in items:
        vocab.add(item, vocab.parse('CATEGORY*%s+TEXT*text_%s' % (category, item)))

sp_goal = vocab.parse('S')

for input in input_sentence:
    print('parsing text:', input)
    sp_lex = vocab.parse(input)
    category = sp_lex * vocab.parse('~CATEGORY')
    while True:
clues1 = clues1[:len(clues4)]
clues2 = clues2[:len(clues4)]
print(len(clues1), len(clues2), len(clues4))

oldclues = clues2 + clues4
clues = clues2 + clues4
# vocab = Vocabulary()
# clues = []
# for i in oldclues:
#     if i not in clues:
#         clues.append(i)
# for clue in oldclues:
#     vocab.add_question(clue)

path = Path(args.foldername)
if not path.exists():
    path.mkdir()

vocab = Vocabulary.load(inputpath3 / "vocab.pkl")
vocab.save(path / "vocab.pkl")
print(vocab.number)

matrix = lil_matrix((len(clues), vocab.number + 1))
matrix = matrix.astype(np.int64)
i = 0
for clue in tqdm(clues):
    vector = vocab.translate(clue)
    matrix[i] = vector
    i += 1
print(matrix.shape)
matrix = csr_matrix(matrix)
# np.save(path / "matrix.npy", matrix)
# matrix = pickle.load(open(inputpath4 / "matrix.pkl", "rb"))
# print(matrix.shape)
# idx = np.arange(matrix.shape[1])
with open(test_file, 'r') as fin:
    if hasTitle:
        fin.readline()
    for line in fin:
        lline = line.strip().split('\t')
        id_1 = self._vocab.word2id(lline[0].lower())
        id_2 = self._vocab.word2id(lline[1].lower())
        score_human.append(float(lline[2]))
        if flag == 'cosine':
            s = cosine(self.mu[id_1, :], self.mu[id_2, :])
        elif flag == 'IP':
            s = logIP(self.mu[id_1, :], self.sigma[id_1, :],
                      self.mu[id_2, :], self.sigma[id_2, :])
        score_model.append(s)
coeff = stats.pearsonr(numpy.array(score_human), numpy.array(score_model))
print(coeff)


if __name__ == '__main__':
    from vocab import Vocabulary
    import os

    work_dir = os.getcwd()
    test_file = work_dir + '/dataset/wordsim353/combined'
    vocab = Vocabulary.load(work_dir + '/dataset/vocab.new.gz')
    embed = Embedding(work_dir + '/embedding_result/Result/May5/embedding.tar.gz',
                      vocab)
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 29 17:02:33 2016

@author: whr94621
"""
import logging
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s',
                    level=logging.INFO)

import sys
from gzip import GzipFile

from word2gauss import GaussianEmbedding, iter_pairs
from vocab import Vocabulary

vocab = Vocabulary.load(sys.argv[2])
embed = GaussianEmbedding(len(vocab), 50,
                          covariance_type='diagonal',
                          energy_type='KL',
                          mu_max=4.0, sigma_min=1, sigma_max=2)

with GzipFile(sys.argv[1], 'r') as corpus:
    for i in range(50):
        embed.train(iter_pairs(corpus, vocab, iterations=20), n_workers=16)

embed.save('embedding.tar.gz', vocab=vocab.id2word, full=True)
args = argparser.parse_args()

path = Path(args.foldername)
if not path.exists():
    path.mkdir()

with open("%s/clues.pkl" % args.inputname, "rb") as f:
    clues = pickle.load(f)
docs = np.load("%s/docs.npy" % args.inputname)
topics = np.load("%s/topics.npy" % args.inputname)

for i in range(int(args.numtopics)):
    print("ITERATION %d" % i)
    subclues = []
    for num in np.argsort(docs.transpose()[i])[::-1]:
        if docs.transpose()[i][num] < 0.5:
            continue
        subclues.append(clues[num])

    newvocab = Vocabulary()
    for clue in subclues:
        newvocab.add_question(clue)
    print(newvocab.number)

    matrix = lil_matrix((len(subclues), newvocab.number + 1))
    matrix = matrix.astype(np.int64)
    j = 0
    for clue in subclues:
        vector = newvocab.translate(clue)
        matrix[j] = vector
        j += 1
    print(matrix.shape)
    matrix = csr_matrix(matrix)
    # svd = TruncatedSVD(n_components=700)
    # docs = svd.fit_transform(matrix)
    model = lda.LDA(n_topics=10, n_iter=10000)
def print_tree(s, depth=0):
    x = label(s)
    if x is None:
        x = label_word(s)
        if x is not None:
            print(' ' * depth + x)
        return
    print(' ' * depth + label(s))
    print_tree(s * vocab.parse('~L_' + x), depth + 1)
    print_tree(s * vocab.parse('~R_' + x), depth + 1)


vocab = Vocabulary(D)
NEXT = vocab.parse('NEXT')

for category, items in words.items():
    for item in items:
        rules.append((category, [item]))
print(rules)

sp_goal = vocab.parse('S')
sp_tree = None
for input in input_sentence:
    print('parsing text:', input)