def __init__(self, path, vocab_path=None, batch_size=1, shuffle=False,
             pin_memory=False, update_vocab=False, min_freq=1, concat=False,
             bptt=35):
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.pin_memory = pin_memory
    self.base_path = path
    self.update_vocab = update_vocab
    self.bptt = bptt
    self.concat = concat
    self.vocab = get_vocab(path, ['train.txt'], min_freq=min_freq,
                           vocab_file=vocab_path)
    if self.concat:
        # Set the frequencies for the special tokens; these values were
        # found by trial and error.
        self.vocab.idx2count[1] = self.vocab.freqs[BOS]  # <s>
        self.vocab.idx2count[2] = 0                      # </s>
    self.train = self.get_dataloader('train.txt', self.batch_size)
    self.valid = self.get_dataloader('valid.txt', 1)
    self.test = self.get_dataloader('test.txt', 1)

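A minimal usage sketch for the loader above, assuming the enclosing class is named Corpus and the data directory contains train.txt, valid.txt, and test.txt; both names are assumptions, since neither appears in the snippet:

# Hypothetical usage; Corpus and 'data/' are illustrative names only.
corpus = Corpus('data/', batch_size=32, shuffle=True, bptt=70)
for batch in corpus.train:  # iterate over training mini-batches
    pass
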
def _gen_embedding(ndim, alignment=False):
    print "Generating %d-dim word embedding ..." % ndim
    int2ch, ch2int = get_vocab()
    ch_lists = []
    quatrains = get_quatrains()
    for idx, poem in enumerate(quatrains):
        for sentence in poem['sentences']:
            ch_lists.append(filter(lambda ch: ch in ch2int, sentence))
        if alignment:
            # Group the i-th characters across the poem's lines; used to
            # boost Dui Zhang (parallel couplets).
            i_characters = [[sentence[j] for sentence in poem['sentences']]
                            for j in range(len(poem['sentences'][0]))]
            for characters in i_characters:
                ch_lists.append(filter(lambda ch: ch in ch2int, characters))
        if (idx + 1) % 10000 == 0:
            print "[Word2Vec] %d/%d poems have been processed." % (idx + 1, len(quatrains))
    print "Hold on. This may take some time ..."
    model = models.Word2Vec(ch_lists, size=ndim, min_count=5)
    embedding = uniform(-1.0, 1.0, [VOCAB_SIZE, ndim])
    for idx, ch in enumerate(int2ch):
        if ch in model.wv:
            embedding[idx, :] = model.wv[ch]
    if alignment:
        model.save(_w2v_with_alignment_model_path)
        print "Word2Vec model is saved."
        np.save(_w2v_with_alignment_path, embedding)
        print "Word embedding is saved."
    else:
        model.save(_w2v_model_path)
        print "Word2Vec model is saved."
        np.save(_w2v_path, embedding)
        print "Word embedding is saved."

def __init__(self, path: str, vocab_path: str) -> None:
    self.model = kenlm.Model(path)

    def probability_function(tokens: List[str]) -> float:
        return self.model.score(" ".join(tokens))

    super().__init__(self.model,
                     probability_function=probability_function,
                     vocab=get_vocab(vocab_path))

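A minimal sketch of what the wrapped model computes, using only the public kenlm Python API; 'lm.arpa' is a hypothetical model file:

import kenlm

model = kenlm.Model('lm.arpa')               # hypothetical ARPA/binary LM file
tokens = ['the', 'cat', 'sat']
log10_prob = model.score(' '.join(tokens))   # kenlm returns a log10 probability
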
def get_quatrains():
    _, ch2int = get_vocab()

    def quatrain_filter(poem):
        if not is_quatrain(poem):
            return False
        for sentence in poem['sentences']:
            for ch in sentence:
                if ch not in ch2int:
                    return False
        return True

    return list(filter(quatrain_filter, get_all_corpus()))

def get_quatrains():
    # Return the quatrains in which every character appears in the ch2int
    # vocabulary.
    _, ch2int = get_vocab()

    def quatrain_filter(poem):
        if not is_quatrain(poem):
            return False
        for sentence in poem['sentences']:
            for ch in sentence:
                if ch not in ch2int:
                    return False
        return True

    # get_all_corpus() returns the poem records from all corpus files; each
    # record holds a poem's title, author, dynasty, and sentences.
    return filter(quatrain_filter, get_all_corpus())

def get_deck(jlpt_level):
    vocab = get_vocab(jlpt_level)
    deck = genanki.Deck(DECK_BASE_ID + jlpt_level,
                        'JLPT Vocab::N{}'.format(jlpt_level))
    media = []
    for v in vocab:
        if v.path is not None:
            media.append(v.path)
            audio = '[sound:{}]'.format(v.path)
        else:
            audio = ''
        note = KanjiNote(
            model=VOCAB_MODEL,
            fields=[str(v.id), v.kana, v.kanji, ', '.join(v.pos), v.defn, audio]
        )
        deck.add_note(note)
    return deck, media

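A hedged sketch of exporting the deck returned by get_deck() to an .apkg file with genanki's Package API; the level and output filename are arbitrary choices for illustration:

import genanki

deck, media = get_deck(5)              # e.g. JLPT N5
package = genanki.Package(deck)
package.media_files = media            # bundle the collected audio files
package.write_to_file('jlpt_n5.apkg')
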
def _gen_embedding(ndim):
    print "Generating %d-dim word embedding ..." % ndim
    int2ch, ch2int = get_vocab()
    ch_lists = []
    quatrains = get_quatrains()
    for idx, poem in enumerate(quatrains):
        for sentence in poem['sentences']:
            ch_lists.append(filter(lambda ch: ch in ch2int, sentence))
        if (idx + 1) % 10000 == 0:
            print "[Word2Vec] %d/%d poems have been processed." % (idx + 1, len(quatrains))
    print "Hold on. This may take some time ..."
    model = models.Word2Vec(ch_lists, size=ndim, min_count=5)
    embedding = uniform(-1.0, 1.0, [VOCAB_SIZE, ndim])
    for idx, ch in enumerate(int2ch):
        if ch in model.wv:
            embedding[idx, :] = model.wv[ch]
    np.save(_w2v_path, embedding)
    print "Word embedding is saved."

def _gen_embedding(ndim):
    # Generate an ndim-dimensional word embedding.
    print "Generating %d-dim word embedding ..." % ndim
    int2ch, ch2int = get_vocab()  # load the vocabulary
    ch_lists = []
    quatrains = get_quatrains()  # all quatrains that pass the filter rules
    for idx, poem in enumerate(quatrains):  # for each quatrain
        for sentence in poem['sentences']:  # for each line of the poem
            # Keep only the characters that appear in the ch2int vocabulary.
            ch_lists.append(filter(lambda ch: ch in ch2int, sentence))
        if (idx + 1) % 10000 == 0:
            print "[Word2Vec] %d/%d poems have been processed." % (idx + 1, len(quatrains))
    print "Hold on. This may take some time ..."
    # ch_lists is the training corpus; ndim is the embedding dimension.
    model = models.Word2Vec(ch_lists, size=ndim, min_count=5)
    # Uniformly initialized matrix; each row is one ndim-dimensional vector.
    embedding = uniform(-1.0, 1.0, [VOCAB_SIZE, ndim])
    for idx, ch in enumerate(int2ch):
        if ch in model.wv:  # if the character has a trained vector,
            embedding[idx, :] = model.wv[ch]  # copy it into its row
    np.save(_w2v_path, embedding)
    print "Word embedding is saved."

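A short sketch of loading the saved embedding back and looking up one character's vector, assuming the same get_vocab() and _w2v_path as above:

import numpy as np

int2ch, ch2int = get_vocab()
embedding = np.load(_w2v_path)     # assumes _w2v_path ends in .npy
vec = embedding[ch2int[u'月'], :]  # ndim-dimensional vector for one character
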
def __init__(self, path, vocab_path=None, batch_size=1, shuffle=False,
             pin_memory=False, update_vocab=False, min_freq=1, concat=False,
             bptt=35):
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.pin_memory = pin_memory
    self.base_path = path
    self.update_vocab = update_vocab
    self.bptt = bptt
    self.concat = concat
    self.vocab = get_vocab(path, ['train.txt'], min_freq=min_freq,
                           vocab_file=vocab_path)
    self.train = self.get_dataloader('train.txt', self.batch_size)
    self.valid = self.get_dataloader('valid.txt', 1)
    self.test = self.get_dataloader('test.txt', 1)

def __init__(self):
    self.int2ch, self.ch2int = get_vocab(if_segment)

        TRAIN ACC: 90.906   VALID ACC: 91.181   LOSS: 0.02215
    """
    s = "TRAIN ACC: {: 3.3f} VALID ACC: {: 3.3f} LOSS: {: 3.5f}"
    print(s.format(100 * train_acc, 100 * valid_acc, loss))


################################################################################
# DATA
################################################################################
hyper = load_hyper_params(HYPERPARAMS_FILE)

# LOAD VOCAB
# TODO: make the vocab files contain the FULL vocab from IMDB,
#       and make get_vocab() load only the first MAX_VOCAB words.
id2word, word2id = get_vocab(VOCAB_FILE, DATA_DIR, hyper["MAX_VOCAB"])
n_words = len(id2word)

# CLASS MAPPINGS
id2class = ["neg", "pos"]
class2id = {label: id for id, label in enumerate(id2class)}

# LOAD DATA
data = get_data(DATA_DIR, CACHED_DATA, vocab_file=VOCAB_FILE)
limit_data_vocab(data, n=hyper["MAX_VOCAB"], unknown_id=1)
n_samples = len(data["xtrain"])


################################################################################
# MODEL
################################################################################
model = Model(n_vocab=n_words,

parser.add_argument('--batch_size', help="Enter the batch size",
                    type=int, default=64)
parser.add_argument('--epochs', help="Enter the number of epochs",
                    type=int, default=2)
args = parser.parse_args()

# path = 'data/eng-fra.txt'
train, val, test = data_preprocess.split(args.path)
eng_lm = spacy.load('en')
fre_lm = spacy.load('fr')
w2i_eng_train, _, w2i_fre_train, _ = vocab.get_vocab(train, eng_lm, fre_lm)
w2i_eng_val, i2w_eng_val, w2i_fre_val, i2w_fre_val = vocab.get_vocab(
    val, eng_lm, fre_lm)
w2i_eng_test, i2w_eng_test, w2i_fre_test, i2w_fre_test = vocab.get_vocab(
    test, eng_lm, fre_lm)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
inp_vocab_dim = len(w2i_eng_train)
label_vocab_dim = len(w2i_fre_train)
m = model.enc_dec_attn(args.enc_hid, args.dec_hid, args.emb_dim,
                       args.drop_prob, device, inp_vocab_dim, label_vocab_dim)
# print(m)
# print(f'The model has {model.count_parameters(m):,} trainable parameters')

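The commented-out line above implies a count_parameters helper in the model module. A hedged sketch of a typical implementation, assuming m is a torch.nn.Module; the repository's actual helper is not shown:

def count_parameters(m):
    # Sketch only; sums the element counts of all trainable tensors in m.
    return sum(p.numel() for p in m.parameters() if p.requires_grad)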