def __init__(self, train=False, save=False, corpus_path=CORPUS_PATH, threshold=2):
    # Slang dictionary (informal -> formal Indonesian), with a few manual additions.
    with open(os.path.join(os.path.dirname(__file__), "pickled/_slang_words.p"), "rb") as fp:
        self.slang_dict = pickle.load(fp)
    self.slang_dict['dr'] = 'dari'
    self.slang_dict['k'] = 'ke'
    self.slang_dict['sc'] = 'sesar'

    if train:
        # Rebuild the dictionary, word list, counter and language model from the corpus.
        create_dictionary.main()
        self.words = self.__words(corpus_path)
        self.counter = self.__counter(self.words)
        self.model = model.LanguageModel(corpus_path=corpus_path)
    else:
        # Load the pre-built word list, frequency counter and language model.
        with open(os.path.join(os.path.dirname(__file__), "pickled/_spell_words.p"), "rb") as fp:
            self.words = pickle.load(fp)
        with open(os.path.join(os.path.dirname(__file__), "pickled/_spell_counter.p"), "rb") as fp:
            self.counter = pickle.load(fp)
        self.model = model.LanguageModel(load=True)

    # Drop rare words (frequency <= threshold) from the vocabulary.
    for key in self.counter:
        if self.counter[key] <= threshold and key in self.words:
            self.words.remove(key)

    self.candidates_dict = {}

    # Maximum edit distance per dictionary precalculation.
    max_edit_distance_dictionary = 2
    prefix_length = 7
    self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    self.factory = StemmerFactory()
    self.stemmer = self.factory.create_stemmer()

    # Load the SymSpell frequency dictionary.
    dictionary_path = os.path.join(os.path.dirname(__file__), "corpus/dictionary/dictionary.txt")
    # dictionary_path = os.path.join(os.path.dirname(__file__), "corpus/symspellpy/frequency_dictionary_en_82_765.txt")
    term_index = 0   # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not self.sym_spell.load_dictionary(dictionary_path, term_index, count_index, encoding="utf-8"):
        print("Dictionary file not found")
        return

    if save:
        self.save()
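# Usage sketch (not part of the original source): the constructor above belongs to a
# spell-correction class whose name is not shown in this fragment; "SpellCorrector"
# below is a hypothetical stand-in, and both calls are only illustrative.
#
# corrector = SpellCorrector()                       # load pre-built pickles and LM
# corrector = SpellCorrector(train=True, save=True)  # rebuild from corpus_path and persist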
def __init__(self, load=False, corpus_path=CORPUS_PATH):
    if not load:
        self.words = self.__words(corpus_path)
        self.counter = self.__counter(self.words)
        self.model = model.LanguageModel(corpus_path=corpus_path)
    else:
        self.words = pickle.load(open("pickled/_spell_words.p", "rb"))
        self.counter = pickle.load(open("pickled/_spell_counter.p", "rb"))
        self.model = model.LanguageModel(load=True)
    self.candidates_dict = {}
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    if args.cuda:
        data = data.cuda()
    return data

eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

# Build the model
ntokens = len(corpus.dictionary)
model = model.LanguageModel(ntokens, args.emsize, args.nhid,
                            args.rec_depth, args.nlayers, args.dropout)
if args.cuda:
    model.cuda()
criterion = nn.CrossEntropyLoss()

# Training code

# get_batch subdivides the source data into chunks of length args.bptt.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivision of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
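# --- Added sketch, not part of the original file -------------------------------------
# The batchify body above is truncated before its header. Assuming the standard
# word-language-model prologue (nbatch = data.size(0) // bsz), the full helper and the
# get_batch function described by the comment block would look roughly like this:

def batchify(data, bsz):
    # Trim off any tokens that would not fill a whole column.
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches (one sequence per column).
    data = data.view(bsz, -1).t().contiguous()
    return data.cuda() if args.cuda else data

def get_batch(source, i):
    # Slice at most args.bptt rows; the target is the same chunk shifted one
    # token ahead and flattened for CrossEntropyLoss.
    seq_len = min(args.bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].view(-1)
    return data, target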
                        vocab_size=len(output_w2i),
                        use_attn=args.use_attn)

if args.shallow_fusion or args.deep_fusion:
    s2s = model.Model(encoder=encoder,
                      policy=policy,
                      decoder=decoder,
                      input_w2i=input_w2i,
                      output_w2i=output_w2i,
                      args=args)
    lm_decoder = model.Decoder(emb_size=args.emb_size,
                               hid_size=args.hid_size,
                               vocab_size=len(output_w2i),
                               use_attn=False)
    lm = model.LanguageModel(decoder=lm_decoder,
                             input_w2i=input_w2i,
                             output_w2i=output_w2i,
                             args=args)
    if args.shallow_fusion:
        model = model.ShallowFusionModel(s2s, lm, args)
    elif args.deep_fusion:
        model = model.DeepFusionModel(s2s, lm, args)
elif args.cold_fusion:
    s2s = model.Model(encoder=encoder,
                      policy=policy,
                      decoder=decoder,
                      input_w2i=input_w2i,
                      output_w2i=output_w2i,
                      args=args)
    lm_decoder = model.Decoder(emb_size=args.emb_size,
                               hid_size=args.hid_size,
                               vocab_size=len(output_w2i),
# Training configuration and output paths.
epochs = 10
save = 'model.pt'
dict_save = 'dict.pickle'
onnx_export = ''
emsize = 200
nhid = 200
nlayers = 3
dropout = 0.2

train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

model = model.LanguageModel(ntokens, emsize, nhid, nlayers, dropout, tie_weights=True)
criterion = nn.CrossEntropyLoss()
if cuda:
    model.cuda()


def get_batch(source, i):
    # Slice at most bptt rows; the target is the input shifted by one token.
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].view(-1)
    return data, target
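# --- Added sketch, not part of the original file -------------------------------------
# A minimal evaluation pass over one of the batchified splits, assuming the
# LanguageModel follows the usual word-level LM interface: an init_hidden(batch_size)
# helper and a forward pass returning (output, hidden). Both are assumptions here.
import torch

def evaluate(source, batch_size):
    model.eval()
    total_loss = 0.0
    hidden = model.init_hidden(batch_size)  # assumed helper
    with torch.no_grad():
        for i in range(0, source.size(0) - 1, bptt):
            data, targets = get_batch(source, i)
            output, hidden = model(data, hidden)  # assumed (output, hidden) return
            # Detach the hidden state so the graph does not grow across chunks.
            hidden = tuple(h.detach() for h in hidden) if isinstance(hidden, tuple) else hidden.detach()
            total_loss += len(data) * criterion(output.view(-1, ntokens), targets).item()
    return total_loss / (len(source) - 1)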