def __init__(self,
                 train=False,
                 save=False,
                 corpus_path=CORPUS_PATH,
                 threshold=2):

        # Load the pickled slang dictionary shipped alongside this module.
        slang_path = os.path.join(os.path.dirname(__file__),
                                  "pickled/_slang_words.p")
        with open(slang_path, "rb") as f:
            self.slang_dict = pickle.load(f)
        # A few manual entries on top of the pickled ones.
        self.slang_dict['dr'] = 'dari'
        self.slang_dict['k'] = 'ke'
        self.slang_dict['sc'] = 'sesar'

        if train:
            # Rebuild the dictionary, word list, counter and language model
            # from the raw corpus.
            create_dictionary.main()
            self.words = self.__words(corpus_path)
            self.counter = self.__counter(self.words)
            self.model = model.LanguageModel(corpus_path=corpus_path)
        else:
            # Load the pre-trained artifacts pickled next to this module.
            base_dir = os.path.dirname(__file__)
            with open(os.path.join(base_dir, "pickled/_spell_words.p"), "rb") as f:
                self.words = pickle.load(f)
            with open(os.path.join(base_dir, "pickled/_spell_counter.p"), "rb") as f:
                self.counter = pickle.load(f)
            self.model = model.LanguageModel(load=True)

        # Prune words whose corpus frequency is at or below the threshold.
        self.words = [w for w in self.words
                      if self.counter.get(w, 0) > threshold]

        self.candidates_dict = {}

        # maximum edit distance per dictionary precalculation
        max_edit_distance_dictionary = 2
        prefix_length = 7

        # create object
        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        self.factory = StemmerFactory()
        self.stemmer = self.factory.create_stemmer()
        # load dictionary
        dictionary_path = os.path.join(os.path.dirname(__file__),
                                       "corpus/dictionary/dictionary.txt")
        # dictionary_path = os.path.join(os.path.dirname(__file__), "corpus/symspellpy/frequency_dictionary_en_82_765.txt")
        term_index = 0  # column of the term in the dictionary text file
        count_index = 1  # column of the term frequency in the dictionary text file
        if not self.sym_spell.load_dictionary(
                dictionary_path, term_index, count_index, encoding="utf-8"):
            print("Dictionary file not found")
            return

        if save:
            self.save()
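A minimal usage sketch, assuming this __init__ belongs to a spell-corrector class (called SpellCorrector here, since the class name is not visible in the snippet) and that the pickled files and dictionary.txt are present; the lookup call mirrors the symspellpy API used above:

from symspellpy.symspellpy import Verbosity

checker = SpellCorrector(train=False)          # use the pre-trained pickles
print(checker.slang_dict.get("dr"))            # -> "dari" (slang normalization)

# Query SymSpell for candidates within the precomputed edit distance of 2
# ("mkan" is just an illustrative misspelling).
for suggestion in checker.sym_spell.lookup("mkan", Verbosity.CLOSEST,
                                           max_edit_distance=2):
    print(suggestion.term, suggestion.distance, suggestion.count)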
Example #2
    def __init__(self, load=False, corpus_path=CORPUS_PATH):
        if load is False:
            self.words = self.__words(corpus_path)
            self.counter = self.__counter(self.words)
            self.model = model.LanguageModel(corpus_path=corpus_path)
        else:
            self.words = pickle.load(open("pickled/_spell_words.p", "rb"))
            self.counter = pickle.load(open("pickled/_spell_counter.p", "rb"))
            self.model = model.LanguageModel(load=True)

        self.candidates_dict = {}
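Neither snippet shows the save() that Example #1 calls when save=True; a sketch consistent with the relative pickle paths loaded above might look like this (the real implementation is not part of the snippet):

    def save(self):
        # Persist the trained word list and counter so later runs can pass load=True.
        with open("pickled/_spell_words.p", "wb") as f:
            pickle.dump(self.words, f)
        with open("pickled/_spell_counter.p", "wb") as f:
            pickle.dump(self.counter, f)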
Example #3
# Divide the corpus into bsz parallel streams, trimming any leftover tokens
# that would not fill a full column.
def batchify(data, bsz):
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    if args.cuda:
        data = data.cuda()
    return data


eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

# Build the model
ntokens = len(corpus.dictionary)
model = model.LanguageModel(ntokens, args.emsize, args.nhid, args.rec_depth,
                            args.nlayers, args.dropout)
if args.cuda:
    model.cuda()

criterion = nn.CrossEntropyLoss()

# Training code

# get_batch subdivides the source data into chunks of length args.bptt.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivision of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension in the LSTM.
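The snippet is cut off here; the get_batch the comment describes is essentially the function shown in Example #5 below, with the chunk length taken from args.bptt:

def get_batch(source, i):
    # Slice out at most args.bptt rows starting at i; the target is the same
    # slice shifted one step ahead, flattened for CrossEntropyLoss.
    seq_len = min(args.bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].view(-1)
    return data, target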
Example #4
                        vocab_size=len(output_w2i),
                        use_attn=args.use_attn)

if args.shallow_fusion or args.deep_fusion:
  s2s = model.Model(encoder=encoder,
                    policy=policy,
                    decoder=decoder,
                    input_w2i=input_w2i,
                    output_w2i=output_w2i,
                    args=args)
  lm_decoder = model.Decoder(emb_size=args.emb_size,
                             hid_size=args.hid_size,
                             vocab_size=len(output_w2i),
                             use_attn=False)
  lm = model.LanguageModel(decoder=lm_decoder,
                           input_w2i=input_w2i,
                           output_w2i=output_w2i,
                           args=args)
  if args.shallow_fusion:
    model = model.ShallowFusionModel(s2s, lm, args)
  elif args.deep_fusion:
    model = model.DeepFusionModel(s2s, lm, args)
elif args.cold_fusion:
  s2s = model.Model(encoder=encoder,
                    policy=policy,
                    decoder=decoder,
                    input_w2i=input_w2i,
                    output_w2i=output_w2i,
                    args=args)
  lm_decoder = model.Decoder(emb_size=args.emb_size,
                             hid_size=args.hid_size,
                             vocab_size=len(output_w2i),
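The ShallowFusionModel / DeepFusionModel internals are not shown in this snippet. As a rough illustration (not this repository's code), shallow fusion typically just adds a weighted language-model log-probability to the seq2seq log-probability at each decoding step, with the weight (lm_weight below) a tunable hyperparameter:

import torch.nn.functional as F

def shallow_fusion_scores(s2s_logits, lm_logits, lm_weight=0.3):
    # log p(y) = log p_s2s(y | x) + lm_weight * log p_lm(y)
    return (F.log_softmax(s2s_logits, dim=-1)
            + lm_weight * F.log_softmax(lm_logits, dim=-1))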
Example #5
epochs = 10
save = 'model.pt'
dict_save = 'dict.pickle'
onnx_export = ''
emsize = 200
nhid = 200
nlayers = 3
dropout = 0.2

train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

model = model.LanguageModel(ntokens,
                            emsize,
                            nhid,
                            nlayers,
                            dropout,
                            tie_weights=True)
criterion = nn.CrossEntropyLoss()

if cuda:
    model.cuda()


def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].view(-1)
    return data, target
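The snippet ends before the training/evaluation loop. A sketch of an evaluation pass over one batchified split, using get_batch, criterion and the globals defined above, and assuming model(data) returns logits of shape (seq_len, batch, ntokens) — the real LanguageModel forward signature may differ (e.g. it may also take a hidden state):

import torch

def evaluate(data_source):
    # Average cross-entropy over one split, weighting each chunk by its length.
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output = model(data)  # assumed forward signature
            total_loss += len(data) * criterion(output.view(-1, ntokens), targets).item()
    return total_loss / (len(data_source) - 1)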