Esempio n. 1
0
class DataModule:
    """Read the train and dev corpora and keep the counts derived from them.

    Two ``Dataloader`` objects are created (train and dev). The four values
    each ``read_file()`` call returns are stored under the ``t_*`` (train)
    and ``d_*`` (dev) attribute prefixes, together with the type and token
    totals computed from them.
    """

    def __init__(self, train_file, dev_file, n_gram, unk_threshold, unk_val):
        """Load both files and precompute the type/token totals.

        Args:
            train_file: Training corpus, handed to ``Dataloader``.
            dev_file: Development corpus, handed to ``Dataloader``.
            n_gram: N-gram order, handed to ``Dataloader``.
            unk_threshold: Unknown-word threshold, handed to ``Dataloader``.
            unk_val: Stored on the instance; not used by this class itself.
        """
        self.train_file, self.dev_file = train_file, dev_file
        self.n_gram = n_gram
        self.unk_threshold = unk_threshold
        self.unk_val = unk_val

        # Both loaders are constructed before either file is read.
        self.train_dl = Dataloader(train_file, n_gram, unk_threshold)
        self.dev_dl = Dataloader(dev_file, n_gram, unk_threshold)

        train_data = self.train_dl.read_file()
        self.t_sents, self.t_items, self.t_types, self.t_tokens = train_data
        dev_data = self.dev_dl.read_file()
        self.d_sents, self.d_items, self.d_types, self.d_tokens = dev_data

        # Probability tables start empty.
        self.t_prob, self.d_prob = {}, {}

        # Two entries are discounted from every total because the
        # collections contain <BOS> and <EOS>.
        boundary_markers = 2
        self.t_total_types = len(self.t_types) - boundary_markers
        self.d_total_types = len(self.d_types) - boundary_markers
        self.t_total_tokens = len(self.t_tokens) - boundary_markers
        self.d_total_tokens = len(self.d_tokens) - boundary_markers

    def print_stat(self):
        """Print the type/token totals of both files.

        The last line, labelled ``OOV = ``, is the train token total minus
        the dev token total.
        """
        print("**********DATA STATISTICS********")
        report = (
            ("Length of type list in train_file", self.t_total_types),
            ("Length of type list in dev_file", self.d_total_types),
            ("Length of token list in train_file", self.t_total_tokens),
            ("Length of token list in dev_file", self.d_total_tokens),
            ("OOV = ", self.t_total_tokens - self.d_total_tokens),
        )
        for label, value in report:
            print(label, value)
Esempio n. 2
0
class Backoff(DataModule):
    """Additively smoothed n-gram model with a discounted unigram back-off.

    ``train()`` turns the training counts into smoothed probabilities.
    ``eval()`` scores every distinct dev n-gram and returns the perplexity:
    seen n-grams get their trained probability minus ``discount``; unseen
    bigrams back off to ``norm_const`` times the unigram probability of
    their second element; everything else gets ``unk_val + lambda_val``.
    """

    def __init__(self, train_file, dev_file, n_gram, unk_threshold, unk_val,
                 lambda_val, discount, norm_const):
        """Load the data and, for bigram models, the unigram back-off data.

        Args:
            train_file: Training corpus, forwarded to ``DataModule``.
            dev_file: Development corpus, forwarded to ``DataModule``.
            n_gram: N-gram order; the unigram back-off is only built for 2.
            unk_threshold: Unknown-word threshold, forwarded to the loaders.
            unk_val: Base probability used for n-grams unseen in training.
            lambda_val: Additive smoothing constant.
            discount: Amount subtracted from the probability of seen n-grams.
            norm_const: Weight applied to backed-off unigram probabilities.
        """
        super().__init__(train_file, dev_file, n_gram, unk_threshold, unk_val)
        self.lambda_val = lambda_val
        self.discount = discount
        self.norm_const = norm_const
        # Smoothing denominator: types + lambda * tokens.
        # NOTE(review): textbook add-lambda smoothing normalises by
        # tokens + lambda * types; confirm the roles are not swapped here.
        self.t_total_length = self.t_total_types + (self.t_total_tokens *
                                                    self.lambda_val)

        # Defined unconditionally so eval() cannot hit a missing attribute
        # when it runs before train(); train() fills it for bigram models.
        self.ut_prob = {}

        if self.n_gram == 2:
            self.uni_train_dl = Dataloader(self.train_file, 1,
                                           self.unk_threshold)
            self.uni_dev_dl = Dataloader(self.dev_file, 1, self.unk_threshold)

            (self.ut_sents, self.ut_items, self.ut_types,
             self.ut_tokens) = self.uni_train_dl.read_file()
            (self.ud_sents, self.ud_items, self.ud_types,
             self.ud_tokens) = self.uni_dev_dl.read_file()

            # Same <BOS>/<EOS> correction as the base class totals.
            self.ut_total_types = len(self.ut_types) - 2
            self.ut_total_tokens = len(self.ut_tokens) - 2

            self.ut_total_length = self.ut_total_types + (
                self.ut_total_tokens * self.lambda_val)

    def train(self):
        """Compute the smoothed training probabilities.

        prob(X) = (count(X) + lambda) / (total_types + lambda * total_tokens)

        For bigram models the unigram table ``ut_prob`` is filled the same
        way from the unigram counts.

        Returns:
            The n-gram probability table ``t_prob``.
        """
        self.t_prob = {
            k: (v + self.lambda_val) / self.t_total_length
            for k, v in self.t_items.items()
        }
        if self.n_gram == 2:
            self.ut_prob = {
                k: (v + self.lambda_val) / self.ut_total_length
                for k, v in self.ut_items.items()
            }
        return self.t_prob

    def eval(self):
        """Score the dev n-grams and return the perplexity.

        Every distinct dev n-gram contributes once, whatever its count.
        The per-n-gram probabilities are also stored in ``d_prob``.

        Returns:
            ``exp(-mean(log p))`` over the dev n-gram probabilities.
        """
        unseen_prob = self.unk_val + self.lambda_val
        prob_list = []
        for k in self.d_items:
            if k in self.t_prob:
                prob = self.t_prob[k] - self.discount
            elif self.n_gram == 2:
                # Back off to the unigram probability of the second element
                # of the bigram (assumes keys are (previous, current)
                # sequences, as the k[1] indexing implies).
                prob = self.norm_const * self._unigram_prob(k[1], unseen_prob)
            else:
                prob = unseen_prob
            self.d_prob[k] = prob
            prob_list.append(prob)

        return np.exp(-np.mean(np.log(prob_list)))

    def _unigram_prob(self, word, default):
        """Return the trained unigram probability of *word*, else *default*."""
        # The unigram key format produced by Dataloader is not visible here,
        # so accept both the bare word and a 1-tuple wrapping it.
        for key in (word, (word,)):
            if key in self.ut_prob:
                return self.ut_prob[key]
        return default