Example 1

import os
from argparse import Namespace

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from tqdm import tqdm

# Project-local helpers; the exact module paths are assumptions.
from lang import Lang
from lm_prob import LMProb

class StandardDataset(Dataset):
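    """
    Parallel annotation/code dataset: loads `all.anno` and `all.code` from
    `config.root_dir`, builds a vocabulary per side, and numericalizes every
    pair into fixed-length, post-padded index tensors.
    """
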
    def __init__(self, config: Namespace, shuffle_at_init=False, seed=None):
        super().__init__()

        self.config = config

        self.anno_lang = Lang("anno")
        self.code_lang = Lang("code")

        self.__preprocess(shuffle_at_init, seed)

    def __str__(self):
        return f"Dataset<{os.path.basename(self.config.root_dir)}>"

    def __repr__(self):
        return str(self)

    def __preprocess(self, shuffle, seed) -> None:
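        # Read the parallel anno/code corpora, optionally shuffle them in
        # unison, build a vocabulary per side, and numericalize every example
        # into fixed-length, post-padded index tensors.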
        with open(os.path.join(self.config.root_dir, "all.anno")) as f:
            anno = np.array([l.strip() for l in f])
        with open(os.path.join(self.config.root_dir, "all.code")) as f:
            code = np.array([l.strip() for l in f])
        assert anno.shape == code.shape

        if shuffle:
            np.random.seed(seed)
            ridx = np.random.permutation(len(anno))
            anno = anno[ridx]
            code = code[ridx]

        self.df = pd.DataFrame({"anno": anno, "code": code})

        # construct anno language
        for s in anno:
            self.anno_lang.add_sentence(s, tokenize_mode="anno")

        self.anno_lang.build_emb_matrix(emb_file=self.config.emb_file)

        # construct code language
        for s in code:
            self.code_lang.add_sentence(s, tokenize_mode="code")

        # build examples
        self.anno, self.code = [], []

        for s in anno:
            nums = self.anno_lang.to_numeric(
                s,
                tokenize_mode="anno",
                min_freq=self.config.anno_min_freq,
                pad_mode="post",
                max_len=self.config.anno_seq_maxlen,
            )
            self.anno += [torch.tensor(nums)]

        for s in code:
            nums = self.code_lang.to_numeric(
                s,
                tokenize_mode="code",
                min_freq=self.config.code_min_freq,
                pad_mode="post",
                max_len=self.config.code_seq_maxlen,
            )
            self.code += [torch.tensor(nums)]

        # construct uniform tensor
        self.anno = torch.stack(self.anno)
        self.code = torch.stack(self.code)

    def __getitem__(self, idx):
        # if LM probabilities have been computed
        if hasattr(self, "lm_probs"):
            return (
                self.anno[idx],
                self.code[idx],
                self.lm_probs["anno"][idx],
                self.lm_probs["code"][idx],
            )
        else:
            return self.anno[idx], self.code[idx]

    def __len__(self):
        assert len(self.anno) == len(self.code) == self.df.shape[0]
        return len(self.anno)

    def raw(self, idx):
        return {k: self.df.iloc[idx][k] for k in self.df.columns}

    def shuffle(self):
        r = np.random.permutation(len(self))
        self.anno = self.anno[r]
        self.code = self.code[r]
        # keep the raw-text frame aligned with the shuffled tensors,
        # so raw(idx) still matches __getitem__(idx)
        self.df = self.df.iloc[r].reset_index(drop=True)
        if hasattr(self, "lm_probs"):
            self.lm_probs["anno"] = self.lm_probs["anno"][r]
            self.lm_probs["code"] = self.lm_probs["code"][r]

    def compute_lm_probs(self, lm_paths):
        """
        Compute LM probabilities for each unpadded, numericalized anno/code
        example. `lm_paths` maps "anno" and "code" to pretrained LM paths.
        """

        self.lm_probs = {"anno": [], "code": []}

        pad_idx = {
            "anno": self.anno_lang.token2index["<pad>"],
            "code": self.code_lang.token2index["<pad>"],
        }

        for kind in self.lm_probs:
            lm = LMProb(lm_paths[kind])
            p = pad_idx[kind]

            for vec in tqdm(getattr(self, kind),
                            total=len(self),
                            desc=f"P({kind})"):
                # strip padding so the LM scores only the real tokens
                self.lm_probs[kind] += [lm.get_prob(vec[vec != p])]

            self.lm_probs[kind] = torch.stack(self.lm_probs[kind])

        return self.lm_probs

    def train_test_valid_split(self, test_p: float, valid_p: float, seed=None):
        """
        Generate train/test/valid splits.

        :param test_p: percentage of all data for testing
        :param valid_p: percentage of all data for validation
        """
        x, y = self.anno, self.code

        sz = 1 - test_p - valid_p
        x_train, x_test_valid, y_train, y_test_valid = train_test_split(
            x, y, train_size=sz, random_state=seed)

        sz = test_p / (test_p + valid_p)
        x_test, x_valid, y_test, y_valid = train_test_split(x_test_valid,
                                                            y_test_valid,
                                                            train_size=sz,
                                                            random_state=seed)

        assert sum(map(len, [x_train, x_test, x_valid])) == len(x)
        assert sum(map(len, [y_train, y_test, y_valid])) == len(y)

        splits = {
            "anno": {
                "train": x_train,
                "test": x_test,
                "valid": x_valid
            },
            "code": {
                "train": y_train,
                "test": y_test,
                "valid": y_valid
            },
        }

        return splits
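
A minimal usage sketch, assuming a config namespace that carries the fields
the class reads (root_dir, emb_file, anno_min_freq, anno_seq_maxlen,
code_min_freq, code_seq_maxlen); all paths below are hypothetical:

config = Namespace(
    root_dir="data/example",          # hypothetical; must contain all.anno / all.code
    emb_file="data/embeddings.txt",   # hypothetical pretrained word vectors
    anno_min_freq=2,
    anno_seq_maxlen=50,
    code_min_freq=2,
    code_seq_maxlen=100,
)

dataset = StandardDataset(config, shuffle_at_init=True, seed=42)
anno_vec, code_vec = dataset[0]   # padded index tensors for the first pair
print(dataset.raw(0))             # the original raw text pair

splits = dataset.train_test_valid_split(test_p=0.1, valid_p=0.1, seed=42)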