Example no. 1
0
def load_data():
    """Load the eng-fra parallel corpus and build one vocabulary per side.

    Reads ``../assets/data/eng-fra.txt`` (one tab-separated sentence pair
    per line), normalizes each sentence, and registers every sentence in a
    ``Lang`` vocabulary object.

    Returns:
        (input_lang, output_lang, pairs) where ``pairs`` is a list of
        ``[normalized_eng, normalized_fra]`` lists.
    """
    lang1 = "eng"
    lang2 = "fra"

    print("Reading lines...")

    # Use a context manager so the file handle is closed deterministically
    # (the original left the handle open until garbage collection).
    with open('../assets/data/%s-%s.txt' % (lang1, lang2),
              encoding='utf-8') as f:
        lines = f.read().strip().split('\n')

    pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines]

    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    print("Read %s sentence pairs" % len(pairs))
    # NOTE(review): no filtering happens in this function, so the "trimmed"
    # count is always identical to the raw count.
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.add_sentence(pair[0])
        output_lang.add_sentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)

    return input_lang, output_lang, pairs
Example no. 2
0
def load_data():
    """Load the SMS spam collection and build text/label vocabularies.

    Each line of ``../assets/SMSSpamCollection.txt`` is tab-separated:
    the label comes first (``pair[0]``) and the message text second
    (``pair[1]``), which is why the two are registered crosswise below.

    Returns:
        (input_lang, output_lang, pairs) where ``pairs`` is a list of
        ``[normalized_label, normalized_text]`` lists.
    """
    # Context manager closes the file handle (the original leaked it).
    # NOTE(review): no explicit encoding is given, so the platform default
    # is used — confirm the dataset is compatible before adding one.
    with open('../assets/SMSSpamCollection.txt') as f:
        lines = f.readlines()

    pairs = [[normalize_string(s) for s in line.split('\t')] for line in lines]

    input_lang = Lang("txt")
    output_lang = Lang("label")

    for pair in pairs:
        # pair[1] is the message body, pair[0] the spam/ham label.
        input_lang.add_sentence(pair[1])
        output_lang.add_sentence(pair[0])

    return input_lang, output_lang, pairs
Example no. 3
0
class StandardDataset(Dataset):
    """Paired annotation/code dataset backed by ``all.anno`` / ``all.code``.

    Reads two aligned line files from ``config.root_dir``, builds a ``Lang``
    vocabulary for each side, numericalizes every example into padded
    tensors, and serves ``(anno, code)`` pairs (plus per-example LM
    probabilities once :meth:`compute_lm_probs` has been called).
    """

    def __init__(self, config: Namespace, shuffle_at_init=False, seed=None):
        """Store the config, create both vocabularies, and preprocess.

        :param config         : namespace with ``root_dir``, ``emb_file``,
                                ``*_min_freq`` and ``*_seq_maxlen`` fields
        :param shuffle_at_init: shuffle examples once during preprocessing
        :param seed           : RNG seed used for that initial shuffle
        """
        super(StandardDataset, self).__init__()

        self.config = config

        self.anno_lang = Lang("anno")
        self.code_lang = Lang("code")

        self.__preprocess(shuffle_at_init, seed)

    def __str__(self):
        return f"Dataset<{os.path.basename(self.config.root_dir)}>"

    def __repr__(self):
        return str(self)

    def __preprocess(self, shuffle, seed) -> None:
        """Load raw lines, build vocabularies, and numericalize to tensors."""
        # Context managers close the files deterministically (the original
        # left both handles open). Iterating the handle yields lines just
        # like readlines() did.
        with open(os.path.join(self.config.root_dir, "all.anno")) as f:
            anno = np.array([l.strip() for l in f])
        with open(os.path.join(self.config.root_dir, "all.code")) as f:
            code = np.array([l.strip() for l in f])
        # The two files must be line-aligned: row i of anno pairs with row i
        # of code.
        assert anno.shape == code.shape

        if shuffle:
            np.random.seed(seed)
            # One shared permutation keeps the anno/code pairing intact.
            ridx = np.random.permutation(len(anno))
            anno = anno[ridx]
            code = code[ridx]

        self.df = pd.DataFrame({"anno": anno, "code": code})

        # construct anno language
        for s in anno:
            self.anno_lang.add_sentence(s, tokenize_mode="anno")

        self.anno_lang.build_emb_matrix(emb_file=self.config.emb_file)

        # construct code language
        for s in code:
            self.code_lang.add_sentence(s, tokenize_mode="code")

        # build examples: numericalize each sentence with post-padding to a
        # fixed max length so the per-side tensors can be stacked uniformly
        self.anno, self.code = [], []

        for s in anno:
            nums = self.anno_lang.to_numeric(
                s,
                tokenize_mode="anno",
                min_freq=self.config.anno_min_freq,
                pad_mode="post",
                max_len=self.config.anno_seq_maxlen,
            )
            self.anno.append(torch.tensor(nums))

        for s in code:
            nums = self.code_lang.to_numeric(
                s,
                tokenize_mode="code",
                min_freq=self.config.code_min_freq,
                pad_mode="post",
                max_len=self.config.code_seq_maxlen,
            )
            self.code.append(torch.tensor(nums))

        # construct uniform tensors of shape (num_examples, max_len)
        self.anno = torch.stack(self.anno)
        self.code = torch.stack(self.code)

    def __getitem__(self, idx):
        """Return (anno, code) — plus LM probabilities if computed."""
        # lm_probs only exists after compute_lm_probs() has been called
        if hasattr(self, "lm_probs"):
            return (
                self.anno[idx],
                self.code[idx],
                self.lm_probs["anno"][idx],
                self.lm_probs["code"][idx],
            )
        else:
            return self.anno[idx], self.code[idx]

    def __len__(self):
        assert len(self.anno) == len(self.code) == self.df.shape[0]
        return len(self.anno)

    def raw(self, idx):
        """Return the raw (un-numericalized) strings for example ``idx``."""
        return {k: self.df.iloc[idx][k] for k in self.df.columns}

    def shuffle(self):
        """Shuffle all example tensors in place with one shared permutation.

        NOTE: ``self.df`` is intentionally not reordered here, so ``raw()``
        indices no longer correspond to tensor indices after shuffling.
        """
        r = np.random.permutation(len(self))
        self.anno = self.anno[r]
        self.code = self.code[r]
        if hasattr(self, "lm_probs"):
            self.lm_probs["anno"] = self.lm_probs["anno"][r]
            self.lm_probs["code"] = self.lm_probs["code"][r]

    def compute_lm_probs(self, lm_paths):
        """
        Compute LM probabilities for each unpadded, numericalized anno/code example.

        :param lm_paths: mapping with "anno" and "code" keys pointing to
                         language-model files loadable by ``LMProb``
        :returns: ``self.lm_probs``, a dict of stacked probability tensors
        """

        self.lm_probs = {"anno": [], "code": []}

        pad_idx = {
            "anno": self.anno_lang.token2index["<pad>"],
            "code": self.code_lang.token2index["<pad>"],
        }

        for kind in self.lm_probs:
            lm = LMProb(lm_paths[kind])
            # Hoist the pad index once per kind (the original bound it but
            # then redid the dict lookup on every iteration).
            p = pad_idx[kind]

            for vec in tqdm(getattr(self, kind),
                            total=len(self),
                            desc=f"P({kind})"):
                # Strip padding tokens before scoring the sequence.
                self.lm_probs[kind].append(lm.get_prob(vec[vec != p]))

            self.lm_probs[kind] = torch.stack(self.lm_probs[kind])

        return self.lm_probs

    def train_test_valid_split(self, test_p: float, valid_p: float, seed=None):
        """
        Generate train/test/valid splits.

        :param test_p : percentage of all data for test
        :param valid_p: percentage of all data for validation
        :param seed   : random_state forwarded to ``train_test_split``
        :returns: nested dict ``splits[side][split]`` with sides
                  ``"anno"``/``"code"`` and splits ``"train"/"test"/"valid"``
        """
        x, y = self.anno, self.code

        # First cut: train vs (test + valid).
        sz = 1 - test_p - valid_p
        x_train, x_test_valid, y_train, y_test_valid = train_test_split(
            x, y, train_size=sz, random_state=seed)

        # Second cut: split the remainder proportionally into test and valid.
        sz = test_p / (test_p + valid_p)
        x_test, x_valid, y_test, y_valid = train_test_split(x_test_valid,
                                                            y_test_valid,
                                                            train_size=sz,
                                                            random_state=seed)

        # Sanity check: the three parts partition the full dataset.
        assert sum(map(len, [x_train, x_test, x_valid])) == len(x)
        assert sum(map(len, [y_train, y_test, y_valid])) == len(y)

        splits = {
            "anno": {
                "train": x_train,
                "test": x_test,
                "valid": x_valid
            },
            "code": {
                "train": y_train,
                "test": y_test,
                "valid": y_valid
            },
        }

        return splits