def build_corpus(self):
    """Load train/dev/test treebanks and build the word and label vocabularies.

    Side effects: sets self.word_vocab, self.label_vocab and the three
    self.*_treebank attributes, and prints progress plus corpus statistics.
    """

    def read_trees(path):
        # One bracketed tree per line in the file.
        with open(path) as f:
            return [fromstring(line.strip()) for line in f]

    print(f'Loading training trees from `{self.train_path}`...')
    if self.multitask == 'ccg':
        train_treebank = ccg.fromfile(self.train_path)
    else:
        train_treebank = read_trees(self.train_path)

    print(f'Loading development trees from `{self.dev_path}`...')
    dev_treebank = read_trees(self.dev_path)

    print(f'Loading test trees from `{self.test_path}`...')
    test_treebank = read_trees(self.test_path)

    if self.multitask == 'spans':
        # need trees with span-information
        train_treebank = [tree.convert() for tree in train_treebank]
        dev_treebank = [tree.convert() for tree in dev_treebank]
        test_treebank = [tree.convert() for tree in test_treebank]

    print("Constructing vocabularies...")
    if self.vocab_path is not None:
        print(f'Using word vocabulary specified in `{self.vocab_path}`')
        with open(self.vocab_path) as f:
            vocab = json.load(f)
        # Repeat each word `count` times so Vocabulary.fromlist sees the
        # same frequencies as it would from raw text.
        words = [word for word, count in vocab.items() for _ in range(count)]
    else:
        words = [word for tree in train_treebank for word in tree.words()]

    if self.multitask == 'none':
        labels = []
        words = [UNK, START] + words
    else:
        labels = [label for tree in train_treebank for label in tree.labels()]
        words = [UNK, START, STOP] + words

    word_vocab = Vocabulary.fromlist(words, unk_value=UNK)
    label_vocab = Vocabulary.fromlist(labels)

    self.word_vocab = word_vocab
    self.label_vocab = label_vocab
    self.train_treebank = train_treebank
    self.dev_treebank = dev_treebank
    self.test_treebank = test_treebank

    print('\n'.join((
        'Corpus statistics:',
        f'Vocab: {word_vocab.size:,} words, {label_vocab.size:,} nonterminals',
        f'Train: {len(train_treebank):,} sentences',
        f'Dev: {len(dev_treebank):,} sentences',
        f'Test: {len(test_treebank):,} sentences')))
def build_corpus(self):
    """Load train/dev/test treebanks and build word/label/action vocabularies.

    Applies the preprocessing the configured model needs (unlabelize, CNF,
    sentence-length and label-count filtering of the training set), then
    sets self.word_vocab, self.label_vocab, self.action_vocab and the three
    self.*_treebank attributes, printing progress and corpus statistics.
    """
    print(f'Loading training trees from `{self.train_path}`...')
    with open(self.train_path) as f:
        train_treebank = [fromstring(line.strip()) for line in f]

    print(f'Loading development trees from `{self.dev_path}`...')
    with open(self.dev_path) as f:
        dev_treebank = [fromstring(line.strip()) for line in f]

    print(f'Loading test trees from `{self.test_path}`...')
    with open(self.test_path) as f:
        test_treebank = [fromstring(line.strip()) for line in f]

    if self.unlabeled:
        print('Converting trees to unlabeled form...')
        for tree in train_treebank:
            tree.unlabelize()

    if self.model_type == 'crf':
        print('Converting trees to CNF...')
        train_treebank = [tree.cnf() for tree in train_treebank]
        if self.unlabeled:
            for tree in train_treebank:
                tree.remove_chains()

    print("Constructing vocabularies...")
    if self.vocab_path is not None:
        print(f'Using word vocabulary specified in `{self.vocab_path}`')
        with open(self.vocab_path) as f:
            vocab = json.load(f)
        # Repeat each word `count` times so Vocabulary.fromlist sees the
        # same frequencies as it would from raw text.
        words = [
            word for word, count in vocab.items() for _ in range(count)
        ]
    else:
        words = [word for tree in train_treebank for word in tree.words()]

    if self.max_sent_len > 0:
        filtered_treebank = [
            tree for tree in train_treebank
            if len(tree.words()) <= self.max_sent_len
        ]
        print(
            "Using sentences with length <= {}: {:.1%} of all training trees."
            .format(self.max_sent_len,
                    len(filtered_treebank) / len(train_treebank)))
        train_treebank = filtered_treebank

    if self.min_label_count > 1:
        counted_labels = Counter(
            [label for tree in train_treebank for label in tree.labels()])
        filtered_labels = [
            label for label, count in counted_labels.most_common()
            if count >= self.min_label_count
        ]
        # Membership test with a set: the original list made this loop
        # O(trees * labels * |filtered_labels|).
        kept_labels = set(filtered_labels)
        filtered_treebank = [
            tree for tree in train_treebank
            if all(label in kept_labels for label in tree.labels())
        ]
        print(
            "Using labels with count >= {}: {}/{} ({:.1%}) of all labels and {:.1%} of all training trees."
            .format(self.min_label_count, len(filtered_labels),
                    len(counted_labels),
                    len(filtered_labels) / len(counted_labels),
                    len(filtered_treebank) / len(train_treebank)))
        train_treebank = filtered_treebank

    labels = [label for tree in train_treebank for label in tree.labels()]

    # The CRF model additionally needs sentence-boundary tokens.
    if self.model_type == 'crf':
        words = [UNK, START, STOP] + words
    else:
        words = [UNK] + words

    word_vocab = Vocabulary.fromlist(words, unk_value=UNK)
    label_vocab = Vocabulary.fromlist(labels)

    if self.model_type.endswith('rnng'):
        # Order is very important! See DiscParser/GenParser classes to know why.
        if self.model_type == 'disc-rnng':
            actions = [SHIFT, REDUCE] + [NT(label) for label in label_vocab]
        elif self.model_type == 'gen-rnng':
            actions = [REDUCE] + [NT(label) for label in label_vocab
                                  ] + [GEN(word) for word in word_vocab]
        action_vocab = Vocabulary()
        for action in actions:
            action_vocab.add(action)
    else:
        action_vocab = Vocabulary()

    self.word_vocab = word_vocab
    self.label_vocab = label_vocab
    self.action_vocab = action_vocab
    self.train_treebank = train_treebank
    self.dev_treebank = dev_treebank
    self.test_treebank = test_treebank

    print('\n'.join((
        'Corpus statistics:',
        f'Vocab: {word_vocab.size:,} words, {label_vocab.size:,} nonterminals, {action_vocab.size:,} actions',
        f'Train: {len(train_treebank):,} sentences',
        f'Dev: {len(dev_treebank):,} sentences',
        f'Test: {len(test_treebank):,} sentences')))