def test_make_vocab_without_extension(self):
    existing_serialization_dir = self.TEST_DIR / 'existing'
    extended_serialization_dir = self.TEST_DIR / 'extended'
    existing_vocab_path = existing_serialization_dir / 'vocabulary'
    extended_vocab_path = extended_serialization_dir / 'vocabulary'

    vocab = Vocabulary()
    vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
    vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
    # If `extend` is False, it is the user's responsibility to make sure that dataset
    # instances will be indexable by the provided vocabulary. At a minimum, @@UNKNOWN@@
    # should be present in every namespace that could see OOV entries during indexing.
    # New words will be seen in the `tokens` namespace, but it has an @@UNKNOWN@@ token.
    # The `labels` namespace has no @@UNKNOWN@@, so 'N' and 'V' must be added upfront.
    vocab.add_token_to_namespace('N', namespace='labels')
    vocab.add_token_to_namespace('V', namespace='labels')
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = existing_vocab_path
    self.params['vocabulary']['extend'] = False
    make_vocab_from_params(self.params, extended_serialization_dir)

    with open(extended_vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]
    assert tokens[0] == '@@UNKNOWN@@'
    assert tokens[1] == 'some_weird_token_1'
    assert tokens[2] == 'some_weird_token_2'
    assert len(tokens) == 3
def test_make_vocab_makes_vocab(self):
    vocab_path = self.TEST_DIR / 'vocabulary'

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = vocab_path
    make_vocab_from_params(self.params)

    vocab_files = os.listdir(vocab_path)
    assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

    with open(vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]
    tokens.sort()
    assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are', 'birds', 'cats', 'dogs', 'snakes']

    with open(vocab_path / 'labels.txt') as f:
        labels = [line.strip() for line in f]
    labels.sort()
    assert labels == ['N', 'V']
def cache_vocab(params: Params, vocab_config_path: str = None):
    """
    Caches the vocabulary given in the Params to the filesystem. Useful for large
    datasets that are run repeatedly.
    :param params: the AllenNLP Params
    :param vocab_config_path: an optional config path for constructing the vocab
    """
    if "vocabulary" not in params or "directory_path" not in params["vocabulary"]:
        return

    vocab_path = params["vocabulary"]["directory_path"]

    if os.path.exists(vocab_path):
        # A non-empty directory means the vocab is already cached; nothing to do.
        if os.listdir(vocab_path):
            return

        # Remove the empty vocabulary directory to make AllenNLP happy
        try:
            os.rmdir(vocab_path)
        except OSError:
            pass

    vocab_config_path = vocab_config_path if vocab_config_path else VOCAB_CONFIG_PATH

    params = merge_configs([params, Params.from_file(vocab_config_path)])
    params["vocabulary"].pop("directory_path", None)
    # make_vocab_from_params appends "vocabulary" itself, so pass the parent directory.
    make_vocab_from_params(params, os.path.split(vocab_path)[0])
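# Usage sketch for cache_vocab (illustrative, not part of the original module).
# The config path below is a hypothetical example; params is assumed to contain
# a "vocabulary" section with a "directory_path" entry.
params = Params.from_file("configs/experiment.jsonnet")  # hypothetical path
cache_vocab(params)  # builds and caches the vocab; later calls return early once the directory is non-empty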
def test_make_vocab_doesnt_overwrite_vocab(self):
    vocab_path = self.TEST_DIR / 'vocabulary'
    os.mkdir(vocab_path)

    # Put something in the vocab directory
    with open(vocab_path / "test.txt", "a+") as open_file:
        open_file.write("test")

    # It should raise a ConfigurationError if the vocab directory is non-empty
    with pytest.raises(ConfigurationError):
        make_vocab_from_params(self.params, self.TEST_DIR)
def test_make_vocab_with_extension(self): existing_serialization_dir = self.TEST_DIR / "existing" extended_serialization_dir = self.TEST_DIR / "extended" existing_vocab_path = existing_serialization_dir / "vocabulary" extended_vocab_path = extended_serialization_dir / "vocabulary" vocab = Vocabulary() vocab.add_token_to_namespace("some_weird_token_1", namespace="tokens") vocab.add_token_to_namespace("some_weird_token_2", namespace="tokens") os.makedirs(existing_serialization_dir, exist_ok=True) vocab.save_to_files(existing_vocab_path) self.params["vocabulary"] = {} self.params["vocabulary"]["directory_path"] = existing_vocab_path self.params["vocabulary"]["extend"] = True self.params["vocabulary"]["min_count"] = {"tokens": 3} make_vocab_from_params(self.params, extended_serialization_dir) vocab_files = os.listdir(extended_vocab_path) assert set(vocab_files) == { "labels.txt", "non_padded_namespaces.txt", "tokens.txt" } with open(extended_vocab_path / "tokens.txt") as f: tokens = [line.strip() for line in f] assert tokens[0] == "@@UNKNOWN@@" assert tokens[1] == "some_weird_token_1" assert tokens[2] == "some_weird_token_2" tokens.sort() assert tokens == [ ".", "@@UNKNOWN@@", "animals", "are", "some_weird_token_1", "some_weird_token_2", ] with open(extended_vocab_path / "labels.txt") as f: labels = [line.strip() for line in f] labels.sort() assert labels == ["N", "V"]
def test_make_vocab_makes_vocab(self):
    vocab_path = self.TEST_DIR / 'vocabulary'

    make_vocab_from_params(self.params, self.TEST_DIR)

    vocab_files = os.listdir(vocab_path)
    assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

    with open(vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]
    tokens.sort()
    assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are', 'birds', 'cats', 'dogs', 'snakes']

    with open(vocab_path / 'labels.txt') as f:
        labels = [line.strip() for line in f]
    labels.sort()
    assert labels == ['N', 'V']
def test_make_vocab_with_extension(self):
    existing_serialization_dir = self.TEST_DIR / 'existing'
    extended_serialization_dir = self.TEST_DIR / 'extended'
    existing_vocab_path = existing_serialization_dir / 'vocabulary'
    extended_vocab_path = extended_serialization_dir / 'vocabulary'

    vocab = Vocabulary()
    vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
    vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = existing_vocab_path
    self.params['vocabulary']['extend'] = True
    self.params['vocabulary']['min_count'] = {'tokens': 3}
    make_vocab_from_params(self.params, extended_serialization_dir)

    vocab_files = os.listdir(extended_vocab_path)
    assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

    with open(extended_vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]
    assert tokens[0] == '@@UNKNOWN@@'
    assert tokens[1] == 'some_weird_token_1'
    assert tokens[2] == 'some_weird_token_2'

    tokens.sort()
    assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are',
                      'some_weird_token_1', 'some_weird_token_2']

    with open(extended_vocab_path / 'labels.txt') as f:
        labels = [line.strip() for line in f]
    labels.sort()
    assert labels == ['N', 'V']
def test_make_vocab_makes_vocab(self):
    vocab_path = os.path.join(self.TEST_DIR, 'vocabulary')

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = vocab_path
    make_vocab_from_params(self.params)

    vocab_files = os.listdir(vocab_path)
    assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

    with open(os.path.join(vocab_path, 'tokens.txt')) as f:
        tokens = [line.strip() for line in f]
    tokens.sort()
    assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are', 'birds', 'cats', 'dogs', 'snakes']

    with open(os.path.join(vocab_path, 'labels.txt')) as f:
        labels = [line.strip() for line in f]
    labels.sort()
    assert labels == ['N', 'V']
def test_make_vocab_makes_vocab_with_config(self):
    vocab_path = self.TEST_DIR / 'vocabulary'

    self.params['vocabulary'] = {}
    self.params['vocabulary']['min_count'] = {'tokens': 3}
    make_vocab_from_params(self.params, self.TEST_DIR)

    vocab_files = os.listdir(vocab_path)
    assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

    with open(vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]
    tokens.sort()
    assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are']

    with open(vocab_path / 'labels.txt') as f:
        labels = [line.strip() for line in f]
    labels.sort()
    assert labels == ['N', 'V']
def test_make_vocab_makes_vocab(self):
    vocab_path = self.TEST_DIR / 'vocabulary'

    make_vocab_from_params(self.params, self.TEST_DIR)

    vocab_files = os.listdir(vocab_path)
    assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

    with open(vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]
    tokens.sort()
    assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are', 'birds', 'cats', 'dogs', 'snakes']

    with open(vocab_path / 'labels.txt') as f:
        labels = [line.strip() for line in f]
    labels.sort()
    assert labels == ['N', 'V']
def test_make_vocab_with_extension(self):
    existing_serialization_dir = self.TEST_DIR / 'existing'
    extended_serialization_dir = self.TEST_DIR / 'extended'
    existing_vocab_path = existing_serialization_dir / 'vocabulary'
    extended_vocab_path = extended_serialization_dir / 'vocabulary'

    vocab = Vocabulary()
    vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
    vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = existing_vocab_path
    self.params['vocabulary']['extend'] = True
    self.params['vocabulary']['min_count'] = {'tokens': 3}
    make_vocab_from_params(self.params, extended_serialization_dir)

    vocab_files = os.listdir(extended_vocab_path)
    assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

    with open(extended_vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]
    assert tokens[0] == '@@UNKNOWN@@'
    assert tokens[1] == 'some_weird_token_1'
    assert tokens[2] == 'some_weird_token_2'

    tokens.sort()
    assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are',
                      'some_weird_token_1', 'some_weird_token_2']

    with open(extended_vocab_path / 'labels.txt') as f:
        labels = [line.strip() for line in f]
    labels.sort()
    assert labels == ['N', 'V']
def test_make_vocab_makes_vocab_with_config(self): vocab_path = self.TEST_DIR / "vocabulary" self.params["vocabulary"] = {} self.params["vocabulary"]["min_count"] = {"tokens": 3} make_vocab_from_params(self.params, self.TEST_DIR) vocab_files = os.listdir(vocab_path) assert set(vocab_files) == { "labels.txt", "non_padded_namespaces.txt", "tokens.txt" } with open(vocab_path / "tokens.txt") as f: tokens = [line.strip() for line in f] tokens.sort() assert tokens == [".", "@@UNKNOWN@@", "animals", "are"] with open(vocab_path / "labels.txt") as f: labels = [line.strip() for line in f] labels.sort() assert labels == ["N", "V"]
def test_make_vocab_makes_vocab(self): vocab_path = self.TEST_DIR / "vocabulary" make_vocab_from_params(self.params, self.TEST_DIR) vocab_files = os.listdir(vocab_path) assert set(vocab_files) == { "labels.txt", "non_padded_namespaces.txt", "tokens.txt" } with open(vocab_path / "tokens.txt") as f: tokens = [line.strip() for line in f] tokens.sort() assert tokens == [ ".", "@@UNKNOWN@@", "animals", "are", "birds", "cats", "dogs", "snakes" ] with open(vocab_path / "labels.txt") as f: labels = [line.strip() for line in f] labels.sort() assert labels == ["N", "V"]
def test_make_vocab_fails_without_vocabulary_key(self):
    with pytest.raises(ConfigurationError):
        make_vocab_from_params(self.params)
def test_make_vocab_succeeds_without_vocabulary_key(self):
    make_vocab_from_params(self.params, self.TEST_DIR)
def build_vocab():
    from allennlp.commands.make_vocab import make_vocab_from_params

    jsonnet_file = os.path.join(root, 'configs/baseline_bert.jsonnet')
    params = Params.from_file(jsonnet_file)
    make_vocab_from_params(params, '/datadrive/bert_vocab')
"Specify a list of treebanks to use; leave blank to default to all treebanks available" ) parser.add_argument("--params_file", default=None, type=str, help="The path to the vocab params") args = parser.parse_args() import_submodules("udify") params_file = util.VOCAB_CONFIG_PATH if not args.params_file else args.params_file treebanks = sorted( util.get_ud_treebank_files(args.dataset_dir, args.treebanks).items()) for treebank, (train_file, dev_file, test_file) in treebanks: logger.info(f"Creating vocabulary for treebank {treebank}") if not train_file: logger.info(f"No training data for {treebank}, skipping") continue overrides = json.dumps({ "train_data_path": train_file, "validation_data_path": dev_file, "test_data_path": test_file }) params = Params.from_file(params_file, overrides) output_file = os.path.join(args.output_dir, treebank) make_vocab_from_params(params, output_file)