예제 #1
0
 def __init__(self,
              vectorizers,
              trim=False,
              truncate=False,
              mxlen=-1,
              **kwargs):
     """Store the reader options and build the label vectorizer.

     :param vectorizers: Feature vectorizers, stored on ``self.vectorizers``.
     :param trim: Stored on ``self.trim``.
     :param truncate: Stored on ``self.truncate``.
     :param mxlen: Passed to the default ``Dict1DVectorizer`` that is used
         when no ``label_vectorizer`` spec is supplied.
     :param kwargs: May contain ``label_vectorizer``, a dict of keyword
         arguments for ``create_vectorizer``.  Within that dict,
         ``model_file`` / ``vocab_file`` are replaced by the result of
         ``SingleFileDownloader(...).download()``, ``transform`` is copied
         to ``transform_fn``, and a string ``transform_fn`` is evaluated
         into an object.  The caller's dict is left unmodified.
     """
     super().__init__()
     self.vectorizers = vectorizers
     self.trim = trim
     self.truncate = truncate
     label_vectorizer_spec = kwargs.get('label_vectorizer', None)
     if label_vectorizer_spec:
         # Work on a shallow copy: the rewrites below (download results,
         # evaluated transform) must not leak into the caller's config.
         label_vectorizer_spec = dict(label_vectorizer_spec)
         cache = label_vectorizer_spec.get("data_download_cache",
                                           os.path.expanduser("~/.bl-data"))
         # Swap each file reference for whatever the downloader returns.
         for file_key in ('model_file', 'vocab_file'):
             if file_key in label_vectorizer_spec:
                 label_vectorizer_spec[file_key] = SingleFileDownloader(
                     label_vectorizer_spec[file_key], cache).download()
         # `transform` is accepted as an alias and overrides `transform_fn`.
         if 'transform' in label_vectorizer_spec:
             label_vectorizer_spec['transform_fn'] = label_vectorizer_spec[
                 'transform']
         if 'transform_fn' in label_vectorizer_spec and isinstance(
                 label_vectorizer_spec['transform_fn'], str):
             # NOTE(security): eval() executes arbitrary code taken from the
             # spec; only use this with trusted configuration.
             label_vectorizer_spec['transform_fn'] = eval(
                 label_vectorizer_spec['transform_fn'])
         self.label_vectorizer = create_vectorizer(**label_vectorizer_spec)
     else:
         self.label_vectorizer = Dict1DVectorizer(fields='y', mxlen=mxlen)
     # Seed the label vocabulary with the PAD, GO and EOS offsets.
     self.label2index = {
         Offsets.VALUES[Offsets.PAD]: Offsets.PAD,
         Offsets.VALUES[Offsets.GO]: Offsets.GO,
         Offsets.VALUES[Offsets.EOS]: Offsets.EOS
     }
예제 #2
0
    def _create_vectorizers(self, vecs_set=None):
        """Read the `features` section of the mead config and instantiate one vectorizer per feature.

        Each feature's `vectorizer` sub-section is merged over a base config (``{'type': 'token1d'}``, or the
        entry of `vecs_set` named by the sub-section's `label`), file references are passed through
        ``SingleFileDownloader``, `mxlen`/`mxwlen` fall back to the `preproc` section (then ``-1``), and the
        result is handed to ``baseline.create_vectorizer``.  ``self.primary_key`` is the first feature's name
        unless a feature sets ``primary: true`` (the last such feature wins).

        :param vecs_set: Mapping from vectorizer label to base settings; needed only when a feature's
            `vectorizer` section contains a `label`
        :return: (``dict``) - A dictionary of the vectorizers keyed by feature name (also stored on
            ``self.vectorizers``)
        :raises ValueError: If a feature name contains "-", or a `label` cannot be found in `vecs_set`
        """
        self.vectorizers = {}

        features = self.config_params['features']
        assert_unique_feature_names([f['name'] for f in features])
        self.primary_key = features[0]['name']
        # Loop-invariant: section-level defaults for mxlen / mxwlen.
        preproc = self.config_params.get('preproc', {})
        for feature in features:
            key = feature['name']
            if '-' in key:
                raise ValueError('Feature names cannot contain "-". Found feature named "{}"'.format(key))
            if feature.get('primary', False) is True:
                self.primary_key = key

            vectorizer_section = feature.get('vectorizer', {})
            vecs_global_config = {'type': 'token1d'}
            if 'label' in vectorizer_section:
                label = vectorizer_section['label']
                vecs_global_config = vecs_set.get(label) if vecs_set is not None else None
                if vecs_global_config is None:
                    # Fail with a readable message instead of an AttributeError/TypeError further down.
                    raise ValueError(
                        'Feature "{}" references vectorizer label "{}", which was not found in `vecs_set`'.format(
                            key, label))

            # Feature-level settings override the base config; this also builds a fresh dict, so the
            # config itself is not modified by the assignments below.
            vectorizer_section = {**vecs_global_config, **vectorizer_section}
            vectorizer_section['data_download_cache'] = self.data_download_cache
            vec_file = vectorizer_section.get('file')
            if vec_file:
                vec_file = SingleFileDownloader(vec_file, self.data_download_cache).download()
                vectorizer_section['file'] = vec_file
            vectorizer_section['mxlen'] = vectorizer_section.get('mxlen', preproc.get('mxlen', -1))
            vectorizer_section['mxwlen'] = vectorizer_section.get('mxwlen', preproc.get('mxwlen', -1))
            for file_key in ('model_file', 'vocab_file'):
                if file_key in vectorizer_section:
                    vectorizer_section[file_key] = SingleFileDownloader(
                        vectorizer_section[file_key], self.data_download_cache).download()
            # `transform` is accepted as an alias and overrides `transform_fn`.
            if 'transform' in vectorizer_section:
                vectorizer_section['transform_fn'] = vectorizer_section['transform']

            if 'transform_fn' in vectorizer_section and isinstance(vectorizer_section['transform_fn'], str):
                # NOTE(security): eval() executes arbitrary code from the config; only load trusted configs.
                vectorizer_section['transform_fn'] = eval(vectorizer_section['transform_fn'])

            vectorizer = baseline.create_vectorizer(**vectorizer_section)
            self.vectorizers[key] = vectorizer
        return self.vectorizers
예제 #3
0
파일: reader.py 프로젝트: sagnik/baseline
    def __init__(self, vectorizers, trim=False, truncate=False, mxlen=-1, **kwargs):
        """Store the reader options and build one label vectorizer per label field.

        :param vectorizers: Feature vectorizers, stored on ``self.vectorizers``.
        :param trim: Stored on ``self.trim``.
        :param truncate: Stored on ``self.truncate``.
        :param mxlen: Passed to the default ``Dict1DVectorizer`` used when no
            ``label_vectorizers`` are supplied.
        :param kwargs: May contain ``label_vectorizers``, a dict keyed by label name.
            Each value is either a dict of keyword arguments for ``create_vectorizer``
            (``label`` defaults to the key; ``model_file`` / ``vocab_file`` are passed
            through ``SingleFileDownloader``) or an already constructed vectorizer,
            which is used as-is.  When absent or ``None``, a single ``'y'``
            ``Dict1DVectorizer`` is used.
        """
        super().__init__()
        self.vectorizers = vectorizers
        self.trim = trim
        self.truncate = truncate
        label_vectorizer_spec_dict = kwargs.get('label_vectorizers')
        if label_vectorizer_spec_dict is None:
            label_vectorizer_spec_dict = {'y': Dict1DVectorizer(fields='y', mxlen=mxlen)}
        self.label_vectorizers = {}
        self.label2index = {}
        for k, label_vectorizer_spec in label_vectorizer_spec_dict.items():
            if isinstance(label_vectorizer_spec, dict):
                # Copy so that filling in `label` and the downloaded paths
                # does not rewrite the caller's configuration.
                spec = dict(label_vectorizer_spec)
                if 'label' not in spec:
                    spec['label'] = k
                cache = spec.get("data_download_cache", os.path.expanduser("~/.bl-data"))
                for file_key in ('model_file', 'vocab_file'):
                    if file_key in spec:
                        spec[file_key] = SingleFileDownloader(spec[file_key], cache).download()
                label_vectorizer = create_vectorizer(**spec)
            else:
                # Not a spec dict (e.g. the default above): treat it as a
                # ready-made vectorizer rather than unpacking it as kwargs.
                label_vectorizer = label_vectorizer_spec

            self.label_vectorizers[k] = label_vectorizer
            # Each label field gets its own vocabulary seeded with PAD, GO and EOS.
            self.label2index[k] = {
                Offsets.VALUES[Offsets.PAD]: Offsets.PAD,
                Offsets.VALUES[Offsets.GO]: Offsets.GO,
                Offsets.VALUES[Offsets.EOS]: Offsets.EOS
            }