def _download_pretrained_model(prompt=True):
    """Downloads the pre-trained intent extraction model if non-existent."""
    model_info_exists = path.isfile(IntentExtractionApi.pretrained_model_info)
    model_exists = path.isfile(IntentExtractionApi.pretrained_model)
    if not model_exists or not model_info_exists:
        print(
            "The pre-trained models to be downloaded for the intent extraction dataset "
            "are licensed under Apache 2.0. By downloading, you accept the terms "
            "and conditions provided by the license"
        )
        makedirs(IntentExtractionApi.model_dir, exist_ok=True)
        if prompt is True:
            agreed = IntentExtractionApi._prompt()
            if agreed is False:
                sys.exit(0)
        download_unlicensed_file(
            "https://s3-us-west-2.amazonaws.com/nlp-architect-data/models/intent/",
            "model_info.dat",
            IntentExtractionApi.pretrained_model_info,
        )
        download_unlicensed_file(
            "https://s3-us-west-2.amazonaws.com/nlp-architect-data/models/intent/",
            "model.h5",
            IntentExtractionApi.pretrained_model,
        )
        print("Done.")
def load_parser(chunker):
    # load spacy parser
    logger.info("loading spacy. chunker=%s", chunker)
    if "nlp_arch" in chunker:
        parser = SpacyInstance(
            model="en_core_web_sm", disable=["textcat", "ner", "parser"]
        ).parser
        parser.add_pipe(parser.create_pipe("sentencizer"), first=True)
        _path_to_model = path.join(chunker_path, chunker_model_file)
        _path_to_params = path.join(chunker_path, chunker_model_dat_file)
        if not path.exists(chunker_path):
            makedirs(chunker_path)
        if not path.exists(_path_to_model):
            logger.info(
                "The pre-trained model to be downloaded for NLP Architect"
                " word chunker model is licensed under Apache 2.0"
            )
            download_unlicensed_file(nlp_chunker_url, chunker_model_file, _path_to_model)
        if not path.exists(_path_to_params):
            download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file, _path_to_params)
        parser.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params), last=True)
    else:
        parser = SpacyInstance(model="en_core_web_sm", disable=["textcat", "ner"]).parser
    logger.info("spacy loaded")
    return parser
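# A minimal usage sketch for load_parser, assuming the companion
# `get_noun_phrases` helper that reads NPAnnotator's spans off a parsed doc;
# the sample sentence is made up for illustration.
def _example_chunk(text="The quick brown fox jumped over the lazy dog"):
    parser = load_parser(chunker="nlp_arch")
    doc = parser(text)
    return [span.text for span in get_noun_phrases(doc)]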
def download_model(self):
    # Validate contents of data_path folder:
    data_path = self.data_path
    download = False
    for file_name in self.file_name_dict.values():
        if not os.path.exists(os.path.join(data_path, file_name)):
            # prompt
            download = True
            print('The following required file is missing:', file_name)
    if download is True:
        if self.prompt is True:
            license_prompt('mrc_data',
                           'https://s3-us-west-2.amazonaws.com/nlp-architect-data/models/mrc'
                           '/mrc_data.zip',
                           self.data_dir)
            license_prompt('mrc_model',
                           'https://s3-us-west-2.amazonaws.com/nlp-architect-data/models/mrc'
                           '/mrc_model.zip',
                           self.model_dir)
        data_zipfile = os.path.join(self.data_dir, 'mrc_data.zip')
        model_zipfile = os.path.join(self.model_dir, 'mrc_model.zip')
        makedirs(self.data_dir, exist_ok=True)
        makedirs(self.model_dir, exist_ok=True)
        download_unlicensed_file('https://s3-us-west-2.amazonaws.com/nlp-architect-data'
                                 '/models/mrc/', 'mrc_data.zip', data_zipfile)
        download_unlicensed_file('https://s3-us-west-2.amazonaws.com/nlp-architect-data'
                                 '/models/mrc/', 'mrc_model.zip', model_zipfile)
        with zipfile.ZipFile(data_zipfile) as data_zip_ref:
            data_zip_ref.extractall(self.data_dir)
        with zipfile.ZipFile(model_zipfile) as model_zip_ref:
            model_zip_ref.extractall(self.model_dir)
def _download_pretrained_model(self, prompt=True):
    """Downloads the pre-trained NER model if non-existent."""
    model_exists = path.isfile(self.pretrained_model)
    model_info_exists = path.isfile(self.pretrained_model_info)
    if not model_exists or not model_info_exists:
        print(
            "The pre-trained models to be downloaded for the NER dataset "
            "are licensed under Apache 2.0. By downloading, you accept the terms "
            "and conditions provided by the license"
        )
        makedirs(self.model_dir, exist_ok=True)
        if prompt is True:
            agreed = NerApi._prompt()
            if agreed is False:
                sys.exit(0)
        download_unlicensed_file(
            "https://d2zs9tzlek599f.cloudfront.net/models/ner/",
            "model_v4.h5",
            self.pretrained_model,
        )
        download_unlicensed_file(
            "https://d2zs9tzlek599f.cloudfront.net/models/ner/",
            "model_info_v4.dat",
            self.pretrained_model_info,
        )
        print("Done.")
def _download_pretrained_rerank_model(rerank_model_full_path):
    rerank_model_dir = path.dirname(rerank_model_full_path)
    if not path.isfile(rerank_model_full_path):
        makedirs(rerank_model_dir, exist_ok=True)
        print('Downloading pre-trained reranking model...')
        download_unlicensed_file('https://s3-us-west-2.amazonaws.com/nlp-architect-data/models/'
                                 'absa/', 'rerank_model.h5', rerank_model_full_path)
    return rerank_model_full_path
def _download_pretrained_model():
    """Downloads the pre-trained BIST model if non-existent."""
    if not path.isfile(path.join(SpacyBISTParser.dir, 'bist-pretrained', 'bist.model')):
        print('Downloading pre-trained BIST model...')
        zip_file = path.join(SpacyBISTParser.dir, 'bist-pretrained.zip')
        download_unlicensed_file('https://s3-us-west-1.amazonaws.com/nervana-modelzoo/parse/',
                                 'bist-pretrained.zip', zip_file)
        print('Unzipping...')
        unzip_file(zip_file, outpath=SpacyBISTParser.dir)
        remove(zip_file)
        print('Done.')
def _download_pretrained_rerank_model(rerank_model_full_path):
    rerank_model_dir = path.dirname(rerank_model_full_path)
    if not path.isfile(rerank_model_full_path):
        makedirs(rerank_model_dir, exist_ok=True)
        print("Downloading pre-trained reranking model...")
        download_unlicensed_file(
            "https://d2zs9tzlek599f.cloudfront.net/models/absa/",
            "rerank_model.h5",
            rerank_model_full_path,
        )
    return rerank_model_full_path
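# Usage sketch for the rerank downloaders above: the caller passes the full
# destination path and gets the same path back once the file exists locally
# (the "absa_models" directory name here is hypothetical).
def _example_get_rerank_model():
    return _download_pretrained_rerank_model(path.join("absa_models", "rerank_model.h5"))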
def _download_pretrained_model():
    """Downloads the pre-trained BIST model if non-existent."""
    if not path.isfile(SpacyBISTParser.dir / 'bist.model'):
        print('Downloading pre-trained BIST model...')
        zip_path = SpacyBISTParser.dir / 'bist-pretrained.zip'
        makedirs(SpacyBISTParser.dir, exist_ok=True)
        download_unlicensed_file(
            'https://s3-us-west-2.amazonaws.com/nlp-architect-data/models/dep_parse/',
            'bist-pretrained.zip', zip_path)
        print('Unzipping...')
        uncompress_file(zip_path, outpath=str(SpacyBISTParser.dir))
        remove(zip_path)
        print('Done.')
def _download_pretrained_model():
    """Downloads the pre-trained BIST model if non-existent."""
    dir_path = path.join(SpacyBISTParser.dir, 'bist-pretrained')
    if not path.isfile(path.join(dir_path, 'bist.model')):
        print('Downloading pre-trained BIST model...')
        zip_path = path.join(SpacyBISTParser.dir, 'bist-pretrained.zip')
        download_unlicensed_file('https://s3-us-west-1.amazonaws.com/nervana-modelzoo/parse/',
                                 'bist-pretrained.zip', zip_path)
        makedirs(dir_path, exist_ok=True)
        print('Unzipping...')
        uncompress_file(zip_path, outpath=dir_path)
        remove(zip_path)
        print('Done.')
def _download_pretrained_model():
    """Downloads the pre-trained BIST model if non-existent."""
    if not path.isfile(SpacyBISTParser.dir / "bist.model"):
        print("Downloading pre-trained BIST model...")
        zip_path = SpacyBISTParser.dir / "bist-pretrained.zip"
        makedirs(SpacyBISTParser.dir, exist_ok=True)
        download_unlicensed_file(
            "https://d2zs9tzlek599f.cloudfront.net/models/dep_parse/",
            "bist-pretrained.zip",
            zip_path,
        )
        print("Unzipping...")
        uncompress_file(zip_path, outpath=str(SpacyBISTParser.dir))
        remove(zip_path)
        print("Done.")
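# The BIST downloaders above all repeat the same fetch / unzip / clean-up
# steps against different hosts. A minimal consolidated sketch, assuming the
# same download_unlicensed_file and uncompress_file helpers; the function and
# parameter names are hypothetical.
def fetch_and_unzip(base_url, zip_name, dest_dir):
    zip_path = path.join(str(dest_dir), zip_name)
    makedirs(str(dest_dir), exist_ok=True)
    download_unlicensed_file(base_url, zip_name, zip_path)
    uncompress_file(zip_path, outpath=str(dest_dir))
    remove(zip_path)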
def download_model(self):
    # Validate contents of data_path folder:
    data_path = self.data_path
    download = False
    for file_name in self.file_name_dict.values():
        if not os.path.exists(os.path.join(data_path, file_name)):
            # prompt
            download = True
            print("The following required file is missing:", file_name)
    if download is True:
        if self.prompt is True:
            license_prompt(
                "mrc_data",
                "https://d2zs9tzlek599f.cloudfront.net/models/mrc/mrc_data.zip",
                self.data_dir,
            )
            license_prompt(
                "mrc_model",
                "https://d2zs9tzlek599f.cloudfront.net/models/mrc/mrc_model.zip",
                self.model_dir,
            )
        data_zipfile = os.path.join(self.data_dir, "mrc_data.zip")
        model_zipfile = os.path.join(self.model_dir, "mrc_model.zip")
        makedirs(self.data_dir, exist_ok=True)
        makedirs(self.model_dir, exist_ok=True)
        download_unlicensed_file(
            "https://d2zs9tzlek599f.cloudfront.net/models/mrc/",
            "mrc_data.zip",
            data_zipfile,
        )
        download_unlicensed_file(
            "https://d2zs9tzlek599f.cloudfront.net/models/mrc/",
            "mrc_model.zip",
            model_zipfile,
        )
        with zipfile.ZipFile(data_zipfile) as data_zip_ref:
            data_zip_ref.extractall(self.data_dir)
        with zipfile.ZipFile(model_zipfile) as model_zip_ref:
            model_zip_ref.extractall(self.model_dir)
def _download_pretrained_model(self, prompt=True):
    """Downloads the pre-trained NER model if non-existent."""
    dir_path = path.join(self.dir, 'ner-pretrained')
    model_exists = path.isfile(path.join(dir_path, 'model.h5'))
    model_info_exists = path.isfile(path.join(dir_path, 'model_info.dat'))
    if not model_exists or not model_info_exists:
        print('The pre-trained models to be downloaded for the NER dataset '
              'are licensed under Apache 2.0. By downloading, you accept the terms '
              'and conditions provided by the license')
        makedirs(dir_path, exist_ok=True)
        if prompt is True:
            agreed = NerApi._prompt()
            if agreed is False:
                sys.exit(0)
        download_unlicensed_file('http://nervana-modelzoo.s3.amazonaws.com/NLP/ner/',
                                 'model.h5', self.model_path)
        download_unlicensed_file('http://nervana-modelzoo.s3.amazonaws.com/NLP/ner/',
                                 'model_info.dat', self.model_info_path)
        print('Done.')
def _download_pretrained_model(self, prompt=True):
    """Downloads the pre-trained NER model if non-existent."""
    model_exists = path.isfile(self.pretrained_model)
    model_info_exists = path.isfile(self.pretrained_model_info)
    if not model_exists or not model_info_exists:
        print('The pre-trained models to be downloaded for the NER dataset '
              'are licensed under Apache 2.0. By downloading, you accept the terms '
              'and conditions provided by the license')
        makedirs(self.model_dir, exist_ok=True)
        if prompt is True:
            agreed = NerApi._prompt()
            if agreed is False:
                sys.exit(0)
        download_unlicensed_file('https://s3-us-west-2.amazonaws.com/nlp-architect-data'
                                 '/models/ner/', 'model_v4.h5', self.pretrained_model)
        download_unlicensed_file('https://s3-us-west-2.amazonaws.com/nlp-architect-data'
                                 '/models/ner/', 'model_info_v4.dat', self.pretrained_model_info)
        print('Done.')
def __init__(self, parser=None):
    if parser is None:
        self.nlp = SpacyInstance(
            disable=['ner', 'parser', 'vectors', 'textcat']).parser
    else:
        self.nlp = parser
    self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'), first=True)
    _path_to_model = path.join(chunker_local_path, chunker_model_file)
    if not path.exists(chunker_local_path):
        makedirs(chunker_local_path)
    if not path.exists(_path_to_model):
        logger.info(
            'The pre-trained model to be downloaded for NLP Architect word'
            ' chunker model is licensed under Apache 2.0')
        download_unlicensed_file(nlp_chunker_url, chunker_model_file, _path_to_model)
    _path_to_params = path.join(chunker_local_path, chunker_model_dat_file)
    if not path.exists(_path_to_params):
        download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file, _path_to_params)
    self.nlp.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params), last=True)
def _download_pretrained_model(prompt=True):
    """Downloads the pre-trained intent extraction model if non-existent."""
    model_info_exists = path.isfile(IntentExtractionApi.pretrained_model_info)
    model_exists = path.isfile(IntentExtractionApi.pretrained_model)
    if not model_exists or not model_info_exists:
        print('The pre-trained models to be downloaded for the intent extraction dataset '
              'are licensed under Apache 2.0. By downloading, you accept the terms '
              'and conditions provided by the license')
        makedirs(IntentExtractionApi.model_dir, exist_ok=True)
        if prompt is True:
            agreed = IntentExtractionApi._prompt()
            if agreed is False:
                sys.exit(0)
        download_unlicensed_file(
            'http://nervana-modelzoo.s3.amazonaws.com/NLP/intent/',
            'model_info.dat', IntentExtractionApi.pretrained_model_info)
        download_unlicensed_file(
            'http://nervana-modelzoo.s3.amazonaws.com/NLP/intent/',
            'model.h5', IntentExtractionApi.pretrained_model)
        print('Done.')
def download_model(self):
    # Validate contents of data_path folder:
    data_path = self.data_path
    download = False
    for file_name in self.file_name_dict.values():
        if not os.path.exists(os.path.join(data_path, file_name)):
            # prompt
            download = True
            print('The following required file is missing:', file_name)
    if download is True:
        if self.prompt is True:
            license_prompt(
                'mrc_data',
                'http://nervana-modelzoo.s3.amazonaws.com/NLP/mrc/mrc_data.zip',
                self.data_dir)
            license_prompt(
                'mrc_model',
                'http://nervana-modelzoo.s3.amazonaws.com/NLP/mrc/mrc_model.zip',
                self.model_dir)
        data_zipfile = os.path.join(self.data_dir, 'mrc_data.zip')
        model_zipfile = os.path.join(self.model_dir, 'mrc_model.zip')
        makedirs(self.data_dir, exist_ok=True)
        makedirs(self.model_dir, exist_ok=True)
        download_unlicensed_file(
            'http://nervana-modelzoo.s3.amazonaws.com/NLP/mrc/',
            'mrc_data.zip', data_zipfile)
        download_unlicensed_file(
            'http://nervana-modelzoo.s3.amazonaws.com/NLP/mrc/',
            'mrc_model.zip', model_zipfile)
        with zipfile.ZipFile(data_zipfile, 'r') as data_zip_ref:
            data_zip_ref.extractall(self.data_dir)
        with zipfile.ZipFile(model_zipfile, 'r') as model_zip_ref:
            model_zip_ref.extractall(self.model_dir)
def load_data(self):
    """
    Fetch and extract the Facebook bAbI-dialog dataset if not already downloaded.

    Returns:
        tuple: training and test filenames are returned
    """
    if self.task < 5:
        self.candidate_answer_filename = "dialog-babi-candidates.txt"
        self.kb_filename = "dialog-babi-kb-all.txt"
        self.cands_mat_filename = "babi-cands-with-matchtype_{}.npy"
        self.vocab_filename = "dialog-babi-vocab-task{}".format(
            self.task + 1) + "_matchtype{}.pkl".format(self.use_match_type)
    else:
        self.candidate_answer_filename = "dialog-babi-task6-dstc2-candidates.txt"
        self.kb_filename = "dialog-babi-task6-dstc2-kb.txt"
        self.cands_mat_filename = "dstc2-cands-with-matchtype_{}.npy"
        self.vocab_filename = "dstc2-vocab-task{}_matchtype{}.pkl".format(
            self.task + 1, self.use_match_type)
    self.vectorized_filename = "vectorized_task{}.pkl".format(self.task + 1)

    self.data_dict = {}
    self.vocab = None
    self.workdir, filepath = valid_path_append(self.path, "", self.filename)
    if not os.path.exists(filepath):
        if license_prompt("bAbI-dialog",
                          "https://research.fb.com/downloads/babi/",
                          self.path) is False:
            sys.exit(0)
        download_unlicensed_file(self.url, self.filename, filepath, self.size)

    self.babi_dir_name = self.filename.split(".")[0]

    self.candidate_answer_filename = self.babi_dir_name + "/" + self.candidate_answer_filename
    self.kb_filename = self.babi_dir_name + "/" + self.kb_filename
    self.cands_mat_filename = os.path.join(
        self.workdir, self.babi_dir_name + "/" + self.cands_mat_filename)
    self.vocab_filename = self.babi_dir_name + "/" + self.vocab_filename
    self.vectorized_filename = self.babi_dir_name + "/" + self.vectorized_filename

    task_name = self.babi_dir_name + "/" + self.tasks[self.task] + "{}.txt"

    train_file = os.path.join(self.workdir, task_name.format("trn"))
    dev_file = os.path.join(self.workdir, task_name.format("dev"))
    test_file_postfix = "tst-OOV" if self.oov else "tst"
    test_file = os.path.join(self.workdir, task_name.format(test_file_postfix))

    cand_file = os.path.join(self.workdir, self.candidate_answer_filename)
    kb_file = os.path.join(self.workdir, self.kb_filename)
    vocab_file = os.path.join(self.workdir, self.vocab_filename)
    vectorized_file = os.path.join(self.workdir, self.vectorized_filename)

    if (os.path.exists(train_file) is False
            or os.path.exists(dev_file) is False
            or os.path.exists(test_file) is False
            or os.path.exists(cand_file) is False):
        with tarfile.open(filepath, "r:gz") as f:
            f.extractall(self.workdir)

    return train_file, dev_file, test_file, cand_file, kb_file, vocab_file, vectorized_file
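# Usage sketch for load_data: the seven returned paths unpack in order
# (`dataset` stands in for an instance of the owning class, which is not
# shown in this file):
#
#     (train_file, dev_file, test_file, cand_file,
#      kb_file, vocab_file, vectorized_file) = dataset.load_data()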
def download(url, filename, local_path):
    if not os.path.exists(local_path):
        download_unlicensed_file(url, filename, local_path)
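# Usage sketch for download(): fetch the chunker model only when the local
# copy is missing, reusing the chunker names defined alongside the loaders
# above (that they live in the same module is an assumption).
def _example_fetch_chunker():
    download(nlp_chunker_url, chunker_model_file,
             path.join(chunker_path, chunker_model_file))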
# Assumption: the truncated opening branch read gzipped corpora with the same
# encoding/error settings (requires `import gzip`).
if args.corpus.endswith('gz'):
    corpus_file = gzip.open(args.corpus, 'rt', encoding='utf8', errors='ignore')
else:
    corpus_file = open(args.corpus, 'r', encoding='utf8', errors='ignore')
with open(args.marked_corpus, 'w', encoding='utf8') as marked_corpus_file:
    # load spacy parser
    logger.info('loading spacy')
    if 'nlp_arch' in args.chunker:
        nlp = SpacyInstance(model='en_core_web_sm',
                            disable=['textcat', 'ner', 'parser']).parser
        nlp.add_pipe(nlp.create_pipe('sentencizer'), first=True)
        logger.info(
            'The pre-trained model to be downloaded for NLP Architect word'
            ' chunker model is licensed under Apache 2.0')
        _path_to_model = path.join(cur_dir, chunker_model_file)
        download_unlicensed_file(nlp_chunker_url, chunker_model_file, _path_to_model)
        _path_to_params = path.join(cur_dir, chunker_model_dat_file)
        download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file, _path_to_params)
        logger.info('Done.')
        nlp.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params), last=True)
    else:
        nlp = SpacyInstance(model='en_core_web_sm', disable=['textcat', 'ner']).parser
    logger.info('spacy loaded')
    num_lines = sum(1 for line in corpus_file)
    corpus_file.seek(0)
    logger.info('%i lines in corpus', num_lines)
    i = 0