def _download_pretrained_model(prompt=True):
    """Downloads the pre-trained intent extraction model if non-existent."""
    model_info_exists = path.isfile(IntentExtractionApi.pretrained_model_info)
    model_exists = path.isfile(IntentExtractionApi.pretrained_model)
    if not model_exists or not model_info_exists:
        print(
            "The pre-trained models to be downloaded for the intent extraction dataset "
            "are licensed under Apache 2.0. By downloading, you accept the terms "
            "and conditions provided by the license"
        )
        makedirs(IntentExtractionApi.model_dir, exist_ok=True)
        if prompt is True:
            agreed = IntentExtractionApi._prompt()
            if agreed is False:
                sys.exit(0)
        download_unlicensed_file(
            "https://s3-us-west-2.amazonaws.com/nlp-architect-data/models/intent/",
            "model_info.dat",
            IntentExtractionApi.pretrained_model_info,
        )
        download_unlicensed_file(
            "https://s3-us-west-2.amazonaws.com/nlp-architect-data/models/intent/",
            "model.h5",
            IntentExtractionApi.pretrained_model,
        )
        print("Done.")
def load_parser(chunker):
    # load spacy parser
    logger.info("loading spacy. chunker=%s", chunker)
    if "nlp_arch" in chunker:
        parser = SpacyInstance(
            model="en_core_web_sm", disable=["textcat", "ner", "parser"]
        ).parser
        parser.add_pipe(parser.create_pipe("sentencizer"), first=True)
        _path_to_model = path.join(chunker_path, chunker_model_file)
        _path_to_params = path.join(chunker_path, chunker_model_dat_file)
        if not path.exists(chunker_path):
            makedirs(chunker_path)
        if not path.exists(_path_to_model):
            logger.info(
                "The pre-trained model to be downloaded for NLP Architect"
                " word chunker model is licensed under Apache 2.0"
            )
            download_unlicensed_file(nlp_chunker_url, chunker_model_file, _path_to_model)
        if not path.exists(_path_to_params):
            download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file, _path_to_params)
        parser.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params), last=True)
    else:
        parser = SpacyInstance(model="en_core_web_sm", disable=["textcat", "ner"]).parser
    logger.info("spacy loaded")
    return parser
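# A minimal usage sketch for load_parser, assuming the companion
# `get_noun_phrases` helper that reads NPAnnotator's spans off a parsed doc;
# the sample sentence is made up for illustration.
def _example_chunk(text="The quick brown fox jumped over the lazy dog"):
    parser = load_parser(chunker="nlp_arch")
    doc = parser(text)
    return [span.text for span in get_noun_phrases(doc)]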
def download_model(self):
    # Validate contents of data_path folder:
    data_path = self.data_path
    download = False
    for file_name in self.file_name_dict.values():
        if not os.path.exists(os.path.join(data_path, file_name)):
            # prompt
            download = True
            print('The following required file is missing:', file_name)
    if download is True:
        if self.prompt is True:
            license_prompt('mrc_data',
                           'https://s3-us-west-2.amazonaws.com/nlp-architect-data/models/mrc'
                           '/mrc_data.zip',
                           self.data_dir)
            license_prompt('mrc_model',
                           'https://s3-us-west-2.amazonaws.com/nlp-architect-data/models/mrc'
                           '/mrc_model.zip',
                           self.model_dir)
        data_zipfile = os.path.join(self.data_dir, 'mrc_data.zip')
        model_zipfile = os.path.join(self.model_dir, 'mrc_model.zip')
        makedirs(self.data_dir, exist_ok=True)
        makedirs(self.model_dir, exist_ok=True)
        download_unlicensed_file('https://s3-us-west-2.amazonaws.com/nlp-architect-data'
                                 '/models/mrc/', 'mrc_data.zip', data_zipfile)
        download_unlicensed_file('https://s3-us-west-2.amazonaws.com/nlp-architect-data'
                                 '/models/mrc/', 'mrc_model.zip', model_zipfile)
        with zipfile.ZipFile(data_zipfile) as data_zip_ref:
            data_zip_ref.extractall(self.data_dir)
        with zipfile.ZipFile(model_zipfile) as model_zip_ref:
            model_zip_ref.extractall(self.model_dir)
def _download_pretrained_model(self, prompt=True):
    """Downloads the pre-trained NER model if non-existent."""
    model_exists = path.isfile(self.pretrained_model)
    model_info_exists = path.isfile(self.pretrained_model_info)
    if not model_exists or not model_info_exists:
        print(
            "The pre-trained models to be downloaded for the NER dataset "
            "are licensed under Apache 2.0. By downloading, you accept the terms "
            "and conditions provided by the license"
        )
        makedirs(self.model_dir, exist_ok=True)
        if prompt is True:
            agreed = NerApi._prompt()
            if agreed is False:
                sys.exit(0)
        download_unlicensed_file(
            "https://d2zs9tzlek599f.cloudfront.net/models/ner/",
            "model_v4.h5",
            self.pretrained_model,
        )
        download_unlicensed_file(
            "https://d2zs9tzlek599f.cloudfront.net/models/ner/",
            "model_info_v4.dat",
            self.pretrained_model_info,
        )
        print("Done.")
def _download_pretrained_rerank_model(rerank_model_full_path):
    rerank_model_dir = path.dirname(rerank_model_full_path)
    if not path.isfile(rerank_model_full_path):
        makedirs(rerank_model_dir, exist_ok=True)
        print('Downloading pre-trained reranking model...')
        download_unlicensed_file('https://s3-us-west-2.amazonaws.com/nlp-architect-data/models/'
                                 'absa/', 'rerank_model.h5', rerank_model_full_path)
    return rerank_model_full_path
def _download_pretrained_model():
    """Downloads the pre-trained BIST model if non-existent."""
    if not path.isfile(path.join(SpacyBISTParser.dir, 'bist-pretrained', 'bist.model')):
        print('Downloading pre-trained BIST model...')
        zip_file = path.join(SpacyBISTParser.dir, 'bist-pretrained.zip')
        download_unlicensed_file('https://s3-us-west-1.amazonaws.com/nervana-modelzoo/parse/',
                                 'bist-pretrained.zip', zip_file)
        print('Unzipping...')
        unzip_file(zip_file, outpath=SpacyBISTParser.dir)
        remove(zip_file)
        print('Done.')
def _download_pretrained_rerank_model(rerank_model_full_path):
    rerank_model_dir = path.dirname(rerank_model_full_path)
    if not path.isfile(rerank_model_full_path):
        makedirs(rerank_model_dir, exist_ok=True)
        print("Downloading pre-trained reranking model...")
        download_unlicensed_file(
            "https://d2zs9tzlek599f.cloudfront.net/models/absa/",
            "rerank_model.h5",
            rerank_model_full_path,
        )
    return rerank_model_full_path
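# Usage sketch for the rerank downloaders above: the caller passes the full
# destination path and gets the same path back once the file exists locally
# (the "absa_models" directory name here is hypothetical).
def _example_get_rerank_model():
    return _download_pretrained_rerank_model(path.join("absa_models", "rerank_model.h5"))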
def _download_pretrained_model():
    """Downloads the pre-trained BIST model if non-existent."""
    if not path.isfile(SpacyBISTParser.dir / 'bist.model'):
        print('Downloading pre-trained BIST model...')
        zip_path = SpacyBISTParser.dir / 'bist-pretrained.zip'
        makedirs(SpacyBISTParser.dir, exist_ok=True)
        download_unlicensed_file(
            'https://s3-us-west-2.amazonaws.com/nlp-architect-data/models/dep_parse/',
            'bist-pretrained.zip', zip_path)
        print('Unzipping...')
        uncompress_file(zip_path, outpath=str(SpacyBISTParser.dir))
        remove(zip_path)
        print('Done.')
def _download_pretrained_model():
    """Downloads the pre-trained BIST model if non-existent."""
    dir_path = path.join(SpacyBISTParser.dir, 'bist-pretrained')
    if not path.isfile(path.join(dir_path, 'bist.model')):
        print('Downloading pre-trained BIST model...')
        zip_path = path.join(SpacyBISTParser.dir, 'bist-pretrained.zip')
        download_unlicensed_file('https://s3-us-west-1.amazonaws.com/nervana-modelzoo/parse/',
                                 'bist-pretrained.zip', zip_path)
        makedirs(dir_path, exist_ok=True)
        print('Unzipping...')
        uncompress_file(zip_path, outpath=dir_path)
        remove(zip_path)
        print('Done.')
def _download_pretrained_model():
    """Downloads the pre-trained BIST model if non-existent."""
    if not path.isfile(SpacyBISTParser.dir / "bist.model"):
        print("Downloading pre-trained BIST model...")
        zip_path = SpacyBISTParser.dir / "bist-pretrained.zip"
        makedirs(SpacyBISTParser.dir, exist_ok=True)
        download_unlicensed_file(
            "https://d2zs9tzlek599f.cloudfront.net/models/dep_parse/",
            "bist-pretrained.zip",
            zip_path,
        )
        print("Unzipping...")
        uncompress_file(zip_path, outpath=str(SpacyBISTParser.dir))
        remove(zip_path)
        print("Done.")
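# The BIST downloaders above all repeat the same fetch / unzip / clean-up
# steps against different hosts. A minimal consolidated sketch, assuming the
# same download_unlicensed_file and uncompress_file helpers; the function and
# parameter names are hypothetical.
def fetch_and_unzip(base_url, zip_name, dest_dir):
    zip_path = path.join(str(dest_dir), zip_name)
    makedirs(str(dest_dir), exist_ok=True)
    download_unlicensed_file(base_url, zip_name, zip_path)
    uncompress_file(zip_path, outpath=str(dest_dir))
    remove(zip_path)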
def download_model(self):
    # Validate contents of data_path folder:
    data_path = self.data_path
    download = False
    for file_name in self.file_name_dict.values():
        if not os.path.exists(os.path.join(data_path, file_name)):
            # prompt
            download = True
            print("The following required file is missing:", file_name)
    if download is True:
        if self.prompt is True:
            license_prompt(
                "mrc_data",
                "https://d2zs9tzlek599f.cloudfront.net/models/mrc/mrc_data.zip",
                self.data_dir,
            )
            license_prompt(
                "mrc_model",
                "https://d2zs9tzlek599f.cloudfront.net/models/mrc/mrc_model.zip",
                self.model_dir,
            )
        data_zipfile = os.path.join(self.data_dir, "mrc_data.zip")
        model_zipfile = os.path.join(self.model_dir, "mrc_model.zip")
        makedirs(self.data_dir, exist_ok=True)
        makedirs(self.model_dir, exist_ok=True)
        download_unlicensed_file(
            "https://d2zs9tzlek599f.cloudfront.net/models/mrc/",
            "mrc_data.zip",
            data_zipfile,
        )
        download_unlicensed_file(
            "https://d2zs9tzlek599f.cloudfront.net/models/mrc/",
            "mrc_model.zip",
            model_zipfile,
        )
        with zipfile.ZipFile(data_zipfile) as data_zip_ref:
            data_zip_ref.extractall(self.data_dir)
        with zipfile.ZipFile(model_zipfile) as model_zip_ref:
            model_zip_ref.extractall(self.model_dir)
def _download_pretrained_model(self, prompt=True):
    """Downloads the pre-trained NER model if non-existent."""
    dir_path = path.join(self.dir, 'ner-pretrained')
    model_exists = path.isfile(path.join(dir_path, 'model.h5'))
    model_info_exists = path.isfile(path.join(dir_path, 'model_info.dat'))
    if not model_exists or not model_info_exists:
        print('The pre-trained models to be downloaded for the NER dataset '
              'are licensed under Apache 2.0. By downloading, you accept the terms '
              'and conditions provided by the license')
        makedirs(dir_path, exist_ok=True)
        if prompt is True:
            agreed = NerApi._prompt()
            if agreed is False:
                sys.exit(0)
        download_unlicensed_file('http://nervana-modelzoo.s3.amazonaws.com/NLP/ner/',
                                 'model.h5', self.model_path)
        download_unlicensed_file('http://nervana-modelzoo.s3.amazonaws.com/NLP/ner/',
                                 'model_info.dat', self.model_info_path)
        print('Done.')
def _download_pretrained_model(self, prompt=True):
    """Downloads the pre-trained NER model if non-existent."""
    model_exists = path.isfile(self.pretrained_model)
    model_info_exists = path.isfile(self.pretrained_model_info)
    if not model_exists or not model_info_exists:
        print('The pre-trained models to be downloaded for the NER dataset '
              'are licensed under Apache 2.0. By downloading, you accept the terms '
              'and conditions provided by the license')
        makedirs(self.model_dir, exist_ok=True)
        if prompt is True:
            agreed = NerApi._prompt()
            if agreed is False:
                sys.exit(0)
        download_unlicensed_file('https://s3-us-west-2.amazonaws.com/nlp-architect-data'
                                 '/models/ner/', 'model_v4.h5', self.pretrained_model)
        download_unlicensed_file('https://s3-us-west-2.amazonaws.com/nlp-architect-data'
                                 '/models/ner/', 'model_info_v4.dat', self.pretrained_model_info)
        print('Done.')
def __init__(self, parser=None):
    if parser is None:
        self.nlp = SpacyInstance(
            disable=['ner', 'parser', 'vectors', 'textcat']).parser
    else:
        self.nlp = parser
    self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'), first=True)
    _path_to_model = path.join(chunker_local_path, chunker_model_file)
    if not path.exists(chunker_local_path):
        makedirs(chunker_local_path)
    if not path.exists(_path_to_model):
        logger.info(
            'The pre-trained model to be downloaded for NLP Architect word'
            ' chunker model is licensed under Apache 2.0')
        download_unlicensed_file(nlp_chunker_url, chunker_model_file, _path_to_model)
    _path_to_params = path.join(chunker_local_path, chunker_model_dat_file)
    if not path.exists(_path_to_params):
        download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file, _path_to_params)
    self.nlp.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params), last=True)
def _download_pretrained_model(prompt=True):
    """Downloads the pre-trained intent extraction model if non-existent."""
    model_info_exists = path.isfile(IntentExtractionApi.pretrained_model_info)
    model_exists = path.isfile(IntentExtractionApi.pretrained_model)
    if not model_exists or not model_info_exists:
        print('The pre-trained models to be downloaded for the intent extraction dataset '
              'are licensed under Apache 2.0. By downloading, you accept the terms '
              'and conditions provided by the license')
        makedirs(IntentExtractionApi.model_dir, exist_ok=True)
        if prompt is True:
            agreed = IntentExtractionApi._prompt()
            if agreed is False:
                sys.exit(0)
        download_unlicensed_file(
            'http://nervana-modelzoo.s3.amazonaws.com/NLP/intent/',
            'model_info.dat', IntentExtractionApi.pretrained_model_info)
        download_unlicensed_file(
            'http://nervana-modelzoo.s3.amazonaws.com/NLP/intent/',
            'model.h5', IntentExtractionApi.pretrained_model)
        print('Done.')
def download_model(self):
    # Validate contents of data_path folder:
    data_path = self.data_path
    download = False
    for file_name in self.file_name_dict.values():
        if not os.path.exists(os.path.join(data_path, file_name)):
            # prompt
            download = True
            print('The following required file is missing:', file_name)
    if download is True:
        if self.prompt is True:
            license_prompt(
                'mrc_data',
                'http://nervana-modelzoo.s3.amazonaws.com/NLP/mrc/mrc_data.zip',
                self.data_dir)
            license_prompt(
                'mrc_model',
                'http://nervana-modelzoo.s3.amazonaws.com/NLP/mrc/mrc_model.zip',
                self.model_dir)
        data_zipfile = os.path.join(self.data_dir, 'mrc_data.zip')
        model_zipfile = os.path.join(self.model_dir, 'mrc_model.zip')
        makedirs(self.data_dir, exist_ok=True)
        makedirs(self.model_dir, exist_ok=True)
        download_unlicensed_file(
            'http://nervana-modelzoo.s3.amazonaws.com/NLP/mrc/',
            'mrc_data.zip', data_zipfile)
        download_unlicensed_file(
            'http://nervana-modelzoo.s3.amazonaws.com/NLP/mrc/',
            'mrc_model.zip', model_zipfile)
        with zipfile.ZipFile(data_zipfile, 'r') as data_zip_ref:
            data_zip_ref.extractall(self.data_dir)
        with zipfile.ZipFile(model_zipfile, 'r') as model_zip_ref:
            model_zip_ref.extractall(self.model_dir)
def load_data(self):
    """
    Fetch and extract the Facebook bAbI-dialog dataset if not already downloaded.

    Returns:
        tuple: training and test filenames are returned
    """
    if self.task < 5:
        self.candidate_answer_filename = "dialog-babi-candidates.txt"
        self.kb_filename = "dialog-babi-kb-all.txt"
        self.cands_mat_filename = "babi-cands-with-matchtype_{}.npy"
        self.vocab_filename = "dialog-babi-vocab-task{}".format(
            self.task + 1) + "_matchtype{}.pkl".format(self.use_match_type)
    else:
        self.candidate_answer_filename = "dialog-babi-task6-dstc2-candidates.txt"
        self.kb_filename = "dialog-babi-task6-dstc2-kb.txt"
        self.cands_mat_filename = "dstc2-cands-with-matchtype_{}.npy"
        self.vocab_filename = "dstc2-vocab-task{}_matchtype{}.pkl".format(
            self.task + 1, self.use_match_type)
    self.vectorized_filename = "vectorized_task{}.pkl".format(self.task + 1)

    self.data_dict = {}
    self.vocab = None
    self.workdir, filepath = valid_path_append(self.path, "", self.filename)
    if not os.path.exists(filepath):
        if license_prompt("bAbI-dialog",
                          "https://research.fb.com/downloads/babi/",
                          self.path) is False:
            sys.exit(0)
        download_unlicensed_file(self.url, self.filename, filepath, self.size)

    self.babi_dir_name = self.filename.split(".")[0]

    self.candidate_answer_filename = self.babi_dir_name + "/" + self.candidate_answer_filename
    self.kb_filename = self.babi_dir_name + "/" + self.kb_filename
    self.cands_mat_filename = os.path.join(
        self.workdir, self.babi_dir_name + "/" + self.cands_mat_filename)
    self.vocab_filename = self.babi_dir_name + "/" + self.vocab_filename
    self.vectorized_filename = self.babi_dir_name + "/" + self.vectorized_filename

    task_name = self.babi_dir_name + "/" + self.tasks[self.task] + "{}.txt"

    train_file = os.path.join(self.workdir, task_name.format("trn"))
    dev_file = os.path.join(self.workdir, task_name.format("dev"))
    test_file_postfix = "tst-OOV" if self.oov else "tst"
    test_file = os.path.join(self.workdir, task_name.format(test_file_postfix))

    cand_file = os.path.join(self.workdir, self.candidate_answer_filename)
    kb_file = os.path.join(self.workdir, self.kb_filename)
    vocab_file = os.path.join(self.workdir, self.vocab_filename)
    vectorized_file = os.path.join(self.workdir, self.vectorized_filename)

    if (os.path.exists(train_file) is False
            or os.path.exists(dev_file) is False
            or os.path.exists(test_file) is False
            or os.path.exists(cand_file) is False):
        with tarfile.open(filepath, "r:gz") as f:
            f.extractall(self.workdir)

    return train_file, dev_file, test_file, cand_file, kb_file, vocab_file, vectorized_file
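# Usage sketch for load_data: the seven returned paths unpack in order
# (`dataset` stands in for an instance of the owning class, which is not
# shown in this file):
#
#     (train_file, dev_file, test_file, cand_file,
#      kb_file, vocab_file, vectorized_file) = dataset.load_data()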
def download(url, filename, local_path):
    if not os.path.exists(local_path):
        download_unlicensed_file(url, filename, local_path)
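# Usage sketch for download(): fetch the chunker model only when the local
# copy is missing, reusing the chunker names defined alongside the loaders
# above (that they live in the same module is an assumption).
def _example_fetch_chunker():
    download(nlp_chunker_url, chunker_model_file,
             path.join(chunker_path, chunker_model_file))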
# Assumption: the truncated opening branch read gzipped corpora with the same
# encoding/error settings (requires `import gzip`).
if args.corpus.endswith('gz'):
    corpus_file = gzip.open(args.corpus, 'rt', encoding='utf8', errors='ignore')
else:
    corpus_file = open(args.corpus, 'r', encoding='utf8', errors='ignore')
with open(args.marked_corpus, 'w', encoding='utf8') as marked_corpus_file:
    # load spacy parser
    logger.info('loading spacy')
    if 'nlp_arch' in args.chunker:
        nlp = SpacyInstance(model='en_core_web_sm',
                            disable=['textcat', 'ner', 'parser']).parser
        nlp.add_pipe(nlp.create_pipe('sentencizer'), first=True)
        logger.info(
            'The pre-trained model to be downloaded for NLP Architect word'
            ' chunker model is licensed under Apache 2.0')
        _path_to_model = path.join(cur_dir, chunker_model_file)
        download_unlicensed_file(nlp_chunker_url, chunker_model_file, _path_to_model)
        _path_to_params = path.join(cur_dir, chunker_model_dat_file)
        download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file, _path_to_params)
        logger.info('Done.')
        nlp.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params), last=True)
    else:
        nlp = SpacyInstance(model='en_core_web_sm', disable=['textcat', 'ner']).parser
    logger.info('spacy loaded')
    num_lines = sum(1 for line in corpus_file)
    corpus_file.seek(0)
    logger.info('%i lines in corpus', num_lines)
    i = 0