def main(argv=None):
    """NeuroNER main entry point.

    Args:
        argv: full argument vector (``argv[0]`` is the program name).
            Defaults to ``sys.argv``, resolved at call time.
    """
    # FIX: the original signature was `main(argv=sys.argv)`, which binds
    # sys.argv once at definition (import) time. Resolve it at call time
    # instead so later changes to sys.argv are honored.
    if argv is None:
        argv = sys.argv
    arguments = parse_arguments(argv[1:])

    # Fetch data and models from the package. These "fetch" modes are
    # exclusive: once handled, the process exits without training/applying.
    if arguments['fetch_data'] or arguments['fetch_trained_model']:
        if arguments['fetch_data']:
            neuromodel.fetch_data(arguments['fetch_data'])
        if arguments['fetch_trained_model']:
            neuromodel.fetch_model(arguments['fetch_trained_model'])
        msg = """When the fetch_data and fetch_trained_model arguments are specified, other arguments are ignored. Remove these arguments to train or apply a model."""
        print(msg)
        sys.exit(0)

    # Create, train/apply, and tear down the model.
    nn = neuromodel.NeuroNER(**arguments)
    nn.fit()
    nn.close()
def run_ner(issues_path, model_path, embedding_file):
    """Run NER over plain-text files using a pretrained LSTM model + GloVe.

    Args:
        issues_path: folder containing the text files to tag.
        model_path: folder holding the pretrained NeuroNER model.
        embedding_file: path to the pretrained token embedding file (GloVe).
    """
    out_folder = './output'
    ner_folder = os.path.join(out_folder, 'ner')

    # FIX: the original only created ner_folder when it did NOT exist and,
    # after rmtree, relied on makedirs(predict_folder) to recreate the parent
    # implicitly. Make the intent explicit: always start from a clean tree.
    if os.path.isdir(ner_folder):
        shutil.rmtree(ner_folder)
    # NeuroNER expects the files to tag inside a 'deploy' subfolder;
    # makedirs creates ner_folder and deploy in one call.
    predict_folder = os.path.join(ner_folder, 'deploy')
    os.makedirs(predict_folder)

    # Copy every regular file from the input folder into deploy/.
    src_files = os.listdir(issues_path)
    print(src_files)
    for file_name in src_files:
        full_file_name = os.path.join(issues_path, file_name)
        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, predict_folder)

    nn = neuromodel.NeuroNER(
        train_model=False,
        use_pretrained_model=True,
        dataset_text_folder=ner_folder,
        pretrained_model_folder=model_path,
        token_pretrained_embedding_filepath=embedding_file,
        output_folder=os.path.join(ner_folder, 'annotation'))
    nn.fit()
def test_ProvideOutputDir_CorrectlyOutputsToDir(self):
    """Sanity test: fitting a model writes all expected artifact files
    into the run's output folder."""
    nn = neuromodel.NeuroNER(
        output_folder=self.outputFolder,
        parameters_filepath=self.test_param_file)
    nn.fit()

    # Find the newest run dir, from:
    # http://stackoverflow.com/questions/2014554/find-the-newest-folder-in-a-directory-in-python
    run_outputdir = max(
        [os.path.join(self.outputFolder, d)
         for d in os.listdir(self.outputFolder)],
        key=os.path.getmtime)

    # FIX: the original repeated the same assertTrue(os.path.isfile(...))
    # call 12 times; assert the full artifact list in one loop instead.
    expected_model_files = [
        'checkpoint',
        'dataset.pickle',
        'model_00001.ckpt.data-00000-of-00001',
        'model_00001.ckpt.index',
        'model_00001.ckpt.meta',
        'model_00002.ckpt.data-00000-of-00001',
        'model_00002.ckpt.index',
        'model_00002.ckpt.meta',
        'parameters.ini',
        'projector_config.pbtxt',
        'tensorboard_metadata_characters.tsv',
        'tensorboard_metadata_tokens.tsv',
    ]
    for file_name in expected_model_files:
        self.assertTrue(
            os.path.isfile(
                os.path.join(run_outputdir, 'model', file_name)))
def entity_detect(sentence):
    """Detect named entities in *sentence* with a pretrained NeuroNER model.

    Returns the raw prediction list produced by ``NeuroNER.predict``.
    """
    # HiddenPrints suppresses NeuroNER's console chatter while the data and
    # model are fetched, the model is built, and prediction runs.
    with HiddenPrints():
        neuromodel.fetch_data(dataset)
        neuromodel.fetch_model(model)
        ner = neuromodel.NeuroNER(train_model=False, use_pretrained_model=True)
        predictions = ner.predict(sentence)
    return predictions
def _collect_model_flags():
    """Translate command-line FLAGS into NeuroNER constructor kwargs.

    Flags left unset (None) are dropped so NeuroNER falls back to its own
    defaults. Only valid when FLAGS.dataset_text_folder is set.
    """
    dim = FLAGS.token_embedding_dimension
    # GloVe file naming convention: the 100-d vectors come from the 6B-token
    # corpus, the 300-d vectors from the 840B-token corpus.
    glove_path = './data/word_vectors/glove.{dim_length}B.{dim_width}d.txt'.format(
        dim_length={'100': '6', '300': '840'}[dim],
        dim_width=dim)
    candidate = {
        'dataset_text_folder': FLAGS.dataset_text_folder,
        'output_folder': FLAGS.output_folder,
        # Train and eval are mutually exclusive modes.
        'train_model': FLAGS.train and not FLAGS.eval,
        'use_pretrained_model': not FLAGS.train and FLAGS.eval,
        'pretrained_model_folder':
            FLAGS.pretrained_model_folder and
            os.path.join('./trained_models', FLAGS.pretrained_model_folder),
        'recall_inference_bias': FLAGS.recall_inference_bias,
        'token_pretrained_embedding_filepath': glove_path,
        'token_embedding_dimension': int(dim),
        'token_lstm_hidden_state_dimension': int(dim),
        'number_of_cpu_threads': FLAGS.threads_tf,
        'number_of_cpu_threads_prediction': FLAGS.threads_prediction,
    }
    return {key: value for key, value in candidate.items() if value is not None}


def real_main():
    """Build a NeuroNER model from FLAGS, then optionally fit it and/or
    drop into an interactive IPython shell."""
    if FLAGS.dataset_text_folder:
        model_flags = _collect_model_flags()
    else:
        model_flags = {}
    nn = neuromodel.NeuroNER(**model_flags)
    if FLAGS.fit:
        nn.fit()
    if FLAGS.shell:
        # Expose both module globals and this function's locals to the shell.
        IPython.start_ipython(argv=[], user_ns=dict(globals(), **locals()))
def extract_by_neuroner(file_path):
    """Run NeuroNER over the 'tokenized_body' column of a pickled DataFrame.

    Side effects: writes 'neuroner_result.pkl' with two added columns:
      - 'neuroner_list': the entity surface texts found in each document
      - 'neuroner_body': the document text with the entity spans removed

    Args:
        file_path: path to a pickled pandas DataFrame with a
            'tokenized_body' column of strings.
    """
    nn = neuromodel.NeuroNER(train_model=False, use_pretrained_model=True)
    df = pd.read_pickle(file_path)

    neuroner_infos = []
    txt_without_ners = []
    for preprocessed_text in df['tokenized_body']:
        # BUG FIX: the original allocated `[None] * len(ner_list)` BEFORE
        # `ner_list` was assigned, raising NameError on the first iteration
        # (the sibling implementation does this in the correct order).
        ner_list = nn.predict(preprocessed_text)
        neuroner_result = [None] * len(ner_list)

        # Stitch together the text between entity spans, collecting the
        # entity texts as we go.
        txt = ''
        oldend = 0
        for i, ner_info in enumerate(ner_list):
            start = ner_info['start']
            txt += preprocessed_text[oldend:start]
            oldend = ner_info['end']
            neuroner_result[i] = ner_info['text']
        txt += preprocessed_text[oldend:]

        neuroner_infos.append(neuroner_result)
        txt_without_ners.append(txt)

    # make txt with NER text removed, and return list of ner tokens.
    df['neuroner_list'] = neuroner_infos
    df['neuroner_body'] = txt_without_ners
    df.to_pickle('neuroner_result.pkl')
def extract_by_neuroner(file_path):
    """Tag every document in the pickled DataFrame's ' body' column with NER.

    Uses a pretrained NeuroNER model; if the model files are missing
    (FileNotFoundError), falls back to NLTK's ne_chunk tagger for that
    document. Adds 'neuroner_list', 'neuroner_body' and 'neuroner_tokenized'
    columns and pickles the result to '../pickle/neuroner_result.pkl'.
    """
    nn = neuromodel.NeuroNER(train_model=False, use_pretrained_model=True)
    df = pd.read_pickle(file_path)

    entity_lists = []
    stripped_texts = []
    # NOTE(review): the column name ' body' (leading space) looks odd but is
    # preserved exactly — confirm against whatever produced this DataFrame.
    for doc in df[' body']:
        try:
            predictions = nn.predict(doc)
        except FileNotFoundError:
            # Fallback path: NLTK chunker when NeuroNER model files are absent.
            chunked = extract_ner(ne_chunk(pos_tag(doc), binary=True))
            names = chunked['single_word']
            for parts in chunked['multi_word']:
                names.append(" ".join(parts))
            entity_lists.append(names)
            remaining = doc
            for name in names:
                remaining = remaining.replace(name, '')
            stripped_texts.append(remaining)
            continue

        # Rebuild the document text without the entity spans while
        # collecting each entity's surface text.
        found = [None] * len(predictions)
        rebuilt = ''
        cursor = 0
        for idx, info in enumerate(predictions):
            rebuilt += doc[cursor:info['start']]
            cursor = info['end']
            found[idx] = info['text']
        rebuilt += doc[cursor:]

        entity_lists.append(found)
        stripped_texts.append(rebuilt)

    # make txt with NER text removed, and return list of ner tokens.
    df['neuroner_list'] = entity_lists
    df['neuroner_body'] = stripped_texts
    df['neuroner_tokenized'] = tokenize(stripped_texts)
    df.to_pickle('../pickle/neuroner_result.pkl')
# 'conll2003' # 'example_unannotated_texts' # 'i2b2_2014_deid' model = 'conll_2003_en' # 'conll_2003_en' # 'i2b2_2014_glove_spacy_bioes' # 'i2b2_2014_glove_stanford_bioes' # 'mimic_glove_spacy_bioes' # 'mimic_glove_stanford_bioes' print("Building model") with HiddenPrints(): neuromodel.fetch_data(dataset) neuromodel.fetch_model(model) nn = neuromodel.NeuroNER(train_model=False, use_pretrained_model=True) def entity_detect(sentence): print("predicting") with HiddenPrints(): entity = nn.predict(sentence) entities = [] for i in range(len(entity)): entities.append(entity[i]['text']) return entities if __name__ == '__main__': with open("parsed_json_93.json", 'r') as load_f: rec_text = json.load(load_f)
"spacylanguage": "vi_spacy_model", "tagging_format": "bioes", "token_embedding_dimension": 300, "token_lstm_hidden_state_dimension": 300, "token_pretrained_embedding_filepath": "../data/word_vectors/glove.6B.100d.txt", "tokenizer": "spacy", "train_model": 0, "use_character_lstm": 1, "use_crf": 1, "use_pretrained_model": 1, "verbose": 0 } from neuroner import neuromodel nn = neuromodel.NeuroNER(**arguments) # nn.load("model.ckpt") # nn.predict(text="Xin chào Bách Khoa lúc 11 giờ trưa") tính năng này k có # graph = tf.Graph() # with restored_graph.as_default(): # with tf.Session() as sess: # tf.saved_model.loader.load( # sess, # [tag_constants.SERVING], # 'path/to/your/location/', # ) # batch_size_placeholder = graph.get_tensor_by_name('batch_size_placeholder:0') # features_placeholder = graph.get_tensor_by_name('features_placeholder:0') # labels_placeholder = graph.get_tensor_by_name('labels_placeholder:0') # prediction = restored_graph.get_tensor_by_name('dense/BiasAdd:0')
#nn = neuromodel.NeuroNER(train_model=False, use_pretrained_model=True, \ # dataset_text_folder='./data/example_unannotated_texts', \ # pretrained_model_folder='./trained_models/conll_2003_en') # #nn.fit() #nn.close() dir_list = ['../20news_data/20news_raw','../reuters_data/reuters_raw'] for data_folder in dir_list: starttime = datetime.datetime.now() tf.reset_default_graph() print('*************************start', data_folder, '*******************************') output_folder = './output' # dataset_text_folder参数为输入文件夹,必须在该文件夹下新建“deploy”文件夹,在deploy文件夹下放入输入文档的的文本文件,可以放多个文件 nn = neuromodel.NeuroNER(train_model=False, use_pretrained_model=True, \ dataset_text_folder=data_folder,\ pretrained_model_folder='./trained_models/conll_2003_en',\ output_folder=output_folder) nn.fit() nn.close() # 由于中间文件占据存储空间过大,删除多余的中间文件 dir_name = data_folder.split('/')[-1] output_data_folder = os.path.join(output_folder, dir_name) file_origin_path = os.path.join(output_data_folder,'000_deploy.txt') # 输出文件路径 file_new_path = os.path.join(output_folder, dir_name+".txt") shutil.move(file_origin_path, file_new_path) shutil.rmtree(output_data_folder) endtime = datetime.datetime.now() print('Totol running for ', (endtime - starttime).seconds, ' seconds.') print('*************************end', data_folder, '*******************************')