Example #1
import sys

from neuroner import neuromodel


def main(argv=sys.argv):
    ''' NeuroNER main method

    Args:
        parameters_filepath: the path to the parameters file
        output_folder: the path to the output folder
    '''
    arguments = parse_arguments(argv[1:])

    # fetch data and models from the package
    if arguments['fetch_data'] or arguments['fetch_trained_model']:

        if arguments['fetch_data']:
            neuromodel.fetch_data(arguments['fetch_data'])
        if arguments['fetch_trained_model']:
            neuromodel.fetch_model(arguments['fetch_trained_model'])

        msg = """When the fetch_data and fetch_trained_model arguments are specified, other
            arguments are ignored. Remove these arguments to train or apply a model."""
        print(msg)
        sys.exit(0)

    # create the model
    nn = neuromodel.NeuroNER(**arguments)
    nn.fit()
    nn.close()
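The parse_arguments helper is not shown in this example. Below is a minimal argparse-based sketch, assuming flag names that match the keys main() actually reads (fetch_data, fetch_trained_model) plus the two documented parameters; the real NeuroNER CLI defines more options.

# Hypothetical sketch of parse_arguments; flag names and defaults are
# assumptions that mirror only the keys main() reads.
import argparse

def parse_arguments(argv):
    parser = argparse.ArgumentParser(description='NeuroNER (sketch)')
    parser.add_argument('--fetch_data', default='',
                        help='name of a dataset to download, e.g. conll2003')
    parser.add_argument('--fetch_trained_model', default='',
                        help='name of a pretrained model to download')
    parser.add_argument('--parameters_filepath', default='./parameters.ini',
                        help='path to the parameters file')
    parser.add_argument('--output_folder', default='./output',
                        help='path to the output folder')
    return vars(parser.parse_args(argv))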
Example #2

import os
import shutil

from neuroner import neuromodel


def run_ner(issues_path, model_path, embedding_file):
    """ Use a pretrained LSTM model + GloVe embeddings for NER """
    #spacy.load('en')
    text_folder = issues_path
    out_folder = './output'
    ner_folder = os.path.join(out_folder, 'ner')
    # start from a clean NER folder; makedirs below recreates it
    if os.path.isdir(ner_folder):
        shutil.rmtree(ner_folder)
    predict_folder = os.path.join(ner_folder, 'deploy')
    os.makedirs(predict_folder)
    src_files = os.listdir(text_folder)
    print(src_files)
    for file_name in src_files:
        full_file_name = os.path.join(text_folder, file_name)
        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, predict_folder)
    nn = neuromodel.NeuroNER(
        train_model=False,
        use_pretrained_model=True,
        dataset_text_folder=ner_folder,
        pretrained_model_folder=model_path,
        token_pretrained_embedding_filepath=embedding_file,
        output_folder=ner_folder + '/annotation')
    nn.fit()
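A hypothetical invocation, assuming the standard NeuroNER locations for the pretrained CoNLL model and GloVe vectors:

# All three paths are assumptions; adjust to your local layout.
run_ner(issues_path='./issues_txt',
        model_path='./trained_models/conll_2003_en',
        embedding_file='./data/word_vectors/glove.6B.100d.txt')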
Example #3
    def test_ProvideOutputDir_CorrectlyOutputsToDir(self):
        """
        Sanity test to check if all proper model output files are created in the output folder
        """
        nn = neuromodel.NeuroNER(output_folder=self.outputFolder,
                                 parameters_filepath=self.test_param_file)
        nn.fit()

        # find the newest run directory, from:
        # http://stackoverflow.com/questions/2014554/find-the-newest-folder-in-a-directory-in-python
        run_outputdir = max(
            (os.path.join(self.outputFolder, d)
             for d in os.listdir(self.outputFolder)),
            key=os.path.getmtime)

        # assert the model has been written to files
        expected_files = [
            'checkpoint',
            'dataset.pickle',
            'model_00001.ckpt.data-00000-of-00001',
            'model_00001.ckpt.index',
            'model_00001.ckpt.meta',
            'model_00002.ckpt.data-00000-of-00001',
            'model_00002.ckpt.index',
            'model_00002.ckpt.meta',
            'parameters.ini',
            'projector_config.pbtxt',
            'tensorboard_metadata_characters.tsv',
            'tensorboard_metadata_tokens.tsv',
        ]
        for file_name in expected_files:
            self.assertTrue(
                os.path.isfile(
                    os.path.join(run_outputdir, 'model', file_name)))
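The test body relies on self.outputFolder and self.test_param_file being set up elsewhere. A plausible fixture sketch follows; the class name and the parameters file path are assumptions, and the file should configure a short two-epoch run, matching the model_00001/model_00002 checkpoints asserted above.

# Hypothetical fixture for the test above; attribute names are taken from
# the test body, everything else is an assumption.
import os
import shutil
import tempfile
import unittest

class TestNeuroNEROutput(unittest.TestCase):
    def setUp(self):
        # fresh output folder for each test run
        self.outputFolder = tempfile.mkdtemp()
        # hypothetical path to a parameters file tuned for a tiny 2-epoch run
        self.test_param_file = os.path.join('tests', 'test-parameters.ini')

    def tearDown(self):
        shutil.rmtree(self.outputFolder, ignore_errors=True)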
Example #4
def entity_detect(sentence):
    # print("Building model")
    with HiddenPrints():
        neuromodel.fetch_data(dataset)
        neuromodel.fetch_model(model)
        nn = neuromodel.NeuroNER(train_model=False, use_pretrained_model=True)

    # print("predicting")
    entities = nn.predict(sentence)
    return entities
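HiddenPrints is a helper defined elsewhere; a common implementation silences stdout inside a with block. A sketch (the class body is an assumption), followed by a sample call:

# Hypothetical HiddenPrints: suppress stdout while the block runs.
import os
import sys

class HiddenPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout

# nn.predict returns a list of dicts; 'text', 'start', and 'end' are the
# keys the other snippets in this listing read from each entry.
for ent in entity_detect('Barack Obama visited Paris in 2015.'):
    print(ent['text'], ent['start'], ent['end'])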
Example #5
def real_main():
  if FLAGS.dataset_text_folder:
    # map the embedding dimension to the matching GloVe corpus size:
    # 100d -> glove.6B, 300d -> glove.840B
    glove_corpus = {'100': '6', '300': '840'}[FLAGS.token_embedding_dimension]
    model_flags = {
        'dataset_text_folder': FLAGS.dataset_text_folder,
        'output_folder': FLAGS.output_folder,
        'train_model': FLAGS.train and not FLAGS.eval,
        'use_pretrained_model': not FLAGS.train and FLAGS.eval,
        'pretrained_model_folder':
            FLAGS.pretrained_model_folder and
            os.path.join('./trained_models', FLAGS.pretrained_model_folder),
        'recall_inference_bias': FLAGS.recall_inference_bias,
        'token_pretrained_embedding_filepath':
            './data/word_vectors/glove.{}B.{}d.txt'.format(
                glove_corpus, FLAGS.token_embedding_dimension),
        'token_embedding_dimension': int(FLAGS.token_embedding_dimension),
        # the LSTM hidden state is sized to match the embedding dimension
        'token_lstm_hidden_state_dimension':
            int(FLAGS.token_embedding_dimension),
        'number_of_cpu_threads': FLAGS.threads_tf,
        'number_of_cpu_threads_prediction': FLAGS.threads_prediction,
    }
    # drop unset flags so NeuroNER falls back to its own defaults
    model_flags = {k: v for k, v in model_flags.items() if v is not None}
  else:
    model_flags = {}
  nn = neuromodel.NeuroNER(**model_flags)
  if FLAGS.fit:
    nn.fit()
  if FLAGS.shell:
    IPython.start_ipython(argv=[], user_ns=dict(globals(), **locals()))
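The FLAGS object is defined elsewhere in the module. Below is a sketch of absl-style flag definitions consistent with the names real_main() reads; the defaults and help strings are assumptions.

# Hypothetical flag definitions matching the names read in real_main().
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_string('dataset_text_folder', None, 'Input dataset folder.')
flags.DEFINE_string('output_folder', './output', 'Where results are written.')
flags.DEFINE_boolean('train', False, 'Train a new model.')
flags.DEFINE_boolean('eval', False, 'Run inference with a pretrained model.')
flags.DEFINE_string('pretrained_model_folder', None,
                    'Folder name under ./trained_models.')
flags.DEFINE_float('recall_inference_bias', None,
                   'Bias inference toward recall.')
flags.DEFINE_string('token_embedding_dimension', '100',
                    'GloVe dimension: 100 (6B corpus) or 300 (840B corpus).')
flags.DEFINE_integer('threads_tf', None, 'TensorFlow CPU threads.')
flags.DEFINE_integer('threads_prediction', None, 'CPU threads for prediction.')
flags.DEFINE_boolean('fit', True, 'Call nn.fit().')
flags.DEFINE_boolean('shell', False, 'Drop into an IPython shell afterwards.')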
Example #6
import pandas as pd

from neuroner import neuromodel


def extract_by_neuroner(file_path):
    nn = neuromodel.NeuroNER(train_model=False, use_pretrained_model=True)
    df = pd.read_pickle(file_path)
    neuroner_infos = []
    txt_without_ners = []
    for preprocessed_text in df['tokenized_body']:
        ner_list = nn.predict(preprocessed_text)
        neuroner_result = [None] * len(ner_list)
        txt = ''
        oldend = 0
        for i, ner_info in enumerate(ner_list):
            start = ner_info['start']
            txt += preprocessed_text[oldend:start]
            oldend = ner_info['end']
            neuroner_result[i] = ner_info['text']
        txt += preprocessed_text[oldend:]
        neuroner_infos.append(neuroner_result)
        txt_without_ners.append(txt)
        # txt is the text with the NER spans removed; neuroner_result
        # holds the list of NER tokens
    df['neuroner_list'] = neuroner_infos
    df['neuroner_body'] = txt_without_ners
    df.to_pickle('neuroner_result.pkl')
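To make the span-stitching loop above concrete, here is a small worked example with a hypothetical prediction result:

# The loop copies the text between entity spans, skipping each [start, end)
# range, so the entities are cut out of the output string.
text = 'John flew to New York.'
spans = [{'start': 0, 'end': 4, 'text': 'John'},
         {'start': 13, 'end': 21, 'text': 'New York'}]
out, oldend = '', 0
for span in spans:
    out += text[oldend:span['start']]
    oldend = span['end']
out += text[oldend:]
print(out)  # ' flew to .'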
Example #7

from nltk import ne_chunk, pos_tag, word_tokenize


def extract_by_neuroner(file_path):
    nn = neuromodel.NeuroNER(train_model=False, use_pretrained_model=True)
    df = pd.read_pickle(file_path)
    neuroner_infos = []
    txt_without_ners = []
    for preprocessed_text in df[' body']:
        try:
            ner_list = nn.predict(preprocessed_text)
        except FileNotFoundError:
            # fall back to NLTK chunking when the NeuroNER model is missing;
            # pos_tag needs a token list, so tokenize the raw text first
            res = extract_ner(ne_chunk(pos_tag(word_tokenize(preprocessed_text)),
                                       binary=True))
            word_list = res['single_word']
            multi_word = res['multi_word']
            for m_w in multi_word:
                w = " ".join(m_w)
                word_list.append(w)
            neuroner_infos.append(word_list)
            tt = preprocessed_text
            for ner in word_list:
                tt = tt.replace(ner, '')
            txt_without_ners.append(tt)
            continue
        neuroner_result = [None] * len(ner_list)
        txt = ''
        oldend = 0
        for i, ner_info in enumerate(ner_list):
            start = ner_info['start']
            txt += preprocessed_text[oldend:start]
            oldend = ner_info['end']
            neuroner_result[i] = ner_info['text']
        txt += preprocessed_text[oldend:]
        neuroner_infos.append(neuroner_result)
        txt_without_ners.append(txt)
        # txt is the text with the NER spans removed; neuroner_result
        # holds the list of NER tokens
    df['neuroner_list'] = neuroner_infos
    df['neuroner_body'] = txt_without_ners
    df['neuroner_tokenized'] = tokenize(txt_without_ners)
    df.to_pickle('../pickle/neuroner_result.pkl')
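The extract_ner fallback helper is not shown. The sketch below matches how it is used above; the 'single_word'/'multi_word' keys and the ne_chunk(binary=True) input are taken from the call site, the body itself is an assumption.

# Hypothetical sketch of extract_ner: walk the NLTK chunk tree and split
# named entities into single-word strings and multi-word token lists.
from nltk import Tree

def extract_ner(tree):
    result = {'single_word': [], 'multi_word': []}
    for node in tree:
        if isinstance(node, Tree):  # an NE subtree from ne_chunk(binary=True)
            words = [token for token, tag in node.leaves()]
            if len(words) == 1:
                result['single_word'].append(words[0])
            else:
                result['multi_word'].append(words)
    return result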
Example #8
import json

from neuroner import neuromodel

dataset = 'conll2003'  # assumed value, matching the conll_2003_en model below

# 'conll2003'
# 'example_unannotated_texts'
# 'i2b2_2014_deid'

model = 'conll_2003_en'

# 'conll_2003_en'
# 'i2b2_2014_glove_spacy_bioes'
# 'i2b2_2014_glove_stanford_bioes'
# 'mimic_glove_spacy_bioes'
# 'mimic_glove_stanford_bioes'
print("Building model")
with HiddenPrints():
    neuromodel.fetch_data(dataset)
    neuromodel.fetch_model(model)
    nn = neuromodel.NeuroNER(train_model=False, use_pretrained_model=True)


def entity_detect(sentence):
    print("predicting")
    with HiddenPrints():
        entities = [ent['text'] for ent in nn.predict(sentence)]
    return entities


if __name__ == '__main__':
    with open("parsed_json_93.json", 'r') as load_f:
        rec_text = json.load(load_f)
    "spacylanguage": "vi_spacy_model",
    "tagging_format": "bioes",
    "token_embedding_dimension": 300,
    "token_lstm_hidden_state_dimension": 300,
    "token_pretrained_embedding_filepath":
    "../data/word_vectors/glove.6B.100d.txt",
    "tokenizer": "spacy",
    "train_model": 0,
    "use_character_lstm": 1,
    "use_crf": 1,
    "use_pretrained_model": 1,
    "verbose": 0
}

from neuroner import neuromodel
nn = neuromodel.NeuroNER(**arguments)
# nn.load("model.ckpt")
# nn.predict(text="Xin chào Bách Khoa lúc 11 giờ trưa")  # this feature is not available

# restored_graph = tf.Graph()
# with restored_graph.as_default():
#     with tf.Session(graph=restored_graph) as sess:
#         tf.saved_model.loader.load(
#             sess,
#             [tag_constants.SERVING],
#             'path/to/your/location/',
#         )
#         batch_size_placeholder = restored_graph.get_tensor_by_name('batch_size_placeholder:0')
#         features_placeholder = restored_graph.get_tensor_by_name('features_placeholder:0')
#         labels_placeholder = restored_graph.get_tensor_by_name('labels_placeholder:0')
#         prediction = restored_graph.get_tensor_by_name('dense/BiasAdd:0')
Example #10
#nn = neuromodel.NeuroNER(train_model=False, use_pretrained_model=True, \
#                         dataset_text_folder='./data/example_unannotated_texts', \
#                         pretrained_model_folder='./trained_models/conll_2003_en')
#
#nn.fit()
#nn.close()

import datetime
import os
import shutil

import tensorflow as tf
from neuroner import neuromodel

dir_list = ['../20news_data/20news_raw', '../reuters_data/reuters_raw']
for data_folder in dir_list:
    starttime = datetime.datetime.now()
    tf.reset_default_graph()
    print('*************************start', data_folder, '*******************************')
    output_folder = './output'
    # dataset_text_folder is the input folder; it must contain a 'deploy'
    # subfolder holding the input text files (multiple files are allowed)
    nn = neuromodel.NeuroNER(train_model=False, use_pretrained_model=True,
                             dataset_text_folder=data_folder,
                             pretrained_model_folder='./trained_models/conll_2003_en',
                             output_folder=output_folder)
    nn.fit()
    nn.close()
    
    # the intermediate files take up too much disk space, so delete the extras
    dir_name = data_folder.split('/')[-1]
    output_data_folder = os.path.join(output_folder, dir_name)
    file_origin_path = os.path.join(output_data_folder, '000_deploy.txt')
    # output file path
    file_new_path = os.path.join(output_folder, dir_name + ".txt")
    shutil.move(file_origin_path, file_new_path)
    shutil.rmtree(output_data_folder)
    endtime = datetime.datetime.now()
    print('Total running time:', (endtime - starttime).seconds, 'seconds.')
    print('*************************end', data_folder, '*******************************')
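As the comment in the loop above notes, each input folder needs a 'deploy' subfolder containing the text files. A minimal sketch that stages raw .txt files into that layout; the function name and paths are assumptions.

# Hypothetical helper that stages raw .txt files into the deploy/ subfolder
# NeuroNER expects for unannotated prediction input.
import os
import shutil

def stage_deploy_folder(raw_txt_folder, data_folder):
    deploy = os.path.join(data_folder, 'deploy')
    os.makedirs(deploy, exist_ok=True)
    for name in os.listdir(raw_txt_folder):
        if name.endswith('.txt'):
            shutil.copy(os.path.join(raw_txt_folder, name), deploy)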