Example #1
def __init__(self,
             train_processed_path,
             abbr_inventory_path,
             use_pretrain=False,
             use_softmax=False):
    """
    Initialize environment & model.
    """
    # Initialize processor and tokenizer
    self.pre_processor = TextProcessor(
        [white_space_remover_upmc, sub_deid_patterns_upmc])
    self.tokenizer = CoreNLPTokenizer()
    self.post_processor = TextProcessor(
        [AbbrDetector(abbr_inventory_path)])
    self.filter_processor = TextProcessor(
        [TextTokenFilter(), repeat_non_word_remover])
    # Load model
    train_path = train_processed_path + '/fasttext'
    if use_pretrain:
        model_path = train_path + '/model/pre_train'
    else:
        model_path = train_path + '/model'
    if use_softmax:
        model_file = model_path + '/all_softmax.bin'
    else:
        model_file = model_path + '/all.bin'
    self.model = load_model(model_file)
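The class that owns this __init__ is not shown in the excerpt, and the helper below is not part of the original code: it is a minimal, hypothetical sketch that isolates the same flag-to-file path logic so the effect of use_pretrain and use_softmax is easy to see.

def resolve_model_file(train_processed_path, use_pretrain=False, use_softmax=False):
    """Hypothetical helper mirroring the constructor's path logic above."""
    train_path = train_processed_path + '/fasttext'
    # use_pretrain selects the pre-trained model directory
    model_path = train_path + '/model/pre_train' if use_pretrain else train_path + '/model'
    # use_softmax selects the softmax-loss binary instead of the default one
    return model_path + ('/all_softmax.bin' if use_softmax else '/all.bin')

# For example (paths are placeholders):
# resolve_model_file('/data/train_processed', use_pretrain=True)
# -> '/data/train_processed/fasttext/model/pre_train/all.bin'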
Example #2
    all_sense_inventory = merge_inventories(train_sense_inventory, test_sense_inventory)
    all_sense_inventory_invalid = merge_inventories(train_sense_inventory_invalid, test_sense_inventory_invalid)

    # save sense inventory to json
    json_writer(train_sense_inventory, share_processed_path + "/train_sense_inventory.json")
    json_writer(test_sense_inventory, share_processed_path + "/test_sense_inventory.json")
    json_writer(all_sense_inventory, share_processed_path + "/all_sense_inventory.json")
    json_writer(all_sense_inventory_invalid, share_processed_path + "/all_sense_inventory_invalid.json")

    # Initialize processor and tokenizer
    processor = TextProcessor([
        white_space_remover,
        sub_deid_patterns_mimic])

    tokenizer = CoreNLPTokenizer()

    token_filter = TextTokenFilter()
    filter_processor = TextProcessor([
        token_filter,
        repeat_non_word_remover,
        recover_upper_cui])

    # pre-processing
    share_txt = processor.process_texts(share_txt_all_annotated, n_jobs=30)
    # tokenizing
    share_txt_tokenized = tokenizer.process_texts(share_txt, n_jobs=30)
    # Filter trivial tokens and remove repeated non-words
    share_txt_filtered = filter_processor.process_texts(share_txt_tokenized, n_jobs=30)
    # Write to file
    txt_writer(share_txt_filtered, share_processed_path + "/share_all_processed.txt")
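TextProcessor, CoreNLPTokenizer and the token filters come from the project's own preprocessing package and are not defined in this excerpt. As a rough illustration of the pipeline pattern used above (pre-process, tokenize, filter, write), the stand-in class below simply applies a list of functions to each text in order; the class name, the parallelism handling, and the demo functions are assumptions, not the project's actual implementation.

from multiprocessing import Pool

class SimpleTextProcessor:
    """Hypothetical stand-in for TextProcessor: apply each function in order."""

    def __init__(self, funcs):
        self.funcs = funcs

    def _apply(self, text):
        for func in self.funcs:
            text = func(text)
        return text

    def process_texts(self, texts, n_jobs=1):
        # Parallelize across texts when n_jobs > 1, otherwise run sequentially
        if n_jobs > 1:
            with Pool(n_jobs) as pool:
                return pool.map(self._apply, texts)
        return [self._apply(t) for t in texts]

# Demo with two toy cleanup functions (whitespace squeeze, lowercasing):
demo = SimpleTextProcessor([lambda t: ' '.join(t.split()), str.lower])
print(demo.process_texts(['  Pt  ADMITTED  with  CP  ']))  # ['pt admitted with cp']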