Example #1
def generate_test_files(txt_path, test_processed_path):
    os.makedirs(test_processed_path, exist_ok=True)
    # Find abbrs, build abbr index
    print("Loading Test data...")
    test_collector = AbbrInstanceCollector(txt_path)
    abbr_index, test_no_mark = test_collector.generate_inverted_index()
    # save files
    txt_writer(test_no_mark, test_processed_path + '/test_no_mark.txt')
    abbr_index.save(test_processed_path + '/abbr_index_data.pkl')
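
A minimal usage sketch for the helper above, assuming the project utilities it relies on (os, AbbrInstanceCollector, txt_writer) are already imported; both paths are hypothetical placeholders:

# Hypothetical paths; point these at your own processed corpus and output folder.
TEST_TXT_PATH = '/data/wsd/share/test_processed.txt'
TEST_OUT_DIR = '/data/wsd/share/test'

generate_test_files(TEST_TXT_PATH, TEST_OUT_DIR)
# TEST_OUT_DIR should now contain test_no_mark.txt and abbr_index_data.pkl.
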
Example #2
def generate_train_files(txt_path, train_processed_path):
    os.makedirs(train_processed_path, exist_ok=True)
    # Find abbrs, build abbr index
    print("Loading TRAIN data...")
    train_collector = AbbrInstanceCollector(txt_path)
    abbr_index, train_no_mark = train_collector.generate_inverted_index()
    # save files
    txt_writer(train_no_mark, train_processed_path + '/train_no_mark.txt')
    abbr_index.save(train_processed_path + '/abbr_index_data.pkl')

    print("Training Word2Vec...")
    model = gensim.models.Word2Vec(Corpus(train_no_mark), workers=30, min_count=1)
    model.save(train_processed_path + '/train.model')
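
Once train.model has been written, it can be reloaded with gensim's standard Word2Vec.load API; a short sketch with a hypothetical path and query token:

import gensim

# Hypothetical location of the model saved by generate_train_files above.
model = gensim.models.Word2Vec.load('/data/wsd/train/processed/train.model')
# Inspect the learned embeddings (assumes 'pt' is in the training vocabulary).
vector = model.wv['pt']
print(model.wv.most_similar('pt', topn=5))
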
Example #3
def abbr_job(abbr, abbr_index, abbr_idx_mapper, docs, content_dir,
             window_size):
    corpus = AbbrCorpus(abbr, abbr_index, docs, window_size=window_size)
    corpus_content = corpus.content_generator()

    dataset = []
    for global_instance_idx, doc_id, pos, content_pos, content, label in corpus_content:
        content.insert(content_pos, abbr)
        content = " ".join(content)
        dataset.append("__label__{} {}".format(label, content))

    # save the labelled instances to a text file named by this abbr's index
    index = abbr_idx_mapper['abbr2idx'][abbr]
    txt_writer(dataset, content_dir + '%d.txt' % index)
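
A hedged sketch of how abbr_job might be dispatched over a whole abbreviation inventory. It assumes abbr_index and docs come from an AbbrInstanceCollector as in Example #1, that abbr_index iterates over abbreviation strings, and that abbr_idx_mapper is a plain dict with an 'abbr2idx' mapping (as the lookup inside abbr_job implies); the output directory is hypothetical:

# Assumed shape: one integer id per abbreviation, matching abbr_job's lookup.
abbr_idx_mapper = {'abbr2idx': {abbr: idx for idx, abbr in enumerate(sorted(abbr_index))}}

for abbr in abbr_idx_mapper['abbr2idx']:
    abbr_job(abbr, abbr_index, abbr_idx_mapper, docs,
             content_dir='fasttext_dataset/', window_size=5)
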
Example #4
def process_annotated_data(txt_preprocessed_path, upmc_processed_path, train_ratio=0.8, n_jobs=30):
    os.makedirs(upmc_processed_path, exist_ok=True)
    upmc_txt_annotated = txt_reader(txt_preprocessed_path)
    # pre-processing
    upmc_txt = all_processor.process_texts(upmc_txt_annotated, n_jobs=n_jobs)
    # train/test split (a train_ratio fraction of instances goes to the train set)
    random.shuffle(upmc_txt)
    num_instances = len(upmc_txt)
    train_idx = set(random.sample(range(num_instances), int(train_ratio*num_instances)))
    upmc_train_txt = []
    upmc_test_txt = []
    for idx, txt in enumerate(tqdm.tqdm(upmc_txt)):
        if idx in train_idx:
            upmc_train_txt.append(txt)
        else:
            upmc_test_txt.append(txt)
    # Write to file
    txt_writer(upmc_train_txt, upmc_processed_path+"/upmc_train.txt")
    txt_writer(upmc_test_txt, upmc_processed_path+"/upmc_test.txt")
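
A minimal call sketch for process_annotated_data; the paths are hypothetical, and seeding random beforehand is only a suggestion to make the shuffle/sample split reproducible:

import random

random.seed(42)  # optional: reproducible train/test split
process_annotated_data(
    txt_preprocessed_path='/data/wsd/upmc/upmc_annotated.txt',  # hypothetical input file
    upmc_processed_path='/data/wsd/upmc/processed',             # hypothetical output folder
    train_ratio=0.8,
    n_jobs=8,
)
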
Example #5
    all_sense_inventory = merge_inventories(train_sense_inventory, test_sense_inventory)
    all_sense_inventory_invalid = merge_inventories(train_sense_inventory_invalid, test_sense_inventory_invalid)

    # save sense inventory to json
    json_writer(train_sense_inventory, share_processed_path + "/train_sense_inventory.json")
    json_writer(test_sense_inventory, share_processed_path + "/test_sense_inventory.json")
    json_writer(all_sense_inventory, share_processed_path + "/all_sense_inventory.json")
    json_writer(all_sense_inventory_invalid, share_processed_path + "/all_sense_inventory_invalid.json")

    # Initialize processor and tokenizer
    processor = TextProcessor([
        white_space_remover,
        sub_deid_patterns_mimic])

    toknizer = CoreNLPTokenizer()

    token_filter = TextTokenFilter()
    filter_processor = TextProcessor([
        token_filter,
        repeat_non_word_remover,
        recover_upper_cui])

    # pre-processing
    share_txt = processor.process_texts(share_txt_all_annotated, n_jobs=30)
    # tokenizing
    share_txt_tokenized = toknizer.process_texts(share_txt, n_jobs=30)
    # Filter trivial tokens and Remove repeat non-words
    share_txt_filtered = filter_processor.process_texts(share_txt_tokenized, n_jobs=30)
    # Write to file
    txt_writer(share_txt_filtered, share_processed_path+"/share_all_processed.txt")
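
The clean → tokenize → filter sequence above recurs in most of the later examples; a condensed sketch of that pattern as a reusable helper, assuming the same project components (TextProcessor, CoreNLPTokenizer, TextTokenFilter and the filter functions) are importable:

def run_standard_pipeline(texts, n_jobs=30):
    # Stage 1: basic cleanup (only whitespace here; add a corpus-specific de-id step as needed).
    cleaner = TextProcessor([white_space_remover])
    # Stage 2: tokenization via the CoreNLP wrapper.
    tokenizer = CoreNLPTokenizer()
    # Stage 3: drop trivial tokens, collapse repeated non-words, restore upper-cased CUIs.
    token_filter = TextProcessor([TextTokenFilter(), repeat_non_word_remover, recover_upper_cui])

    texts = cleaner.process_texts(texts, n_jobs=n_jobs)
    texts = tokenizer.process_texts(texts, n_jobs=n_jobs)
    return token_filter.process_texts(texts, n_jobs=n_jobs)
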
Example #6
                UMN_sense_cui_inventory[abbr][long_form] = None
    json_writer(UMN_sense_cui_inventory,
                umn_processed_path + "/UMN_sense_cui_inventory.json")

    #############################
    # Process UMN documents
    #############################

    umn_txt_marked = add_abbr_marker_umn(umn_txt)

    # Initialize processor and tokenizer
    processor = TextProcessor([white_space_remover, sub_deid_patterns_umn])

    toknizer = CoreNLPTokenizer()
    token_filter = TextTokenFilter()
    filter_processor = TextProcessor(
        [token_filter, repeat_non_word_remover, recover_upper_cui])

    # pre-processing
    umn_txt = processor.process_texts(umn_txt_marked, n_jobs=30)
    # tokenizing
    umn_txt_tokenized = toknizer.process_texts(umn_txt, n_jobs=30)
    # add real annotations
    umn_txt_annotated = add_annotation_umn(UMN_sense_cui_inventory,
                                           umn_txt_tokenized)
    # Filter trivial tokens and Remove repeat non-words
    umn_txt_filtered = filter_processor.process_texts(umn_txt_annotated,
                                                      n_jobs=30)
    # Write to file
    txt_writer(umn_txt_filtered, umn_processed_path + "/umn_processed.txt")
Example #7
    txt_list_processed_sorted = sorted(txt_list_processed, key=operator.itemgetter(0))
    return [txt for _, txt in txt_list_processed_sorted]


if __name__ == '__main__':

    ######################################
    # Read texts from dataset
    ######################################
    # BASE_FOLDER = '/home/mengr/Project/wsd/wsd_data/'
    dataset_paths = DataSetPaths(environment='luoz3_x1')
    DATASET_PATH = dataset_paths.upmc_all_no_mark_txt
    OUTPUT_PATH = dataset_paths.upmc_all_no_mark_folder

    PATH_PROCESSED_INVENTORY_PKL = dataset_paths.sense_inventory_pkl

    # Get pickle generated from mimic_inventory.py
    inventory = pickle_reader(PATH_PROCESSED_INVENTORY_PKL)
    inventory_rmapper = inventory['longform-abbr_cui']

    ######################################
    # Processing
    ######################################
    with open(DATASET_PATH, 'r') as f:
        txt_list = f.readlines()
    print("Loaded %d docs from %s" % (len(txt_list), DATASET_PATH))
    # Replace long forms with their abbreviations
    mimic_txt_processed = longform_replacer(txt_list, inventory_rmapper, n_jobs=50)
    # Save to file
    txt_writer(mimic_txt_processed, OUTPUT_PATH+'train_no_mark_longform_replaced.txt')
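
The sorted(..., key=operator.itemgetter(0)) call at the top of this example restores the original document order after parallel processing. A self-contained sketch of that index-tag-and-restore pattern using only the standard library (the lower-casing worker is a stand-in for the real per-document replacement logic):

import operator
from multiprocessing import Pool

def _worker(args):
    idx, txt = args
    return idx, txt.lower()  # placeholder for the real processing step

def parallel_map_keep_order(txt_list, n_jobs=4):
    with Pool(n_jobs) as pool:
        tagged = pool.map(_worker, enumerate(txt_list))
    # Re-sort by the original index, then drop it, as in the snippet above.
    return [txt for _, txt in sorted(tagged, key=operator.itemgetter(0))]
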
Example #8
    # Read original sense inventory (only one word abbrs)
    MSH_sense_inventory_one_word, MSH_sense_inventory = sense_inventory_msh(msh_path+"/benchmark_mesh.txt", abbr_list)

    # save sense inventory to json
    json_writer(MSH_sense_inventory_one_word, msh_processed_path + "/MSH_sense_inventory_one_word.json")
    json_writer(MSH_sense_inventory, msh_processed_path + "/MSH_sense_inventory.json")

    #############################
    # Process MSH documents (only one word abbrs)
    #############################
    msh_txt_annotated = add_annotation_msh(MSH_sense_inventory_one_word, msh_path)

    # Initialize processor and tokenizer
    processor = TextProcessor([
        white_space_remover])
    toknizer = CoreNLPTokenizer()
    token_filter = TextTokenFilter()
    filter_processor = TextProcessor([
        token_filter,
        repeat_non_word_remover,
        recover_upper_cui])

    # pre-processing
    msh_txt = processor.process_texts(msh_txt_annotated, n_jobs=10)
    # tokenizing
    msh_txt_tokenized = toknizer.process_texts(msh_txt, n_jobs=10)
    # Filter trivial tokens and Remove repeat non-words
    msh_txt_filtered = filter_processor.process_texts(msh_txt_tokenized, n_jobs=10)
    # Write to file
    txt_writer(msh_txt_filtered, msh_processed_path+"/msh_processed.txt")
Example #9
    ######################################
    # Read texts from dataset
    ######################################

    # File paths
    data_path = "/home/luoz3/wsd_data"
    upmc_all_path = data_path + "/upmc/batch1_4"
    upmc_all_processed_path = upmc_all_path + "/processed"
    os.makedirs(upmc_all_processed_path, exist_ok=True)

    #############################
    # Process DataSet documents (only one word abbrs)
    #############################

    # Initialize processor and tokenizer
    token_filter = TextTokenFilter()
    processor = TextProcessor([
        white_space_remover,
        token_filter,
        repeat_non_word_remover,
    ])

    upmc_all_txt = txt_reader(data_path + "/upmc_batch1_4/upmc_no_mark_new.txt")
    # pre-processing
    upmc_all_txt = processor.process_texts(upmc_all_txt, n_jobs=30)
    # Write to file
    txt_writer(upmc_all_txt, upmc_all_processed_path+"/train_no_mark.txt")

    print()
Example #10
        # read file
        filename = 'processed_text_chunk_%s.json' % i
        print("-"*50)
        print("Start File for %s" % filename)
        mimic_txt = []
        mimic_present_senses = []

        if not os.path.exists(PATH_FOLDER+filename):
            continue

        for line in open(PATH_FOLDER+filename, "r"):
            obj = json.loads(line)
            text = obj['TEXT']
            present_senses = obj['present_senses']
            mimic_txt.append(text)
            mimic_present_senses.append(present_senses)

        # pre-processing
        mimic_txt = processor.process_texts(mimic_txt, n_jobs=30)
        # tokenizing
        mimic_txt_tokenized = toknizer.process_texts(mimic_txt, n_jobs=40)
        # Filter trivial tokens
        mimic_txt_filtered = filter_processor.process_texts(mimic_txt_tokenized, n_jobs=40)
        # Replace long forms with their abbreviations
        mimic_txt_processed = longform_replacer(mimic_txt_filtered, mimic_present_senses, inventory_rmapper, n_jobs=16)
        # Remove repeat non-words
        mimic_txt_processed = remove_repeat_processor.process_texts(mimic_txt_processed, n_jobs=40)
        # Save to file
        txt_writer(mimic_txt_processed, PATH_FOLDER_PROCESSED + '%s.txt' % filename[:-5])
Example #11
    # save sense inventory to json
    json_writer(sense_inventory,
                dataset_processed_path + "/dataset_sense_inventory.json")

    #############################
    # Process DataSet documents (only one word abbrs)
    #############################

    dataset_txt_annotated = add_annotation_dataset(sense_inventory,
                                                   dataset_path)

    # Initialize processor and tokenizer
    processor = TextProcessor([white_space_remover, sub_deid_patterns_dataset])

    toknizer = CoreNLPTokenizer()

    token_filter = TextTokenFilter()
    filter_processor = TextProcessor(
        [token_filter, repeat_non_word_remover, recover_upper_cui])

    # pre-processing
    dataset_txt = processor.process_texts(dataset_txt_annotated, n_jobs=30)
    # tokenizing
    dataset_txt_tokenized = toknizer.process_texts(dataset_txt, n_jobs=30)
    # Filter trivial tokens and Remove repeat non-words
    dataset_txt_filtered = filter_processor.process_texts(
        dataset_txt_tokenized, n_jobs=30)
    # Write to file
    txt_writer(dataset_txt_filtered,
               dataset_processed_path + "/dataset_processed.txt")