Code example #1
import copy
import os

#`pu` and `su` are project-specific helper modules (parsing and stemming
#utilities); their imports are omitted in the original snippet.


def main(args):
    data_path = args.data_path  #'C:/Users/André/Documents/FCUL/2º Ano/CANTEMIST/organized version/cantemist_data/'
    vocab_path = args.vocab_path  #'C:/Users/André/Documents/FCUL/2º Ano/CANTEMIST/organized version/output/'
    out_path = args.output_path  #'C:/Users/André/Documents/FCUL/2º Ano/CANTEMIST/organized version/test_aval/'

    if not os.path.exists(out_path):
        print('Creating path %s' % out_path)
        os.mkdir(out_path)

    #Generates the dictionary of labels from the label correspondence file
    with open(vocab_path + 'label_correspondence.txt', encoding='utf-8') as flabels:
        labels = flabels.readlines()

    #Dict with ECIE-O codes as keys
    dict_labels = {}
    for line in labels:
        fields = line.rstrip('\n').split('=')
        dict_labels[fields[1]] = (fields[0], fields[2])

    #Reads dev data to fill part of the test files
    l_dev_txt, l_dev_labels = pu.read_files(data_path, 'dev1')
    l_dev_labels_ori = copy.deepcopy(l_dev_labels)
    l_dev_labels = pu.convert_labels(l_dev_labels, dict_labels)

    #Reads tst set data
    l_tst_aval_txt, l_tst_aval_labels = pu.read_test_set_files(data_path)
    l_tst_aval_labels_ori = copy.deepcopy(l_tst_aval_labels)
    l_tst_aval_labels = pu.convert_labels(l_tst_aval_labels, dict_labels)

    #Stems the data
    su.check_nltk_punkt()
    stemmer = su.set_stemmer('spanish')
    print('Stemming dev1 text...')
    l_stem_text_dev = su.list_stemming(l_dev_txt, stemmer)
    print('Stemming test aval text...')
    l_stem_text_tst_aval = su.list_stemming(l_tst_aval_txt, stemmer)

    #Creates the test aval files
    #The articles and their labels must be split into 48 sets of 250 articles each,
    #matching the number of articles in the test set of the trained X-Transformer models.
    #In each file, the first 109 lines hold text from the test&background set to classify;
    #the remaining 141 lines hold text from dev set 1 and are used to find the best
    #confidence threshold for the predictions.
    cnt = 1
    ini = 0
    fin = 109
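    #48 chunks x 109 documents: indices 0..5231 of the test&background texts are consumed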

    while cnt <= 48:
        l_chunk_txt = l_stem_text_tst_aval[ini:fin] + l_stem_text_dev[0:141]
        l_chunk_labels = l_tst_aval_labels[ini:fin] + l_dev_labels[0:141]
        l_chunk_labels_ori = l_tst_aval_labels_ori[ini:fin] + l_dev_labels_ori[
            0:141]

        pu.write_files(l_chunk_txt, l_chunk_labels, l_chunk_labels_ori,
                       out_path, 'test_' + str(cnt))

        ini = fin
        fin = fin + 109
        cnt += 1
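For context, a minimal command-line driver for this script could look like the sketch below. The flag names are assumptions that mirror the attributes read from args in main() (data_path, vocab_path, output_path); the original snippet does not include the parser.

import argparse

if __name__ == '__main__':
    #Hypothetical entry point: the flag names mirror the attributes used in
    #main() above, since the real parser is not part of the snippet
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', required=True)
    parser.add_argument('--vocab_path', required=True)
    parser.add_argument('--output_path', required=True)
    main(parser.parse_args())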
Code example #2
import copy
import os

import pandas as pd

#`pu` and `su` are project-specific helper modules; their imports are omitted
#in the original snippet.


def main(args):
    data_path = args.input_path  #'C:/Users/André/Documents/FCUL/2º Ano/CANTEMIST/cantemist_new/'
    out_path = args.output_path  #'output/'

    if not os.path.exists(out_path):
        print('Creating path %s' % out_path)
        os.mkdir(out_path)

    #Generates .tsv file with all unique labels present in the txt files
    pu.gen_vocab_tsv(data_path, out_path)

    #Reads the generated .tsv file
    ecie_data = pd.read_csv(out_path + 'cantemist_terms.tsv', sep='\t')

    #Stores the Spanish terms and the respective ECIE-O codes in lists
    l_codes = ecie_data['Code'].astype(str).values.tolist()
    l_terms = ecie_data['Terms'].astype(str).values.tolist()

    #Generates vocab and label_correspondence files and returns dict with label correspondence
    dict_labels = pu.gen_vocab(l_terms, l_codes, out_path)

    #Reads training data
    l_trn_txt, l_trn_labels = pu.read_files(data_path, 'trn')
    #creates a copy of the original labels because X-Transformer needs it
    l_trn_labels_ori = copy.deepcopy(l_trn_labels)
    #converts the labels to their corresponding numeric identifier
    l_trn_labels = pu.convert_labels(l_trn_labels, dict_labels)

    #Reads dev data
    #l_dev_txt, l_dev_labels = pu.read_files(data_path, 'dev1')
    #dev2 is used because dev1 contains one unlabelled file, which makes X-Transformer fail.
    #If that file were removed instead, the tst_aval processing would have to change, since
    #the X-Transformer model would no longer have a test set with 250 documents.
    l_dev_txt, l_dev_labels = pu.read_files(data_path, 'dev2')
    l_dev_labels_ori = copy.deepcopy(l_dev_labels)
    l_dev_labels = pu.convert_labels(l_dev_labels, dict_labels)

    #Reads extra dev data
    #The dev1 files are used to create a larger train set.
    #The file that has no assigned labels is removed.
    l_extra_txt, l_extra_labels = pu.read_files(data_path, 'dev1')
    l_extra_txt.pop(212)  #text file with no assigned labels
    l_extra_labels.pop(212)  #text file with no assigned labels
    l_extra_labels_ori = copy.deepcopy(l_extra_labels)
    l_extra_labels = pu.convert_labels(l_extra_labels, dict_labels)

    #Stems the data
    su.check_nltk_punkt()
    stemmer = su.set_stemmer('spanish')
    print('Stemming trn text...')
    l_stem_text_trn = su.list_stemming(l_trn_txt, stemmer)
    print('Stemming dev text...')
    l_stem_text_dev = su.list_stemming(l_dev_txt, stemmer)
    print('Stemming extra text...')
    l_stem_text_extra = su.list_stemming(l_extra_txt, stemmer)

    #Writes files
    pu.write_files(l_stem_text_trn, l_trn_labels, l_trn_labels_ori, out_path,
                   'train')
    pu.write_files(l_stem_text_dev, l_dev_labels, l_dev_labels_ori, out_path,
                   'test')

    #Appends the extra data to the train data
    l_stem_text_trn.extend(l_stem_text_extra)
    l_trn_labels.extend(l_extra_labels)
    l_trn_labels_ori.extend(l_extra_labels_ori)
    #Writes larger train set
    pu.write_files(l_stem_text_trn, l_trn_labels, l_trn_labels_ori, out_path,
                   'train_750')
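For intuition on the label handling above, here is a small, self-contained illustration of how a label_correspondence.txt produced by pu.gen_vocab would be consumed. The label_id=code=term line layout is an assumption inferred from the split('=') parsing in code example #1, and the ECIE-O values are invented:

#Illustration only: sample lines assume the label_id=code=term layout parsed
#in code example #1; the values themselves are invented
sample_lines = ['0=8000/0=neoplasia benigna\n', '1=8000/3=neoplasia maligna\n']

dict_labels = {}
for line in sample_lines:
    label_id, code, term = line.rstrip('\n').split('=')
    dict_labels[code] = (label_id, term)

print(dict_labels['8000/3'])  #('1', 'neoplasia maligna')
#pu.convert_labels presumably uses such a dict to map per-document code lists
#like [['8000/0', '8000/3']] to their numeric identifiers [['0', '1']]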
Code example #3
import ast
import json
import logging
import os

from pandas import json_normalize  #pandas >= 1.0; older code imported it from pandas.io.json

#`tfu`, `mu` and `su` are project-specific helper modules; `gen_dict_corr` and
#`convert_labels` are local helpers defined elsewhere in the script.


def main(args):
    finput_folder = args.input_folder
    finput_mesh = args.input_mesh_file
    finput_bioasq = args.bioasq_file
    out_path = args.output_path
    dtype = args.dtype
    mer = args.mer
    n_cores = args.mer_cores
    
    assert os.path.exists(finput_folder), "Folder does not exist"
    assert os.path.exists(finput_mesh), "MeSH file/path doesn't exist"
    assert os.path.splitext(finput_mesh)[-1].lower() == '.txt', \
        "MeSH input file isn't a '.txt' file. Txt file is required."
    assert dtype in ('txt', 'json'), "Invalid data type. Valid values: txt, json"
    
    if not os.path.exists(out_path): 
        logging.info('Creating path %s' % out_path)
        print('Creating path %s' % out_path)
        os.mkdir(out_path)
        
    with open(finput_mesh) as mesh_file: #MeSH_name_id_mapping.txt
        mesh_data = mesh_file.readlines()

    l_mesh_term, l_mesh_code = [], []
    for line in mesh_data:
        fields = line.rstrip('\n').split('=')
        l_mesh_term.append(fields[0])
        l_mesh_code.append(fields[1])
    
    #Generates vocab and label_correspondence files
    tfu.gen_vocab(l_mesh_term, l_mesh_code, out_path)
    
    #Generates dict with label correspondence {MeSH Term: (label number, MeSH Code)}
    dict_labels = gen_dict_corr(l_mesh_term, l_mesh_code)

    with open(finput_bioasq, 'r', encoding='utf-8') as bioasq_input:
        data = json.load(bioasq_input)
        df_bioasq = json_normalize(data['documents'])
    
    df_size = len(df_bioasq)

    #Placeholder labels (0) for the documents to classify; they are kept as-is below
    l_mesh_bioasq = [0] * df_size
    l_abs_bioasq = df_bioasq['abstractText'].values.tolist()
    l_title_bioasq = df_bioasq['title'].values.tolist()     

    l_mesh, l_title, l_abs = [], [], []
    if dtype == 'json':            
        with open(finput_folder + 'bioasq_data_3.json', 'r', encoding='utf-8') as json_file:
            logging.info('Loading json file 3...')
            print('Loading json file 3...')
            data = json.load(json_file)
            df = json_normalize(data)
            
        df = df.dropna()
        
        #stores the values of the codes, abstracts and titles into different lists
        l_mesh = df['meshMajor'].values.tolist()
        l_abs = df['abstractText'].values.tolist()
        l_title = df['title'].values.tolist()
    
    else: #txt
        with open(finput_folder + 'bioasq_data_extra.txt', 'r', encoding='utf-8') as txt_file:
            logging.info('Loading txt file...')
            print('Loading txt file...')
            for line in txt_file:
                aux = line.split('\t')
                l_mesh.append([aux[0]])
                l_title.append(aux[1])
                l_abs.append(aux[2])

        #Converts the MeSH field from string to list
        for i in range(len(l_mesh)):
            l_mesh[i] = ast.literal_eval(l_mesh[i][0])

    logging.info('Converting labels...')
    print('Converting labels...')
    l_mesh = convert_labels(l_mesh, dict_labels)

    logging.info('Preparing data...')
    print('Preparing data...')
    CON_TEST_SIZE = 63732 #This value needs to change if the size of the test.txt file used to train the X-BERT model changes
    
    for i in range(CON_TEST_SIZE):
        if i < df_size:
            #the placeholder label 0 is kept; only the text is cleaned
            l_abs_bioasq[i] = l_abs_bioasq[i].replace(',', '').replace('\n', '')
            l_title_bioasq[i] = l_title_bioasq[i].replace('\n', '')
        else:
            l_mesh_bioasq.append(l_mesh[i])
            l_abs_bioasq.append(l_abs[i].replace(',', '').replace('\n', ''))
            l_title_bioasq.append(l_title[i].replace('\n', ''))
    
    #Generates the stemmer
    su.check_nltk_punkt()
    stemmer = su.set_stemmer('english')

    l_lists = [(l_abs_bioasq, l_title_bioasq, l_mesh_bioasq, out_path+'test', 'test')]
               
    for l in l_lists:            
        logging.info('Processing %s data...' % l[4])
        print('Processing %s data...' % l[4])
        l_stem_text = []
        
        if mer:
            l_mer = []
            logging.info('MERing using mesh_lex...')
            print('MERing using mesh_lex...')
            l_mer = mu.call_simple_mer(l[0], n_cores, 'meshlex')
    
            #appends the MER terms identified earlier to the corresponding titles
            for i in range(len(l[1])):
                l[1][i] = l[1][i] + ' ' + str(l_mer[i])
            
        logging.info('Stemming...')
        print('Stemming...')
        l_stem_text = su.list_stemming(l[1], stemmer)

        logging.info('Writing %s file' % l[3])
        print('Writing %s file' % l[3])
        tfu.write_file(l_stem_text, l[2], l[3])
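gen_dict_corr is called above but not defined in this snippet. Going by the comment at its call site ({MeSH Term: (label number, MeSH Code)}), a minimal sketch could look as follows; the project's actual helper may differ:

def gen_dict_corr(l_mesh_term, l_mesh_code):
    #Sketch under the assumption stated above: label numbers follow list order
    return {term: (str(i), code)
            for i, (term, code) in enumerate(zip(l_mesh_term, l_mesh_code))}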
Code example #4
import json
import logging
import os

import pandas as pd
from pandas import json_normalize  #pandas >= 1.0; older code imported it from pandas.io.json

#`tfu`, `mu` and `su` are project-specific helper modules; their imports are
#omitted in the original snippet.


def main(args):
    finput = args.input_file
    finput_decs = args.input_decs_file
    out_path = args.output_path
    xmlc_alg = args.xmlc_alg
    trn_rat = args.train_ratio
    tst_rat = args.test_ratio
    mer = args.mer
    lexicon = args.mer_lex
    n_cores = args.mer_cores

    #Checks if input files/paths exist
    assert os.path.exists(finput), "Input file/path doesn't exist"
    assert os.path.exists(finput_decs), "DeCS file/path doesn't exist"
    assert os.path.splitext(finput)[-1].lower() == '.json', \
        "Input file isn't a '.json' file. Json file is required."
    assert os.path.splitext(finput_decs)[-1].lower() == '.tsv', \
        "DeCS input file isn't a '.tsv' file. Tsv file is required."
    assert xmlc_alg in ('X-BERT', 'X-Transformer'), \
        "Invalid XMLC algorithm. Valid values: X-BERT, X-Transformer."

    if not os.path.exists(out_path):
        logging.info('Creating path %s' % out_path)
        print('Creating path %s' % out_path)
        os.mkdir(out_path)

    #Reads the DeCS terms file and stores them on separate lists
    logging.info('Reading DeCS terms file \'%s\' ...' % finput_decs)
    print('Reading DeCS terms file \'%s\' ...' % finput_decs)
    decs_data = pd.read_csv(finput_decs, sep='\t')

    #Stores the Spanish terms and the respective DeCS codes in lists
    l_term_spanish = decs_data['Term_Spanish'].astype(str).values.tolist()
    l_decs_code = decs_data['#DeCS_code'].astype(str).values.tolist()

    #Generates vocab and label_correspondence files
    tfu.gen_vocab(l_term_spanish, l_decs_code, out_path)

    #Generates dict with label correspondence
    dict_labels = tfu.gen_dict_label_corr(l_term_spanish, l_decs_code)

    logging.info('Reading MESINESP data \'%s\' ...' % finput)
    print('Reading MESINESP data \'%s\'...' % finput)
    with open(finput, 'r', encoding='utf-8') as json_input:
        data = json.load(json_input)
        df_data = json_normalize(data)
    df_size = len(df_data)

    l_abs_mesinesp = df_data['abstractText'].values.tolist()
    l_title_mesinesp = df_data['title'].values.tolist()
    l_decs_mesinesp = df_data['decsCodes'].values.tolist()
    #l_decs_mesinesp = tfu.convert_labels(l_decs_mesinesp, dict_labels)
    l_decs_mesinesp, l_decs_names = tfu.convert_labels(l_decs_mesinesp,
                                                       dict_labels)

    #Checks that every title has text; empty or missing titles are replaced with
    #the abstract so that MER doesn't break
    for i in range(df_size):
        if l_title_mesinesp[i] is None or len(l_title_mesinesp[i]) <= 0:
            if l_abs_mesinesp[i] is not None and len(l_abs_mesinesp[i]) > 0:
                l_title_mesinesp[i] = l_abs_mesinesp[i]

    logging.info('Splitting the data into different sets...')
    print('Splitting the data into different sets...')
    trn_limit = int(df_size * trn_rat / 100)
    tst_limit = int(df_size * tst_rat / 100)

    (l_train_decs, l_train_decs_names, l_train_abs, l_train_title,
     l_test_decs, l_test_decs_names, l_test_abs, l_test_title,
     l_valid_decs, l_valid_abs, l_valid_title) = tfu.split_data(
         l_decs_mesinesp, l_abs_mesinesp, l_title_mesinesp, df_size,
         trn_limit, tst_limit, xmlc_alg, l_decs_names)

    #For titles
    if xmlc_alg == 'X-Transformer':
        l_lists = [(l_train_abs, l_train_title, l_train_decs,
                    out_path + 'train', 'train', l_train_decs_names),
                   (l_test_abs, l_test_title, l_test_decs, out_path + 'test',
                    'test', l_test_decs_names)]
    else:  #X-BERT
        l_lists = [(l_train_abs, l_train_title, l_train_decs,
                    out_path + 'train', 'train'),
                   (l_test_abs, l_test_title, l_test_decs, out_path + 'test',
                    'test'),
                   (l_valid_abs, l_valid_title, l_valid_decs,
                    out_path + 'valid', 'valid')]

    #Generates the stemmer
    su.check_nltk_punkt()
    stemmer = su.set_stemmer('spanish')

    for l in l_lists:
        logging.info('Processing %s data...' % l[4])
        print('Processing %s data...' % l[4])
        l_stem_text = []

        if mer:
            l_mer = []
            logging.info('MERing using %s ...' % lexicon)
            print('MERing using %s ...' % lexicon)
            if lexicon == 'decslex':
                l_mer = mu.call_simple_mer(l[0], n_cores)
            else:
                l_mer = mu.call_custom_mer(l[0], lexicon, n_cores)

            #appends the MER terms identified earlier to the corresponding titles
            for i in range(len(l[1])):
                l[1][i] = l[1][i] + ' ' + str(l_mer[i])

        logging.info('Stemming...')
        print('Stemming...')
        l_stem_text = su.list_stemming(l[1], stemmer)

        logging.info('Writing %s file' % l[3])
        print('Writing %s file' % l[3])
        tfu.write_file(l_stem_text, l[2], l[3])

        if xmlc_alg == 'X-Transformer':
            logging.info('Writing %s raw file' % l[3])
            print('Writing %s raw file' % l[3])
            tfu.write_raw_files(l_stem_text, l[5], l[3])
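As a quick sanity check on the ratio arithmetic above (the numbers here are illustrative, not taken from the dataset):

#Illustrative numbers only
df_size = 10000
trn_rat, tst_rat = 70, 15
trn_limit = int(df_size * trn_rat / 100)  #7000 documents for the train set
tst_limit = int(df_size * tst_rat / 100)  #1500 documents for the test set
#tfu.split_data presumably leaves the remaining 1500 documents to the
#validation set that X-BERT uses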
Code example #5
import ast
import json
import logging
import os

import pandas as pd
from pandas import json_normalize  #pandas >= 1.0; older code imported it from pandas.io.json

#`tfu`, `mu` and `su` are project-specific helper modules; `gen_dict_corr` and
#`convert_labels` are local helpers defined elsewhere in the script.


def main(args):
    finput_folder = args.input_folder
    finput_mesh = args.input_mesh_file
    out_path = args.output_path
    dtype = args.dtype
    trn_rat = args.trr
    tst_rat = args.tsr
    mer = args.mer
    n_cores = args.mer_cores

    assert os.path.exists(finput_folder), "Folder does not exist"
    assert os.path.exists(finput_mesh), "MeSH file/path doesn't exist"
    assert os.path.splitext(finput_mesh)[-1].lower() == '.txt', \
        "MeSH input file isn't a '.txt' file. Txt file is required."
    assert dtype in ('txt', 'json'), "Invalid data type. Valid values: txt, json"

    if not os.path.exists(out_path):
        logging.info('Creating path %s' % out_path)
        print('Creating path %s' % out_path)
        os.mkdir(out_path)

    with open(finput_mesh) as mesh_file:  #MeSH_name_id_mapping.txt
        mesh_data = mesh_file.readlines()

    l_mesh_term, l_mesh_code = [], []
    for line in mesh_data:
        fields = line.rstrip('\n').split('=')
        l_mesh_term.append(fields[0])
        l_mesh_code.append(fields[1])

    #Generates vocab and label_correspondence files
    tfu.gen_vocab(l_mesh_term, l_mesh_code, out_path)

    #Generates dict with label correspondence {MeSH Term: (label number, MeSH Code)}
    dict_labels = gen_dict_corr(l_mesh_term, l_mesh_code)

    l_mesh, l_title, l_abs = [], [], []
    if dtype == 'json':
        with open(finput_folder + 'bioasq_data_1.json', 'r',
                  encoding='utf-8') as json_file:
            logging.info('Loading json file 1...')
            print('Loading json file 1...')
            data = json.load(json_file)
            df = json_normalize(data)

        with open(finput_folder + 'bioasq_data_2.json', 'r',
                  encoding='utf-8') as json_file:
            logging.info('Loading json file 2...')
            print('Loading json file 2...')
            data = json.load(json_file)
            #DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
            df = pd.concat([df, json_normalize(data)], ignore_index=True)

        df = df.dropna()

        #stores the values of the codes, abstracts and titles into different lists
        l_mesh = df['meshMajor'].values.tolist()
        l_abs = df['abstractText'].values.tolist()
        l_title = df['title'].values.tolist()

    else:  #txt
        with open(finput_folder + 'bioasq_data.txt', 'r',
                  encoding='utf-8') as txt_file:
            logging.info('Loading txt file...')
            print('Loading txt file...')
            for line in txt_file:
                aux = line.split('\t')
                l_mesh.append([aux[0]])
                l_title.append(aux[1])
                l_abs.append(aux[2])

        #Converts the MeSH field from string to list
        for i in range(len(l_mesh)):
            l_mesh[i] = ast.literal_eval(l_mesh[i][0])

    logging.info('Converting labels...')
    print('Converting labels...')
    l_mesh = convert_labels(l_mesh, dict_labels)

    logging.info('Splitting the data into train, test and validation...')
    print('Splitting the data into train, test and validation...')
    CON_LIMIT = 318658
    #int() keeps the limits usable as slice indices (plain / yields floats in
    #Python 3), matching the MESINESP script above
    trn_limit = int(CON_LIMIT * trn_rat / 100)
    tst_limit = int(CON_LIMIT * tst_rat / 100)

    (l_train_mesh, _, l_train_abs, l_train_title,
     l_test_mesh, _, l_test_abs, l_test_title,
     l_valid_mesh, l_valid_abs, l_valid_title) = tfu.split_data(
         l_mesh, l_abs, l_title, CON_LIMIT, trn_limit, tst_limit, 'X-BERT')

    l_lists = [
        (l_train_abs, l_train_title, l_train_mesh, out_path + 'train',
         'train'),
        (l_test_abs, l_test_title, l_test_mesh, out_path + 'test', 'test'),
        (l_valid_abs, l_valid_title, l_valid_mesh, out_path + 'valid', 'valid')
    ]

    #Generates the stemmer
    su.check_nltk_punkt()
    stemmer = su.set_stemmer('english')

    for l in l_lists:
        logging.info('Processing %s data...' % l[4])
        print('Processing %s data...' % l[4])
        l_stem_text = []

        if mer:
            l_mer = []
            logging.info('MERing using mesh_lex...')
            print('MERing using mesh_lex...')
            l_mer = mu.call_simple_mer(l[0], n_cores, 'meshlex')

            #appends the MER terms identified earlier to the corresponding titles
            for i in range(len(l[1])):
                l[1][i] = l[1][i] + ' ' + str(l_mer[i])

        logging.info('Stemming...')
        print('Stemming...')
        l_stem_text = su.list_stemming(l[1], stemmer)

        logging.info('Writing %s file' % l[3])
        print('Writing %s file' % l[3])
        tfu.write_file(l_stem_text, l[2], l[3])
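One readability note on the loop above: each tuple is addressed positionally (l[0] abstracts, l[1] titles, l[2] labels, l[3] output file, l[4] set name). A NamedTuple would make those slots self-documenting; this is a sketch against the variables of this script, not the project's code:

from typing import List, NamedTuple

class SplitSet(NamedTuple):
    #Mirrors the positional layout of the tuples in l_lists above
    abstracts: List[str]
    titles: List[str]
    labels: List[list]
    out_file: str
    name: str

l_lists = [
    SplitSet(l_train_abs, l_train_title, l_train_mesh, out_path + 'train', 'train'),
    SplitSet(l_test_abs, l_test_title, l_test_mesh, out_path + 'test', 'test'),
    SplitSet(l_valid_abs, l_valid_title, l_valid_mesh, out_path + 'valid', 'valid'),
]
for s in l_lists:
    print('Processing %s data...' % s.name)  #instead of l[4]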