# Example #1
import os
#from sklearn.metrics import precision_recall_fscore_support, classification_report
from seqeval.metrics import classification_report
from seqeval.metrics.v1 import precision_recall_fscore_support as precision_recall_fscore_support
from seqeval.scheme import IOBES
import Bert
import Cross_validation
from Corpus import Corpus,words2IOBES
from Model import BertRecNER
from Parameters import global_param
from Train import train_save, prediction
from itertools import chain

# ---- Corpus loading -------------------------------------------------------
# Build the PGxCorpus reader and pull the training sentences and labels.
head = global_param.model_param['head']
corpus = Corpus('data/PGxCorpus', 'pgx', head=head)
X_app, Y_app, Tokens = corpus.get_data()
print(len(Y_app))

# ---- Extra run parameters -------------------------------------------------
do_valid = True        # evaluate on a held-out split
fold_num = 10          # number of cross-validation folds
do_cross_valid = True  # run the full k-fold loop

# Training hyper-parameters taken from the global configuration.
nb_epoch = global_param.traning_param['num_ep']       # e.g. 5
lr = global_param.traning_param['lr']                 # e.g. 3e-5
bert_type = global_param.model_param['bert']          # e.g. 'bert'
F_type = global_param.traning_param['F_type']         # e.g. 'macro'
exp_name = global_param.traning_param['exp_tag']

# Hostname of the machine running the experiment.
machine_name = os.uname().nodename
# Example #2
def Corpus_Loading(path, name='snpphena'):
    """Load a relation data set and its (cached) precomputed features.

    Features are computed once per (path, fine-tuning, bert) combination
    and cached on disk under ``./Features/<name>/<tag>``; later calls with
    the same configuration load the cache instead of recomputing.

    :param path: the path of the data-set directory
    :param name: the name of the data set (also the cache sub-directory)
    :return: tuple ``(dataset_XF, dataset_Y, Nb_class)`` — input features,
        integer class labels, and number of classes
    """

    bert = global_param.model_param['bert']
    finetuning = '' if not global_param.model_param[
        'fine_tuning'] else 'fine_tuning'

    # Cache root directories — created unconditionally, so use makedirs
    # with exist_ok instead of a race-prone exists()+mkdir pair.
    Features_dir = "./Features"
    os.makedirs(Features_dir, exist_ok=True)

    corpus = Corpus(path, name)

    Features_corpus_dir = "./Features/" + name
    os.makedirs(Features_corpus_dir, exist_ok=True)

    # One cache entry per (path, fine-tuning, bert) combination; this
    # directory's existence is the "features already computed" flag, so
    # the exists() check here is deliberate and must stay.
    tag = path.replace('/', '_') + '_' + finetuning + '_' + bert
    if not os.path.exists(Features_corpus_dir + "/" + tag):
        os.mkdir(Features_corpus_dir + "/" + tag)

        dataset_X, dataset_Y_Name = corpus.get_data()

        dataset_XF, dataset_Y = [], []

        # BUG FIX: total was len(dataset_Y), which is always 0 at this
        # point — use the number of examples actually iterated.
        pbar = tqdm(total=len(dataset_X), desc="Features Computing : ")
        for X in dataset_X:
            sentence, entity1, entity2 = X[0], X[1], X[2]

            ind1, ind2 = indx_entity(sentence,
                                     entity1), indx_entity(sentence, entity2)

            sentence_ = sentence
            if global_param.corpus_param['annonimitation']:
                # Replace the two entity mentions with fixed mask tokens.
                masks = global_param.corpus_param['entitys_masks']
                sentence_ = sentence_.replace(entity1, masks[0])
                sentence_ = sentence_.replace(entity2, masks[1])

            if global_param.corpus_param['encapculate']:
                # Wrap each entity mention with marker tokens.
                # BUG FIX: start from sentence_ (not the original sentence)
                # so a preceding anonymisation pass is not discarded when
                # both options are enabled. (Debug print removed.)
                items = global_param.corpus_param['encapsulate_items']
                sentence_ = sentence_.replace(entity1,
                                              items[0] + entity1 + items[1])
                sentence_ = sentence_.replace(entity2,
                                              items[2] + entity2 + items[3])

            if finetuning == '':
                # Frozen-BERT path: hand-crafted sentence features plus the
                # corpus-type indicator.
                FX = Sentence_Features(sentence_,
                                       remove_e=False,
                                       inde1=ind1,
                                       inde2=ind2), corpus_type(name)
            else:
                # Fine-tuning path: raw BERT model inputs.
                FX = get_bert_inputs(sentence_)

            dataset_XF.append(FX)

            pbar.update(1)

        pbar.close()

        # Map textual association labels to integer class ids.
        Association_type = corpus.Association_type
        for e in dataset_Y_Name:
            dataset_Y.append(Association_type[string_normaliz(e)])

        Save_Featurs(dataset_XF, dataset_Y, Features_corpus_dir + "/" + tag)

    else:
        # Cache hit: load the previously computed features.
        dataset_XF, dataset_Y = Load_Featurs(Features_corpus_dir + "/" + tag)

    Nb_class = corpus.nb_association
    print("Corpus {} loaded ".format(name))
    print(" NB Class : {} \n NB Relation : {}".format(Nb_class,
                                                      len(dataset_Y)))
    print(" class size ")

    # Per-class example counts.
    counter = collections.Counter(dataset_Y)
    for i in range(Nb_class):
        print("       C{} [ {} ] ".format(i, counter[i]))

    return dataset_XF, dataset_Y, Nb_class