import os
import collections  # used by Corpus_Loading's class-frequency report
from itertools import chain

from seqeval.metrics import classification_report
from seqeval.metrics.v1 import precision_recall_fscore_support
from seqeval.scheme import IOBES

import Bert
import Cross_validation
from Corpus import Corpus, words2IOBES
from Model import BertRecNER
from Parameters import global_param
from Train import train_save, prediction

# NOTE(review): tqdm and the feature helpers (Sentence_Features, indx_entity,
# get_bert_inputs, string_normaliz, Save_Featurs, Load_Featurs, corpus_type)
# are referenced by Corpus_Loading below but are not imported here — confirm
# where they are meant to come from before calling that function.

head = global_param.model_param['head']

corpus = Corpus('data/PGxCorpus', 'pgx', head=head)
X_app, Y_app, Tokens = corpus.get_data()
print(len(Y_app))

# ###### supplementary parameters #######
do_valid = True
fold_num = 10
do_cross_valid = True
nb_epoch = global_param.traning_param['num_ep']   # e.g. 5
lr = global_param.traning_param['lr']             # e.g. 3e-5
bert_type = global_param.model_param['bert']      # e.g. 'bert'
F_type = global_param.traning_param['F_type']     # e.g. 'macro'
exp_name = global_param.traning_param['exp_tag']
machine_name = os.uname()[1]  # hostname; NOTE: os.uname() is POSIX-only
def Corpus_Loading(path, name='snpphena'):
    """
    Load a relation data-set and compute (or reload) its input features.

    Features are cached on disk under ``./Features/<name>/<tag>`` where the
    tag encodes the data-set path, the fine-tuning mode and the BERT variant.
    If the tag directory already exists, features are loaded from the cache
    instead of being recomputed.

    :param path: the path of the data-set
    :param name: the name of the data-set
    :return: tuple ``(dataset_XF, dataset_Y, Nb_class)`` — the input features,
             the integer class labels, and the number of association classes
    """
    bert = global_param.model_param['bert']
    finetuning = '' if not global_param.model_param['fine_tuning'] else 'fine_tuning'

    Features_dir = "./Features"
    os.makedirs(Features_dir, exist_ok=True)

    corpus = Corpus(path, name)

    Features_corpus_dir = "./Features/" + name
    os.makedirs(Features_corpus_dir, exist_ok=True)

    tag = path.replace('/', '_') + '_' + finetuning + '_' + bert

    if not os.path.exists(Features_corpus_dir + "/" + tag):
        # Cache miss: compute the features and persist them under the tag dir.
        os.mkdir(Features_corpus_dir + "/" + tag)
        dataset_X, dataset_Y_Name = corpus.get_data()
        dataset_XF, dataset_Y = [], []
        # BUG FIX: the progress total was len(dataset_Y), which is always 0
        # at this point; use the number of examples instead.
        pbar = tqdm(total=len(dataset_X), desc="Features Computing : ")
        for X in dataset_X:
            sentence, entity1, entity2 = X[0], X[1], X[2]
            ind1, ind2 = indx_entity(sentence, entity1), indx_entity(sentence, entity2)
            sentence_ = sentence
            if global_param.corpus_param['annonimitation']:
                # Replace both entity mentions with anonymisation masks.
                masks = global_param.corpus_param['entitys_masks']
                sentence_ = sentence.replace(entity1, masks[0])
                sentence_ = sentence_.replace(entity2, masks[1])
            if global_param.corpus_param['encapculate']:
                # BUG FIX: chain from sentence_ (not the raw sentence) so a
                # preceding anonymisation pass is not silently discarded.
                items = global_param.corpus_param['encapsulate_items']
                sentence_ = sentence_.replace(entity1, items[0] + entity1 + items[1])
                sentence_ = sentence_.replace(entity2, items[2] + entity2 + items[3])
            print(sentence_)  # debug trace of the preprocessed sentence
            if finetuning == '':
                # Hand-crafted features + corpus-type indicator.
                FX = Sentence_Features(sentence_, remove_e=False,
                                       inde1=ind1, inde2=ind2), corpus_type(name)
            else:
                # Fine-tuning mode: raw BERT model inputs.
                FX = get_bert_inputs(sentence_)
            dataset_XF.append(FX)
            pbar.update(1)
        pbar.close()
        # Map textual association names to integer class ids.
        Association_type = corpus.Association_type
        for e in dataset_Y_Name:
            dataset_Y.append(Association_type[string_normaliz(e)])
        Save_Featurs(dataset_XF, dataset_Y, Features_corpus_dir + "/" + tag)
    else:
        # Cache hit: reload previously computed features.
        dataset_XF, dataset_Y = Load_Featurs(Features_corpus_dir + "/" + tag)

    Nb_class = corpus.nb_association
    print("Corpus {} loaded ".format(name))
    print(" NB Class : {} \n NB Relation : {}".format(Nb_class, len(dataset_Y)))
    print(" class size ")
    counter = collections.Counter(dataset_Y)
    for i in range(Nb_class):
        print(" C{} [ {} ] ".format(i, counter[i]))
    return dataset_XF, dataset_Y, Nb_class