def __init__(self, source="E:/corpus/bionlp2011/project_data/"):
    '''
    Constructor.

    Loads the word and trigger dictionaries from the "train" split and
    builds a DocumentBuilder on top of them.

    :param source: root directory of the corpus data. Generalized from a
        hard-coded path; the default preserves the original behavior.
    '''
    self.source = source
    WD = WordDictionary(self.source)
    WD.load("train")
    TD = TriggerDictionary(self.source)
    TD.load("train")
    self.builder = DocumentBuilder(self.source, WD, TD)
def __init__(self, source="E:/corpus/bionlp2011/project_data/"):
    '''
    Constructor.

    Loads the word and trigger dictionaries from the "dev" split and
    builds a SentenceAnalyzer on top of them.

    :param source: root directory of the corpus data. Generalized from a
        hard-coded path; the default preserves the original behavior.
    '''
    WD = WordDictionary(source)
    WD.load("dev")
    TD = TriggerDictionary(source)
    TD.load("dev")
    self.TC = SentenceAnalyzer(WD, TD)
def _set(self, dict_type):
    """
    Initialize the word/trigger dictionaries for the requested dictionary
    type, then build the document builder and feature extractor on top of
    them, so the learning process has everything it needs.
    """
    word_dict = WordDictionary(self.src)
    word_dict.load(dict_type)
    trigger_dict = TriggerDictionary(self.src)
    trigger_dict.load(dict_type)
    # publish the loaded dictionaries, then the components built from them
    self.wdict = word_dict
    self.tdict = trigger_dict
    self.doc_builder = DocumentBuilder(self.src, word_dict, trigger_dict)
    self.extraction = FeatureExtraction(self.src, word_dict, trigger_dict)
def __init__(self, source="E:/corpus/bionlp2011/project_data/"):
    '''
    Constructor.

    Loads the word and trigger dictionaries from the "train" split,
    builds a DocumentBuilder, and prepares a GeniaA2Writer for the
    prediction result directory.

    Fixes over the original: the docstring was duplicated, and
    self.source was read without ever being assigned (AttributeError at
    runtime). The new parameter defaults to the path used by the sibling
    constructors elsewhere in this project.

    :param source: root directory of the corpus data — TODO confirm this
        matches the path the original author intended here.
    '''
    self.source = source
    out_path = "E:/corpus/bionlp2011/project_test/result/model1"
    WD = WordDictionary(self.source)
    WD.load("train")
    TD = TriggerDictionary(self.source)
    TD.load("train")
    self.builder = DocumentBuilder(self.source, WD, TD)
    self.a2writter = GeniaA2Writer(out_path)
def _set(self, dict_type):
    """
    Initialize the word/trigger dictionaries for the requested dictionary
    type, build the document builder and feature extractor on top of
    them, and attach the a2 writer for the output path.
    """
    word_dict = WordDictionary(self.src)
    word_dict.load(dict_type)
    trigger_dict = TriggerDictionary(self.src)
    trigger_dict.load(dict_type)
    # publish the loaded dictionaries, then the components built from them
    self.wdict = word_dict
    self.tdict = trigger_dict
    self.doc_builder = DocumentBuilder(self.src, word_dict, trigger_dict)
    self.extraction = FeatureExtraction(self.src, word_dict, trigger_dict)
    self.a2 = A2Writter(self._out_path)
class Prediction(object): ''' classdocs ''' # suffix and extension of id file DOCID_SUFFIX_EXT = "_doc_ids.json" # directory for saving svm model MODEL_DIR = "/model" # directory for saving output a2 file OUT_DIR = "/result" # list of event name EVENT_NAME = ["None", "Gene_expression", "Transcription", "Protein_catabolism", "Phosphorylation", "Localization", "Binding", "Regulation", "Positive_regulation", "Negative_regulation"] def __init__(self, source, dir_name, dict_type): ''' Constructor ''' self.src = source self._model_path = '' self._out_path = '' self.set_path(source, dir_name) self.dict_type = dict_type self.wdict = None self.tdict = None self.doc_builder = None self.extraction = None self.docs = {} self._set(dict_type) def _set(self, dict_type): """ initialize dictionary type to be used in feature extraction process initialize document builder initialize feature extraction """ self.wdict = WordDictionary(self.src) self.wdict.load(dict_type) self.tdict = TriggerDictionary(self.src) self.tdict.load(dict_type) self.doc_builder = DocumentBuilder(self.src, self.wdict, self.tdict) self.extraction = FeatureExtraction(self.src, self.wdict, self.tdict) self.a2 = A2Writter(self._out_path) def set_path(self, source, dir_name): """ check whether given dir_name is exist raise error if it does not exist return full _model_path of dir_name """ # model path path = source + self.MODEL_DIR + '/' + dir_name if not os.path.exists(path): raise ValueError(path + "does not exist!!, chose another dir_name for prediction") self._model_path = path # output path path = source + self.OUT_DIR + '/' + dir_name if not os.path.exists(path): os.makedirs(path) self._out_path = path def get_feature(self, step): """ extract feature and return X, Y for a given step step are either one of these: 'tp' => trigger-protein relation 'tt' => trigger-trigger relation to predict regulation event with trigger argument 'tc' => trigger-theme-cause relation to predict regulation event with theme and 
cause (binary) 't2' => trigger-theme1-theme2 relation to predict theme2 in binding (binary) """ if step not in ['tt','tp','tc','t2']: raise ValueError("only support step for tt, tp, tc and t2") X = [] Y = [] info = [] dt_start = dt.now() # reset statistic of extraction self.extraction.reset_statistic() # init feature print "now extracting", len(self.docs), "docs" for doc_id in self.docs.keys(): o_doc = self.docs[doc_id] if step == 'tp': samples = self.extraction.extract_tp(o_doc) elif step == 'tt': samples = self.extraction.extract_tt(o_doc) elif step == 'tc': samples = self.extraction.extract_tc(o_doc) elif step == 't2': samples = self.extraction.extract_t2(o_doc) for sample in samples: X.append(sample[2]) Y.append(sample[1]) info.append(sample[0]) print "time to extract feature", dt.now() - dt_start return X,Y, info def set_prediction_docs(self,docid_list_fname, is_test = True): """ build a document to be predicted """ dt_start = dt.now() self.docs = {} # get list of file doc_ids = self.get_docid_list(docid_list_fname) print "now building", len(doc_ids), "docs" for doc_id in doc_ids: self.docs[doc_id] = self.doc_builder.build(doc_id, is_test) print "finish built docs in:", dt.now() - dt_start def update_doc_info(self, list_info, list_target, arg_name, arg_type): """ update trigger and relation of document """ for i in range(0,len(list_info)): target = list_target[i] if target < 1: continue info = list_info[i] doc_id = info["doc"] self.docs[doc_id].update(info['sen'], info['t'], self.EVENT_NAME[target], info['a'], arg_name, arg_type) def update_doc_relation(self, rel_type, list_info, list_target): """ update only relation of document """ for i in range(0,len(list_info)): target = list_target[i] if target == 1: info = list_info[i] doc_id = info["doc"] if rel_type == 'cause': arg = info['c'] else: arg = info['a2'] self.docs[doc_id].update_relation(rel_type, info['sen'], info['t'], arg) def get_docid_list(self, docid_list_fname): """ return list of file """ if not 
isinstance(docid_list_fname, list): # get list of doc ids from file path = self.src + '/' + docid_list_fname + self.DOCID_SUFFIX_EXT if not os.path.exists(path): raise ValueError(path + " is not exist") with open(path, 'r') as f: doc_ids = json.loads(f.read()) else: doc_ids = docid_list_fname return doc_ids def predict_tp(self, grid_search = True): """ return prediction of given docid_list """ if self.docs == {}: raise ValueError("docs have not been created. call set_prediction_docs first!") # get list of file #doc_ids = self.get_docid_list(docid_list_fname) # get features and target X, Y, info = self.get_feature('tp') # init svm classifier svm = SVM(self._model_path, "trig-prot", "linear", grid_search = grid_search, class_weight = 'auto') svm.load() return svm.predict(X), Y, info def predict_tt(self, grid_search = True): """ return prediction of given docid_list """ if self.docs == {}: raise ValueError("docs have not been created. call set_prediction_docs first!") # get list of file #doc_ids = self.get_docid_list(docid_list_fname) # get features and target X, Y, info = self.get_feature('tt') # init svm classifier svm = SVM(self._model_path, "trig-trig", "linear", grid_search = grid_search, class_weight = 'auto') svm.load() return svm.predict(X), Y, info def predict_tc(self, grid_search = True): if self.docs == {}: raise ValueError("docs have not been created. call set_prediction_docs first!") # get list of file #doc_ids = self.get_docid_list(docid_list_fname) # get features and target X, Y, info = self.get_feature('tc') # init svm classifier svm = SVM(self._model_path, "trig-theme-cause", "linear", grid_search = grid_search, class_weight = 'auto') svm.load() return svm.predict(X), Y, info def predict_t2(self, grid_search = True): if self.docs == {}: raise ValueError("docs have not been created. 
call set_prediction_docs first!") # get features and target X, Y, info = self.get_feature('t2') # init svm classifier svm = SVM(self._model_path, "trig-theme1-2", "linear", grid_search = grid_search, class_weight = 'auto') svm.load() return svm.predict(X), Y, info def predict(self, docid_list_fname, write_result = True): # create document object for prediction self.set_prediction_docs(docid_list_fname) # predict trigger-protein relation Ypred, _, info = self.predict_tp(grid_search = True) # update document self.update_doc_info(info, Ypred, "Theme", "P") # predict trigger-trigger relation for _ in range(0,2): Ypred, _, info = self.predict_tt(grid_search = True) self.update_doc_info(info, Ypred, "Theme", "E") # predict trigger-theme-cause relation Ypred, _, info = self.predict_tc(grid_search = True) self.update_doc_relation('cause', info, Ypred) # predict theme2 relation Ypred, _, info = self.predict_t2(grid_search = True) self.update_doc_relation('theme2', info, Ypred) # write a2 if write_result: self.write_result() def write_result(self): print "now writing", len(self.docs), "docs result to", self._out_path for doc in self.docs.itervalues(): self.a2.write(doc)
class Learning(object): ''' Learning steps: 1. define docs for learning 2. extract features 3. build input data for classifier 4. build a model and save it ''' # suffix and extension of id file DOCID_SUFFIX_EXT = "_doc_ids.json" # directory for saving svm model MODEL_DIR = "/model" def __init__(self, source, dir_name, dict_type): ''' Constructor ''' self.src = source self.path = self.set_path(source, dir_name) self.dict_type = dict_type self.wdict = None self.tdict = None self.doc_builder = None self.extraction = None self._set(dict_type) def set_path(self, source, dir_name): """ check whether given dir_name is exist raise error if exist, otherwise create new one return full path of dir_name """ path = source + self.MODEL_DIR + '/' + dir_name if os.path.exists(path): raise ValueError(path + "exist!!, chose anoher dir_name for learning") else: # create dir_name os.makedirs(path) return path def _set(self, dict_type): """ initialize dictionary type to be used in learning process initialize document builder initialize feature extraction """ self.wdict = WordDictionary(self.src) self.wdict.load(dict_type) self.tdict = TriggerDictionary(self.src) self.tdict.load(dict_type) self.doc_builder = DocumentBuilder(self.src, self.wdict, self.tdict) self.extraction = FeatureExtraction(self.src, self.wdict, self.tdict) def get_docid_list(self, docid_list_fname): """ return list of file """ if not isinstance(docid_list_fname, list): # get list of doc ids from file path = self.src + '/' + docid_list_fname + self.DOCID_SUFFIX_EXT if not os.path.exists(path): raise ValueError(path + " is not exist") with open(path, 'r') as f: doc_ids = json.loads(f.read()) else: doc_ids = docid_list_fname return doc_ids def get_feature(self, doc_ids, step): """ extract feature and return X, Y for a given step step are either one of these: 'tp' => trigger-protein relation 'tt' => trigger-trigger relation to predict regulation event with trigger argument 'tc' => trigger-theme-cause relation to predict 
regulation event with theme and cause (binary) 't2' => trigger-theme1-theme2 relation to predict theme2 in binding (binary) """ if step not in ['tt', 'tp', 'tc', 't2']: raise ValueError("only support step for tt, tp, tc and t2") X = [] Y = [] dt_start = dt.now() # reset statistic of extraction self.extraction.reset_statistic() # init feature print "now extracting", len(doc_ids), "docs" for doc_id in doc_ids: o_doc = self.doc_builder.build(doc_id) if step == 'tp': samples = self.extraction.extract_tp(o_doc) elif step == 'tt': samples = self.extraction.extract_tt(o_doc) elif step == 'tc': samples = self.extraction.extract_tc(o_doc) elif step == 't2': samples = self.extraction.extract_t2(o_doc) for sample in samples: X.append(sample[2]) Y.append(sample[1]) # print statistic pos = self.extraction.sample_pos neg = self.extraction.sample_neg stat = (pos, neg, pos + neg) print stat print "percentege of positif data:", pos * 100.0 / (pos + neg) print "time to extract feature", dt.now() - dt_start return X, Y def learn_tp(self, docid_list_fname, grid_search): # get list of file doc_ids = self.get_docid_list(docid_list_fname) # get features and target X, Y = self.get_feature(doc_ids, 'tp') # init svm classifier svm = SVM(self.path, 'trig-prot', 'linear', grid_search=grid_search, class_weight='auto') svm.create() # fit training data svm.learn(X, Y) def learn_tt(self, docid_list_fname, grid_search): # get list of file doc_ids = self.get_docid_list(docid_list_fname) # get features and target X, Y = self.get_feature(doc_ids, 'tt') # init svm classifier svm = SVM(self.path, 'trig-trig', 'linear', grid_search=grid_search, class_weight='auto') svm.create() # fit training data svm.learn(X, Y) def learn_tc(self, docid_list_fname, grid_search): # get list of file doc_ids = self.get_docid_list(docid_list_fname) # get features and target X, Y = self.get_feature(doc_ids, 'tc') # init svm classifier svm = SVM(self.path, 'trig-theme-cause', 'linear', grid_search=grid_search, 
class_weight='auto') svm.create() # fit training data svm.learn(X, Y) def learn_t2(self, docid_list_fname, grid_search): # get list of file doc_ids = self.get_docid_list(docid_list_fname) # get features and target X, Y = self.get_feature(doc_ids, 't2') # init svm classifier svm = SVM(self.path, 'trig-theme1-2', 'linear', grid_search=grid_search, class_weight='auto') svm.create() # fit training data svm.learn(X, Y)
'''
Created on Sep 18, 2013

@author: Andresta
'''

import json, os

from model.Dictionary import WordDictionary, TriggerDictionary
from model.Document import DocumentBuilder
from collections import Counter

# NOTE(review): module-level setup for a corpus-statistics script — builds a
# DocumentBuilder over the "mix" dictionary split and prepares Counters; the
# code that fills the counters presumably follows this chunk — verify.
source = "E:/corpus/bionlp2011/project_data"

WD = WordDictionary(source)
WD.load("mix")

TD = TriggerDictionary(source)
TD.load("mix")

builder = DocumentBuilder(source, WD, TD)

# counter for dependency
edge_cnt = Counter()
len_cnt = Counter()

# counter for chunk
cdist_cnt = Counter()
nprep_cnt = Counter()

# counter for sentence
        # NOTE(review): tail of a method whose def starts before this chunk —
        # cond1, trig_wn, arg1_wn, arg2_wn, o_sen and label are bound earlier
        # in that method; indentation level here is assumed — confirm.
        cond2 = o_sen.rel.check_relation(trig_wn, arg2_wn, "Theme2", "P")
        cond3 = o_sen.rel.check_relation(trig_wn, arg1_wn, "Theme2", "P")
        cond4 = o_sen.rel.check_relation(trig_wn, arg2_wn, "Theme", "P")
        # positive when the (Theme, Theme2) pair matches in either
        # argument order
        if (cond1 and cond2) or (cond3 and cond4):
            label = 1
        return label


if __name__ == "__main__":
    # smoke test: build a single document and print the first 50
    # trigger-protein samples produced by feature extraction
    source = "E:/corpus/bionlp2011/project_data"
    doc_id = "PMID-9878621"
    #doc_id = "PMID-9351352"
    WD = WordDictionary(source)
    WD.load("train")
    TD = TriggerDictionary(source)
    TD.load("train")
    builder = DocumentBuilder(source, WD, TD)
    doc = builder.read_raw(doc_id)
    # is_test=False keeps gold annotations so the sample labels below
    # are meaningful
    o_doc = builder.build_doc_from_raw(doc, is_test=False)
    FE = FeatureExtraction(source, WD, TD)
    feature = FE.extract_tp(o_doc)
    for f in feature[0:50]:
        # f appears to be (info, target, feature_dict) — confirm against
        # FeatureExtraction.extract_tp
        print f[0], f[1]
        print f[2]