Ejemplo n.º 1
0
 def __init__(self):
     """
     Load the train-mode word and trigger dictionaries from the corpus
     directory and create the document builder.
     """
     self.source = "E:/corpus/bionlp2011/project_data/"

     word_dict = WordDictionary(self.source)
     word_dict.load("train")

     trigger_dict = TriggerDictionary(self.source)
     trigger_dict.load("train")

     self.builder = DocumentBuilder(self.source, word_dict, trigger_dict)
Ejemplo n.º 2
0
    def __init__(self):
        """
        Set up the corpus data source, load the train-mode word and
        trigger dictionaries, and wire up the document builder.
        """
        self.source = "E:/corpus/bionlp2011/project_data/"

        wd = WordDictionary(self.source)
        wd.load("train")

        td = TriggerDictionary(self.source)
        td.load("train")

        self.builder = DocumentBuilder(self.source, wd, td)
Ejemplo n.º 3
0
 def __init__(self):
     """
     Build a SentenceAnalyzer backed by dev-mode word and trigger
     dictionaries.
     """
     corpus_dir = "E:/corpus/bionlp2011/project_data/"

     word_dict = WordDictionary(corpus_dir)
     word_dict.load("dev")

     trigger_dict = TriggerDictionary(corpus_dir)
     trigger_dict.load("dev")

     self.TC = SentenceAnalyzer(word_dict, trigger_dict)
Ejemplo n.º 4
0
    def _set(self, dict_type):
        """
        Prepare the learning pipeline.

        Loads the word and trigger dictionaries for *dict_type*, then
        wires up the document builder and the feature extraction
        component on top of them.
        """
        self.wdict = wd = WordDictionary(self.src)
        wd.load(dict_type)

        self.tdict = td = TriggerDictionary(self.src)
        td.load(dict_type)

        self.doc_builder = DocumentBuilder(self.src, wd, td)
        self.extraction = FeatureExtraction(self.src, wd, td)
Ejemplo n.º 5
0
    def __init__(self):
        """
        Load the train-mode word and trigger dictionaries and create the
        document builder plus the A2 result writer.

        NOTE(review): this constructor reads ``self.source`` but never
        assigns it; the attribute must be provided elsewhere (e.g. as a
        class attribute) or this raises AttributeError -- confirm.
        """
        # fix: removed a duplicated '''Constructor''' docstring -- the second
        # copy was a dead no-op string statement
        out_path = "E:/corpus/bionlp2011/project_test/result/model1"

        WD = WordDictionary(self.source)
        WD.load("train")

        TD = TriggerDictionary(self.source)
        TD.load("train")

        self.builder = DocumentBuilder(self.source, WD, TD)
        self.a2writter = GeniaA2Writer(out_path)
Ejemplo n.º 6
0
 def __init__(self):
     """
     Load the train-mode word and trigger dictionaries and create the
     document builder plus the A2 result writer.

     NOTE(review): reads ``self.source`` but never assigns it; the
     attribute must be provided elsewhere or this raises
     AttributeError -- confirm.
     """
     # fix: removed a duplicated '''Constructor''' docstring -- the second
     # copy was a dead no-op string statement
     out_path = "E:/corpus/bionlp2011/project_test/result/model1"

     WD = WordDictionary(self.source)
     WD.load("train")

     TD = TriggerDictionary(self.source)
     TD.load("train")

     self.builder = DocumentBuilder(self.source, WD, TD)
     self.a2writter = GeniaA2Writer(out_path)
Ejemplo n.º 7
0
 def _set(self, dict_type):
     """
     Prepare the feature-extraction pipeline.

     Loads the word and trigger dictionaries for *dict_type*, wires up
     the document builder and feature extraction, and creates the A2
     writer pointed at the output path.
     """
     self.wdict = wd = WordDictionary(self.src)
     wd.load(dict_type)

     self.tdict = td = TriggerDictionary(self.src)
     td.load(dict_type)

     self.doc_builder = DocumentBuilder(self.src, wd, td)
     self.extraction = FeatureExtraction(self.src, wd, td)

     self.a2 = A2Writter(self._out_path)
Ejemplo n.º 8
0
class Prediction(object):
    '''
    Predict BioNLP-2011 event structures for a set of documents using
    previously trained SVM models, and write the results as A2 files.

    Typical usage: construct with a corpus root, a model directory name
    and a dictionary type, then call predict() with a doc-id list.
    (Python 2 code: uses print statements and dict.itervalues.)
    '''

    # suffix and extension of id file
    DOCID_SUFFIX_EXT = "_doc_ids.json"

    # directory for saving svm model
    MODEL_DIR = "/model"

    # directory for saving output a2 file
    OUT_DIR = "/result"

    # list of event names; index 0 ("None") means "no event", so a
    # predicted class label indexes directly into this list
    EVENT_NAME = ["None",
                  "Gene_expression",
                  "Transcription",
                  "Protein_catabolism",
                  "Phosphorylation",
                  "Localization",
                  "Binding",
                  "Regulation",
                  "Positive_regulation",
                  "Negative_regulation"]


    def __init__(self, source, dir_name, dict_type):
        '''
        Constructor.

        source    -- corpus root directory
        dir_name  -- sub-directory (under MODEL_DIR / OUT_DIR) that holds
                     the trained models and receives the A2 output
        dict_type -- dictionary flavour to load (e.g. "train")
        '''
        self.src = source
        self._model_path = ''
        self._out_path = ''
        self.set_path(source, dir_name)

        self.dict_type = dict_type
        self.wdict = None
        self.tdict = None
        self.doc_builder = None
        self.extraction = None

        # doc_id -> built document object, filled by set_prediction_docs()
        self.docs = {}

        self._set(dict_type)

    def _set(self, dict_type):
        """
        initialize dictionary type to be used in feature extraction process
        initialize document builder
        initialize feature extraction
        """

        self.wdict = WordDictionary(self.src)
        self.wdict.load(dict_type)

        self.tdict = TriggerDictionary(self.src)
        self.tdict.load(dict_type)

        self.doc_builder = DocumentBuilder(self.src, self.wdict, self.tdict)
        self.extraction = FeatureExtraction(self.src, self.wdict, self.tdict)

        self.a2 = A2Writter(self._out_path)

    def set_path(self, source, dir_name):
        """
        check whether given dir_name is exist
        raise error if it does not exist
        return full _model_path of dir_name

        Also creates the output directory if missing.
        """
        # model path: must already exist (models are produced by Learning)
        path = source + self.MODEL_DIR + '/' + dir_name
        if not os.path.exists(path):
            # NOTE(review): message is missing a space after the path
            raise ValueError(path + "does not exist!!, chose another dir_name for prediction")
        self._model_path = path

        # output path: created on demand
        path = source + self.OUT_DIR + '/' + dir_name
        if not os.path.exists(path):
            os.makedirs(path)
        self._out_path = path


    def get_feature(self, step):
        """
        extract feature and return X, Y for a given step
        step are either one of these:
        'tp' => trigger-protein relation
        'tt' => trigger-trigger relation to predict regulation event with trigger argument  
        'tc' => trigger-theme-cause relation to predict regulation event with theme and cause (binary)
        't2' => trigger-theme1-theme2 relation to predict theme2 in binding (binary)

        Returns (X, Y, info): feature vectors, gold labels, and
        per-sample metadata dicts used later to update the documents.
        """
        if step not in ['tt','tp','tc','t2']:
            raise ValueError("only support step for tt, tp, tc and t2")

        X = []
        Y = []
        info = []

        dt_start = dt.now()

        # reset statistic of extraction
        self.extraction.reset_statistic()

        # init feature
        print "now extracting", len(self.docs), "docs"
        for doc_id in self.docs.keys():
            o_doc = self.docs[doc_id]
            if step == 'tp':
                samples = self.extraction.extract_tp(o_doc)
            elif step == 'tt':
                samples = self.extraction.extract_tt(o_doc)
            elif step == 'tc':
                samples = self.extraction.extract_tc(o_doc)
            elif step == 't2':
                samples = self.extraction.extract_t2(o_doc)

            # each sample is (info, label, feature) -- TODO confirm ordering
            for sample in samples:
                X.append(sample[2])
                Y.append(sample[1])
                info.append(sample[0])

        print "time to extract feature", dt.now() - dt_start

        return X,Y, info

    def set_prediction_docs(self,docid_list_fname, is_test = True):
        """
        build a document to be predicted

        Resets self.docs and builds one document object per doc id.
        """
        dt_start = dt.now()
        self.docs = {}
        # get list of file
        doc_ids = self.get_docid_list(docid_list_fname)

        print "now building", len(doc_ids), "docs"
        for doc_id in doc_ids:
            self.docs[doc_id] = self.doc_builder.build(doc_id, is_test)

        print "finish built docs in:", dt.now() - dt_start

    def update_doc_info(self, list_info, list_target, arg_name, arg_type):
        """
        update trigger and relation of document

        list_target holds predicted class indices into EVENT_NAME;
        label < 1 means "None" (no event) and is skipped.
        """
        for i in range(0,len(list_info)):
            target = list_target[i]
            if target < 1: continue
            info = list_info[i]
            doc_id = info["doc"]
            self.docs[doc_id].update(info['sen'], info['t'], self.EVENT_NAME[target], info['a'], arg_name, arg_type)

    def update_doc_relation(self, rel_type, list_info, list_target):
        """
        update only relation of document

        Binary prediction: only label == 1 adds a relation. rel_type
        'cause' reads the 'c' key of the sample info, otherwise 'a2'.
        """
        for i in range(0,len(list_info)):
            target = list_target[i]
            if target == 1:
                info = list_info[i]
                doc_id = info["doc"]

                if rel_type == 'cause':
                    arg = info['c']
                else:
                    arg = info['a2']
                self.docs[doc_id].update_relation(rel_type, info['sen'], info['t'], arg)


    def get_docid_list(self, docid_list_fname):
        """
        return list of file

        Accepts either a list of doc ids (returned unchanged) or a file
        name whose JSON content is the list.
        """
        if not isinstance(docid_list_fname, list):
            # get list of doc ids from file
            path = self.src + '/' + docid_list_fname + self.DOCID_SUFFIX_EXT
            if not os.path.exists(path):
                raise ValueError(path + " is not exist")
            with open(path, 'r') as f: 
                doc_ids = json.loads(f.read())
        else:
            doc_ids = docid_list_fname

        return doc_ids

    def predict_tp(self, grid_search = True):
        """
        return prediction of given docid_list

        Trigger-protein step: loads the saved "trig-prot" SVM and returns
        (predictions, gold labels, sample info).
        """
        if self.docs == {}:
            raise ValueError("docs have not been created. call set_prediction_docs first!")
        # get list of file
        #doc_ids = self.get_docid_list(docid_list_fname)

        # get features and target
        X, Y, info = self.get_feature('tp')

        # init svm classifier
        svm = SVM(self._model_path, "trig-prot", "linear", grid_search = grid_search, class_weight = 'auto')
        svm.load()

        return svm.predict(X), Y, info

    def predict_tt(self, grid_search = True):
        """
        return prediction of given docid_list

        Trigger-trigger step (regulation events with trigger arguments).
        """
        if self.docs == {}:
            raise ValueError("docs have not been created. call set_prediction_docs first!")
        # get list of file
        #doc_ids = self.get_docid_list(docid_list_fname)

        # get features and target
        X, Y, info = self.get_feature('tt')

        # init svm classifier
        svm = SVM(self._model_path, "trig-trig", "linear", grid_search = grid_search, class_weight = 'auto')
        svm.load()

        return svm.predict(X), Y, info

    def predict_tc(self, grid_search = True):
        # trigger-theme-cause step (binary): regulation events with cause
        if self.docs == {}:
            raise ValueError("docs have not been created. call set_prediction_docs first!")
        # get list of file
        #doc_ids = self.get_docid_list(docid_list_fname)

        # get features and target
        X, Y, info = self.get_feature('tc')

        # init svm classifier
        svm = SVM(self._model_path, "trig-theme-cause", "linear", grid_search = grid_search, class_weight = 'auto')
        svm.load()

        return svm.predict(X), Y, info

    def predict_t2(self, grid_search = True):
        # trigger-theme1-theme2 step (binary): second theme of Binding
        if self.docs == {}:
            raise ValueError("docs have not been created. call set_prediction_docs first!")

        # get features and target
        X, Y, info = self.get_feature('t2')

        # init svm classifier
        svm = SVM(self._model_path, "trig-theme1-2", "linear", grid_search = grid_search, class_weight = 'auto')
        svm.load()

        return svm.predict(X), Y, info

    def predict(self, docid_list_fname, write_result = True):
        """Run the full prediction pipeline and optionally write A2 files."""
        # create document object for prediction
        self.set_prediction_docs(docid_list_fname)

        # predict trigger-protein relation
        Ypred, _, info = self.predict_tp(grid_search = True)
        # update document
        self.update_doc_info(info, Ypred, "Theme", "P")

        # predict trigger-trigger relation; two passes so that events
        # found in the first pass can serve as arguments in the second
        for _ in range(0,2):
            Ypred, _, info = self.predict_tt(grid_search = True)
            self.update_doc_info(info, Ypred, "Theme", "E")

        # predict trigger-theme-cause relation
        Ypred, _, info = self.predict_tc(grid_search = True)
        self.update_doc_relation('cause', info, Ypred)

        # predict theme2 relation
        Ypred, _, info = self.predict_t2(grid_search = True)
        self.update_doc_relation('theme2', info, Ypred)

        # write a2
        if write_result:
            self.write_result()

    def write_result(self):
        # emit one A2 file per predicted document
        print "now writing", len(self.docs), "docs result to", self._out_path
        for doc in self.docs.itervalues():
            self.a2.write(doc)
Ejemplo n.º 9
0
class Learning(object):
    '''
    Learning steps:
    1. define docs for learning
    2. extract features
    3. build input data for classifier
    4. build a model and save it

    Trains one linear SVM per relation step (tp, tt, tc, t2) and saves
    the models under MODEL_DIR/dir_name. (Python 2 code.)
    '''

    # suffix and extension of id file
    DOCID_SUFFIX_EXT = "_doc_ids.json"

    # directory for saving svm model
    MODEL_DIR = "/model"

    def __init__(self, source, dir_name, dict_type):
        '''
        Constructor.

        source    -- corpus root directory
        dir_name  -- new sub-directory (under MODEL_DIR) for the models;
                     must not already exist
        dict_type -- dictionary flavour to load (e.g. "train")
        '''
        self.src = source
        self.path = self.set_path(source, dir_name)

        self.dict_type = dict_type
        self.wdict = None
        self.tdict = None
        self.doc_builder = None
        self.extraction = None

        self._set(dict_type)

    def set_path(self, source, dir_name):
        """
        check whether given dir_name is exist
        raise error if exist, otherwise create new one
        return full path of dir_name
        """
        path = source + self.MODEL_DIR + '/' + dir_name
        if os.path.exists(path):
            # NOTE(review): message is missing a space after the path and
            # misspells "another"
            raise ValueError(path +
                             "exist!!, chose anoher dir_name for learning")
        else:
            # create dir_name
            os.makedirs(path)
        return path

    def _set(self, dict_type):
        """
        initialize dictionary type to be used in learning process
        initialize document builder
        initialize feature extraction
        """

        self.wdict = WordDictionary(self.src)
        self.wdict.load(dict_type)

        self.tdict = TriggerDictionary(self.src)
        self.tdict.load(dict_type)

        self.doc_builder = DocumentBuilder(self.src, self.wdict, self.tdict)
        self.extraction = FeatureExtraction(self.src, self.wdict, self.tdict)

    def get_docid_list(self, docid_list_fname):
        """
        return list of file

        Accepts either a list of doc ids (returned unchanged) or a file
        name whose JSON content is the list.
        """
        if not isinstance(docid_list_fname, list):
            # get list of doc ids from file
            path = self.src + '/' + docid_list_fname + self.DOCID_SUFFIX_EXT
            if not os.path.exists(path):
                raise ValueError(path + " is not exist")
            with open(path, 'r') as f:
                doc_ids = json.loads(f.read())
        else:
            doc_ids = docid_list_fname

        return doc_ids

    def get_feature(self, doc_ids, step):
        """
        extract feature and return X, Y for a given step
        step are either one of these:
        'tp' => trigger-protein relation
        'tt' => trigger-trigger relation to predict regulation event with trigger argument  
        'tc' => trigger-theme-cause relation to predict regulation event with theme and cause (binary)
        't2' => trigger-theme1-theme2 relation to predict theme2 in binding (binary)
        """
        if step not in ['tt', 'tp', 'tc', 't2']:
            raise ValueError("only support step for tt, tp, tc and t2")

        X = []
        Y = []

        dt_start = dt.now()

        # reset statistic of extraction
        self.extraction.reset_statistic()

        # init feature
        print "now extracting", len(doc_ids), "docs"
        for doc_id in doc_ids:
            o_doc = self.doc_builder.build(doc_id)
            if step == 'tp':
                samples = self.extraction.extract_tp(o_doc)
            elif step == 'tt':
                samples = self.extraction.extract_tt(o_doc)
            elif step == 'tc':
                samples = self.extraction.extract_tc(o_doc)
            elif step == 't2':
                samples = self.extraction.extract_t2(o_doc)

            # each sample is (info, label, feature) -- TODO confirm ordering
            for sample in samples:
                X.append(sample[2])
                Y.append(sample[1])

        # print statistic (positive/negative sample balance)
        pos = self.extraction.sample_pos
        neg = self.extraction.sample_neg
        stat = (pos, neg, pos + neg)
        print stat
        print "percentege of positif data:", pos * 100.0 / (pos + neg)
        print "time to extract feature", dt.now() - dt_start

        return X, Y

    def learn_tp(self, docid_list_fname, grid_search):
        """Train and save the trigger-protein SVM model."""
        # get list of file
        doc_ids = self.get_docid_list(docid_list_fname)

        # get features and target
        X, Y = self.get_feature(doc_ids, 'tp')

        # init svm classifier
        svm = SVM(self.path,
                  'trig-prot',
                  'linear',
                  grid_search=grid_search,
                  class_weight='auto')
        svm.create()

        # fit training data
        svm.learn(X, Y)

    def learn_tt(self, docid_list_fname, grid_search):
        """Train and save the trigger-trigger SVM model."""
        # get list of file
        doc_ids = self.get_docid_list(docid_list_fname)

        # get features and target
        X, Y = self.get_feature(doc_ids, 'tt')

        # init svm classifier
        svm = SVM(self.path,
                  'trig-trig',
                  'linear',
                  grid_search=grid_search,
                  class_weight='auto')
        svm.create()

        # fit training data
        svm.learn(X, Y)

    def learn_tc(self, docid_list_fname, grid_search):
        """Train and save the trigger-theme-cause SVM model."""
        # get list of file
        doc_ids = self.get_docid_list(docid_list_fname)

        # get features and target
        X, Y = self.get_feature(doc_ids, 'tc')

        # init svm classifier
        svm = SVM(self.path,
                  'trig-theme-cause',
                  'linear',
                  grid_search=grid_search,
                  class_weight='auto')
        svm.create()

        # fit training data
        svm.learn(X, Y)

    def learn_t2(self, docid_list_fname, grid_search):
        """Train and save the theme2-of-Binding SVM model."""
        # get list of file
        doc_ids = self.get_docid_list(docid_list_fname)

        # get features and target
        X, Y = self.get_feature(doc_ids, 't2')

        # init svm classifier
        svm = SVM(self.path,
                  'trig-theme1-2',
                  'linear',
                  grid_search=grid_search,
                  class_weight='auto')
        svm.create()

        # fit training data
        svm.learn(X, Y)
Ejemplo n.º 10
0
'''
Created on Sep 18, 2013

@author: Andresta

Ad-hoc corpus statistics script: loads the "mix" dictionaries, builds a
document builder, and prepares counters for dependency, chunk and
sentence statistics.
'''

import json, os
from model.Dictionary import WordDictionary, TriggerDictionary
from model.Document import DocumentBuilder

from collections import Counter

# corpus root; NOTE(review): hard-coded Windows path -- adjust per machine
source = "E:/corpus/bionlp2011/project_data"

WD = WordDictionary(source)
WD.load("mix")

TD = TriggerDictionary(source)
TD.load("mix")

builder = DocumentBuilder(source, WD, TD)

# counter for dependency
edge_cnt = Counter()
len_cnt = Counter()

# counter for chunk
cdist_cnt = Counter()
nprep_cnt = Counter()

# counter for sentence
Ejemplo n.º 11
0
'''
Created on Sep 18, 2013

@author: Andresta

Ad-hoc corpus statistics script: loads the "mix" dictionaries, builds a
document builder, and prepares counters for dependency, chunk and
sentence statistics.
'''

import json, os
from model.Dictionary import WordDictionary, TriggerDictionary
from model.Document import DocumentBuilder

from collections import Counter

# corpus root; NOTE(review): hard-coded Windows path -- adjust per machine
source = "E:/corpus/bionlp2011/project_data"

WD = WordDictionary(source)
WD.load("mix")

TD = TriggerDictionary(source)
TD.load("mix")

builder = DocumentBuilder(source, WD, TD)

# counter for dependency
edge_cnt = Counter()
len_cnt = Counter()

# counter for chunk
cdist_cnt = Counter()
nprep_cnt = Counter()

# counter for sentence
Ejemplo n.º 12
0
        cond2 = o_sen.rel.check_relation(trig_wn, arg2_wn, "Theme2", "P")
        cond3 = o_sen.rel.check_relation(trig_wn, arg1_wn, "Theme2", "P")
        cond4 = o_sen.rel.check_relation(trig_wn, arg2_wn, "Theme", "P")
        if (cond1 and cond2) or (cond3 and cond4):
            label = 1

        return label


if __name__ == "__main__":

    # smoke-test trigger-protein feature extraction on a single document
    source = "E:/corpus/bionlp2011/project_data"
    doc_id = "PMID-9878621"
    #doc_id = "PMID-9351352"

    WD = WordDictionary(source)
    WD.load("train")

    TD = TriggerDictionary(source)
    TD.load("train")

    builder = DocumentBuilder(source, WD, TD)
    doc = builder.read_raw(doc_id)

    # is_test=False -- presumably keeps gold annotations in the built
    # document; confirm against DocumentBuilder
    o_doc = builder.build_doc_from_raw(doc, is_test=False)

    # print the first 50 samples: info and label, then the feature dict
    FE = FeatureExtraction(source, WD, TD)
    feature = FE.extract_tp(o_doc)
    for f in feature[0:50]:
        print f[0], f[1]
        print f[2]