Example #1
0
    def __init__(self, fname):
        """Populate the configuration object from the properties file ``fname``.

        The method reads the dataset and embedding paths, loads the
        train/dev/test documents with ``parsers.readHeadFile``, derives the
        label/character/position sets from all three splits, loads the word
        vectors (building a ``.pkl`` cache next to the embeddings file on the
        first run), preprocesses every split, and finally copies the
        training, hyperparameter and evaluation settings onto ``self``.

        Args:
            fname: Path of the properties file handed to
                ``parsers.read_properties``.
        """
        config_file = parsers.read_properties(fname)
        self.config_fname = fname

        # load data
        self.filename_embeddings = config_file.getProperty(
            "filename_embeddings")
        self.filename_train = config_file.getProperty("filename_train")
        self.filename_test = config_file.getProperty("filename_test")
        self.filename_dev = config_file.getProperty("filename_dev")

        self.train_id_docs = parsers.readHeadFile(self.filename_train)
        self.dev_id_docs = parsers.readHeadFile(self.filename_dev)
        self.test_id_docs = parsers.readHeadFile(self.filename_test)

        # get labels for the whole collection (train + dev + test)
        dataset_documents = []
        dataset_documents.extend(self.train_id_docs)
        dataset_documents.extend(self.dev_id_docs)
        dataset_documents.extend(self.test_id_docs)
        self.dataset_set_characters = utils.getCharsFromDocuments(
            dataset_documents)
        self.dataset_set_bio_tags, self.dataset_set_ec_tags = (
            utils.getEntitiesFromDocuments(dataset_documents))
        self.dataset_set_relations = utils.getRelationsFromDocuments(
            dataset_documents)
        self.dataset_set_pos1, self.dataset_set_pos2 = (
            utils.getPositionFromDocuments(dataset_documents))

        # The parsed embeddings are cached as "<filename_embeddings>.pkl".
        embeddings_cache = self.filename_embeddings + ".pkl"
        if not os.path.isfile(embeddings_cache):
            is_binary = self.filename_embeddings.endswith(".bin")
            (self.wordvectors, self.representationsize,
             self.words) = utils.readWordvectorsNumpy(
                 self.filename_embeddings, isBinary=is_binary)
            self.wordindices = utils.readIndices(
                self.filename_embeddings, isBinary=is_binary)
            joblib.dump((self.wordvectors, self.representationsize,
                         self.words, self.wordindices), embeddings_cache)
        else:
            # loading the cached tuple is faster than re-reading the vectors
            (self.wordvectors, self.representationsize, self.words,
             self.wordindices) = joblib.load(embeddings_cache)

        # Every split is preprocessed with the same vocabularies.
        for id_docs in (self.train_id_docs, self.dev_id_docs,
                        self.test_id_docs):
            parsers.preprocess(id_docs, self.wordindices,
                               self.dataset_set_characters,
                               self.dataset_set_pos1, self.dataset_set_pos2,
                               self.dataset_set_bio_tags,
                               self.dataset_set_ec_tags,
                               self.dataset_set_relations)

        # training
        self.nepochs = int(config_file.getProperty("nepochs"))
        self.optimizer = config_file.getProperty("optimizer")
        self.activation = config_file.getProperty("activation")
        self.learning_rate = float(config_file.getProperty("learning_rate"))
        self.gradientClipping = utils.strToBool(
            config_file.getProperty("gradientClipping"))
        self.nepoch_no_imprv = int(config_file.getProperty("nepoch_no_imprv"))
        self.use_dropout = utils.strToBool(
            config_file.getProperty("use_dropout"))
        self.ner_loss = config_file.getProperty("ner_loss")
        self.ner_classes = config_file.getProperty("ner_classes")
        self.use_chars = utils.strToBool(config_file.getProperty("use_chars"))
        self.use_adversarial = utils.strToBool(
            config_file.getProperty("use_adversarial"))
        self.initializer = config_file.getProperty("initializer")
        self.use_position = utils.strToBool(
            config_file.getProperty("use_position"))
        self.use_GRU = utils.strToBool(config_file.getProperty("use_GRU"))
        self.self_attention = utils.strToBool(
            config_file.getProperty("self_attention"))
        self.use_bias = utils.strToBool(config_file.getProperty("use_bias"))

        # hyperparameters
        self.dropout_embedding = float(
            config_file.getProperty("dropout_embedding"))
        self.dropout_lstm = float(config_file.getProperty("dropout_lstm"))
        self.dropout_lstm_output = float(
            config_file.getProperty("dropout_lstm_output"))
        self.dropout_fcl_ner = float(
            config_file.getProperty("dropout_fcl_ner"))
        self.dropout_fcl_rel = float(
            config_file.getProperty("dropout_fcl_rel"))
        self.gru_keep_prob = float(config_file.getProperty("gru_keep_prob"))
        self.hidden_size_lstm = int(
            config_file.getProperty("hidden_size_lstm"))
        self.hidden_size_n1 = int(config_file.getProperty("hidden_size_n1"))
        self.num_lstm_layers = int(config_file.getProperty("num_lstm_layers"))
        self.char_embeddings_size = int(
            config_file.getProperty("char_embeddings_size"))
        self.hidden_size_char = int(
            config_file.getProperty("hidden_size_char"))
        self.label_embeddings_size = int(
            config_file.getProperty("label_embeddings_size"))
        self.alpha = float(config_file.getProperty("alpha"))
        self.lmcost_lstm_gamma = float(
            config_file.getProperty("lmcost_lstm_gamma"))
        self.lmcost_joint_lstm_gamma = float(
            config_file.getProperty("lmcost_joint_lstm_gamma"))
        # NOTE(review): parsed as float although the name suggests a size;
        # left unchanged -- confirm against the consumer of this value.
        self.lmcost_hidden_layer_size = float(
            config_file.getProperty("lmcost_hidden_layer_size"))
        self.lmcost_max_vocab_size = int(
            config_file.getProperty("lmcost_max_vocab_size"))
        self.gru_size = int(config_file.getProperty("gru_size"))
        self.pos_num = int(config_file.getProperty("pos_num"))
        self.pos_size = int(config_file.getProperty("pos_size"))
        self.num_heads = int(config_file.getProperty("num_heads"))
        self.weight_b = float(config_file.getProperty("weight_b"))

        # evaluation
        self.evaluation_method = config_file.getProperty("evaluation_method")
        # bool() of any non-empty string (including "False") is True, so the
        # flag is parsed with the same helper as every other boolean option.
        self.root_node = utils.strToBool(
            config_file.getProperty("root_node"))

        self.shuffle = False
        self.batchsize = 1
    def __init__(self, fname):
        """Populate the configuration object from the properties file ``fname``.

        The method optionally loads pretrained word vectors (cached as
        ``<filename_embeddings>.pkl``), builds or reloads the train/dev/test
        ``reader.BinDataset`` objects (each cached as a ``*_tweet.pkl`` file
        next to its source file), wraps them in ``DataLoader`` instances and
        copies the training, hyperparameter and evaluation settings onto
        ``self``.

        Args:
            fname: Path of the properties file handed to
                ``parsers.read_properties``.
        """
        config_file = parsers.read_properties(fname)
        self.config_fname = fname

        # load data
        self.pretrained_embeddings = utils.strToBool(
            config_file.getProperty("pretrained_embeddings"))
        self.filename_embeddings = config_file.getProperty(
            "filename_embeddings")

        # Stays empty without pretrained embeddings; otherwise it carries the
        # embeddings file name so dataset caches differ per embedding file.
        name_of_embeddings = ""

        # Both values below are replaced by the ones stored with the word
        # vectors when pretrained embeddings are enabled.
        self.embeddings_size = int(config_file.getProperty("embeddings_size"))
        self.word_to_ix = {}

        if self.pretrained_embeddings:
            name_of_embeddings = "_" + os.path.basename(
                self.filename_embeddings)

            embeddings_cache = self.filename_embeddings + ".pkl"
            if not os.path.isfile(embeddings_cache):
                (self.wordvectors, self.embeddings_size,
                 self.word_to_ix) = utils.readWordvectorsNumpy(
                     self.filename_embeddings,
                     isBinary=self.filename_embeddings.endswith(".bin"))
                joblib.dump((self.wordvectors, self.embeddings_size,
                             self.word_to_ix), embeddings_cache)
            else:
                # loading the cached tuple is faster
                (self.wordvectors, self.embeddings_size,
                 self.word_to_ix) = joblib.load(embeddings_cache)

        self.filename_train = config_file.getProperty("filename_train")
        self.filename_dev = config_file.getProperty("filename_dev")
        self.filename_test = config_file.getProperty("filename_test")
        self.pad_length = int(config_file.getProperty("pad_length"))

        # Dataset caches are keyed by source file, embeddings name and
        # padding length.
        cache_suffix = (name_of_embeddings + "_" + str(self.pad_length) +
                        "_tweet.pkl")

        train_cache = self.filename_train + cache_suffix
        if not os.path.isfile(train_cache):
            train = reader.BinDataset(
                self.filename_train, isTrain=True,
                pretrained_embeddings=self.pretrained_embeddings,
                word_to_ix=self.word_to_ix, pad_length=self.pad_length)
            joblib.dump(train, train_cache)
        else:
            train = joblib.load(train_cache)  # loading is faster

        # The dictionaries come from the training set and are reused when the
        # dev/test sets are built.
        (self.word_to_ix, self.tag_to_ix, self.event_to_ix,
         self.ec_to_ix) = train.getDictionaries()

        dev_cache = self.filename_dev + cache_suffix
        if not os.path.isfile(dev_cache):
            dev = reader.BinDataset(
                self.filename_dev, isTrain=False,
                word_to_ix=self.word_to_ix, tag_to_ix=self.tag_to_ix,
                event_to_ix=self.event_to_ix, pad_length=self.pad_length)
            joblib.dump(dev, dev_cache)
        else:
            dev = joblib.load(dev_cache)  # loading is faster

        test_cache = self.filename_test + cache_suffix
        if not os.path.isfile(test_cache):
            test = reader.BinDataset(
                self.filename_test, isTrain=False,
                word_to_ix=self.word_to_ix, tag_to_ix=self.tag_to_ix,
                event_to_ix=self.event_to_ix, pad_length=self.pad_length)
            joblib.dump(test, test_cache)
        else:
            test = joblib.load(test_cache)  # loading is faster

        # Console output retained from the original implementation.
        print(train)

        self.train_loader = DataLoader(train, batch_size=1, shuffle=False)
        self.dev_loader = DataLoader(dev, batch_size=1, shuffle=False)
        self.test_loader = DataLoader(test, batch_size=1, shuffle=False)

        print()

        # training
        self.nepochs = int(config_file.getProperty("nepochs"))
        self.optimizer = config_file.getProperty("optimizer")
        self.learning_rate = float(config_file.getProperty("learning_rate"))
        self.use_dropout = utils.strToBool(
            config_file.getProperty("use_dropout"))
        self.use_BIO_LSTM = utils.strToBool(
            config_file.getProperty("use_BIO_LSTM"))
        self.ner_loss = config_file.getProperty("ner_loss")
        self.ner_classes = config_file.getProperty("ner_classes")
        self.bin_features = config_file.getProperty("bin_features").lower()
        self.tweet_representation = config_file.getProperty(
            "tweet_representation").lower()
        self.non_linearity_bin_features = config_file.getProperty(
            "non_linearity_bin_features").lower()
        try:
            self.threshold = float(config_file.getProperty("threshold"))
        except Exception:
            # Best effort: any failure to read or parse the property falls
            # back to 0. ``Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt and SystemExit propagate.
            self.threshold = 0

        # hyperparameters
        self.n_filters = int(config_file.getProperty("n_filters"))
        self.filter_sizes = utils.strToLst(
            config_file.getProperty("filter_sizes"))
        self.batch_norm = utils.strToBool(
            config_file.getProperty("batch_norm"))
        self.cnn_pool = config_file.getProperty("cnn_pool").lower()
        self.dropout_cnn = float(config_file.getProperty("dropout_cnn"))
        self.bin_representation = config_file.getProperty(
            "bin_representation").lower()
        self.dropout_lstm1_output = float(
            config_file.getProperty("dropout_lstm1_output"))

        self.dropout_embedding = float(
            config_file.getProperty("dropout_embedding"))
        self.dropout_lstm2_output = float(
            config_file.getProperty("dropout_lstm2_output"))
        self.dropout_fcl_ner = float(
            config_file.getProperty("dropout_fcl_ner"))
        self.dropout_fcl_rel = float(
            config_file.getProperty("dropout_fcl_rel"))
        self.hidden_dim = int(config_file.getProperty("hidden_dim"))
        self.num_lstm_layers = int(config_file.getProperty("num_lstm_layers"))

        # evaluation
        self.evaluation_method = config_file.getProperty("evaluation_method")

        self.shuffle = False
Example #3
0
# Echo the loaded configuration, one "key<TAB>value" line per entry.
# Single-argument parenthesised prints give the same output on Python 2 and 3.
print("config:")
for c in config:
    print(str(c) + "\t" + str(config[c]))

datafile = config["file"]
# The index mapping is stored as a pickle in "<datafile>_indexMapping".
# The context manager closes the file even when unpickling raises.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(datafile + "_indexMapping", 'rb') as fp:
    sentId2newIndex2oldIndex = pickle.load(fp)

# -1 is the value kept when the config does not provide "iterationSeed".
iterationSeed = -1
if "iterationSeed" in config:
    iterationSeed = int(config["iterationSeed"])
    print("using " + str(iterationSeed) + " as seed for iteration scheme")

# A word vector file is mandatory: the script stops without one.
pretrainedEmbeddings = False
if "wordvectors" in config:
    wordvectorfile = config["wordvectors"]
    wordvectors, representationsize, words = readWordvectorsNumpy(
        wordvectorfile, isWord2vec=True)
    vocabsize = wordvectors.shape[0]
    pretrainedEmbeddings = True
else:
    print("you have to specify a wordvector file in the config")
    exit()

networkfile = config["net"]
lrateOrig = float(config["lrate"])
# Two spaces reproduce the separator the former two-item print statement
# emitted after a string ending in a space.
print("using sgd with learning rate  " + str(lrateOrig))
batch_size = int(config["batchsize"])
contextsize = int(config["contextsize"])
entitysize = int(config["entitysize"])

# Optional coefficients; they stay 0 unless the config overrides them.
myLambda1 = 0
if "lambda1" in config:
    myLambda1 = float(config["lambda1"])
myLambda2 = 0
    def __init__(self, fname):
        """Fill this configuration object from the properties file ``fname``.

        Steps, in order: read the data paths, load the training documents
        (dev and test are left as empty lists), derive the character, tag and
        relation sets, obtain the word vectors (from the ``.pkl`` cache when
        it exists, otherwise by parsing the embeddings file and writing the
        cache), preprocess each split and read all training, hyperparameter
        and evaluation settings.

        Args:
            fname: Path of the properties file given to
                ``parsers.read_properties``.
        """
        config_file = parsers.read_properties(fname)
        prop = config_file.getProperty

        # Small typed readers so each setting below is a single short line.
        def as_int(key):
            return int(prop(key))

        def as_float(key):
            return float(prop(key))

        def as_bool(key):
            return utils.strToBool(prop(key))

        self.config_fname = fname

        # ---- data locations -------------------------------------------
        self.filename_embeddings = prop("filename_embeddings")
        self.filename_train = prop("filename_train")
        self.filename_test = prop("filename_test")
        self.filename_dev = prop("filename_dev")

        # Only the training file is parsed; dev/test stay empty lists.
        self.train_id_docs = parsers.readHeadFile(self.filename_train)
        self.dev_id_docs = []
        self.test_id_docs = []

        # ---- label sets over the whole collection ---------------------
        dataset_documents = []
        for split_docs in (self.train_id_docs, self.dev_id_docs,
                           self.test_id_docs):
            dataset_documents.extend(split_docs)

        self.dataset_set_characters = utils.getCharsFromDocuments(
            dataset_documents)
        self.dataset_set_bio_tags = utils.getEntitiesFromDocuments(
            dataset_documents)
        self.dataset_set_relations = utils.getRelationsFromDocuments(
            dataset_documents)
        self.dataset_set_bio_relation_ners = (
            utils.getRelationNersFromDocuments(dataset_documents))

        # ---- word vectors (pickle cache beside the embeddings file) ----
        pickle_path = self.filename_embeddings + ".pkl"
        if os.path.isfile(pickle_path):
            # reloading the cached tuple is faster than parsing again
            vectors_bundle = joblib.load(pickle_path)
            (self.wordvectors, self.representationsize, self.words,
             self.wordindices) = vectors_bundle
        else:
            binary_format = self.filename_embeddings.endswith(".bin")
            (self.wordvectors, self.representationsize, self.words,
             self.wordindices) = utils.readWordvectorsNumpy(
                 dataset_documents, self.filename_embeddings,
                 isBinary=binary_format)
            joblib.dump(
                (self.wordvectors, self.representationsize, self.words,
                 self.wordindices),
                pickle_path)

        # ---- preprocessing: train, then dev, then test -----------------
        for split_docs in (self.train_id_docs, self.dev_id_docs,
                           self.test_id_docs):
            parsers.preprocess(split_docs, self.wordindices,
                               self.dataset_set_characters,
                               self.dataset_set_bio_tags,
                               self.dataset_set_relations,
                               self.dataset_set_bio_relation_ners)

        # ---- training settings -----------------------------------------
        self.nepochs = as_int("nepochs")
        self.optimizer = prop("optimizer")
        self.activation = prop("activation")
        self.learning_rate = as_float("learning_rate")
        self.gradientClipping = as_bool("gradientClipping")
        self.nepoch_no_imprv = as_int("nepoch_no_imprv")
        self.use_dropout = as_bool("use_dropout")
        self.ner1_loss = prop("ner1_loss")
        self.ner2_loss = prop("ner2_loss")
        self.ner_classes = prop("ner_classes")
        self.use_chars = as_bool("use_chars")
        self.use_adversarial = as_bool("use_adversarial")

        # ---- hyperparameters --------------------------------------------
        self.dropout_embedding = as_float("dropout_embedding")
        self.dropout_lstm = as_float("dropout_lstm")
        self.dropout_lstm_output = as_float("dropout_lstm_output")
        self.dropout_fcl_ner = as_float("dropout_fcl_ner")
        self.dropout_fcl_rel = as_float("dropout_fcl_rel")
        self.hidden_size_lstm = as_int("hidden_size_lstm")
        self.hidden_size_n1 = as_int("hidden_size_n1")
        self.num_lstm_layers = as_int("num_lstm_layers")
        self.num_heads = as_int("num_heads")
        self.char_embeddings_size = as_int("char_embeddings_size")
        self.hidden_size_char = as_int("hidden_size_char")
        self.label_embeddings_size = as_int("label_embeddings_size")
        self.attention_size = as_int("attention_size")
        self.alpha = as_float("alpha")

        # ---- evaluation -------------------------------------------------
        self.evaluation_method = prop("evaluation_method")
        self.root_node = as_bool("root_node")

        # Fixed values, not read from the properties file.
        self.shuffle = False
        self.batchsize = 1