def __init__(self, fname):
    """Load configuration, datasets, label sets and word embeddings.

    fname: path to a java-style properties file understood by
    ``parsers.read_properties``. Every option becomes an attribute.
    """
    config_file = parsers.read_properties(fname)
    self.config_fname = fname

    # --- load data ---
    self.filename_embeddings = config_file.getProperty("filename_embeddings")
    self.filename_train = config_file.getProperty("filename_train")
    self.filename_test = config_file.getProperty("filename_test")
    self.filename_dev = config_file.getProperty("filename_dev")

    self.train_id_docs = parsers.readHeadFile(self.filename_train)
    self.dev_id_docs = parsers.readHeadFile(self.filename_dev)
    self.test_id_docs = parsers.readHeadFile(self.filename_test)

    # Label/character/position vocabularies are computed over the whole
    # collection (train + dev + test) so all splits share one index space.
    dataset_documents = []
    dataset_documents.extend(self.train_id_docs)
    dataset_documents.extend(self.dev_id_docs)
    dataset_documents.extend(self.test_id_docs)
    self.dataset_set_characters = utils.getCharsFromDocuments(dataset_documents)
    self.dataset_set_bio_tags, self.dataset_set_ec_tags = utils.getEntitiesFromDocuments(dataset_documents)
    self.dataset_set_relations = utils.getRelationsFromDocuments(dataset_documents)
    self.dataset_set_pos1, self.dataset_set_pos2 = utils.getPositionFromDocuments(dataset_documents)

    # Word vectors are cached next to the embeddings file; loading the
    # pickle is much faster than re-parsing the raw embeddings file.
    embeddings_cache = self.filename_embeddings + ".pkl"
    if not os.path.isfile(embeddings_cache):
        is_binary = self.filename_embeddings.endswith(".bin")
        self.wordvectors, self.representationsize, self.words = utils.readWordvectorsNumpy(
            self.filename_embeddings, isBinary=is_binary)
        self.wordindices = utils.readIndices(self.filename_embeddings, isBinary=is_binary)
        joblib.dump((self.wordvectors, self.representationsize,
                     self.words, self.wordindices), embeddings_cache)
    else:
        self.wordvectors, self.representationsize, self.words, self.wordindices = joblib.load(
            embeddings_cache)  # loading is faster

    # Map tokens/characters/tags of every split to indices, in place.
    for docs in (self.train_id_docs, self.dev_id_docs, self.test_id_docs):
        parsers.preprocess(docs, self.wordindices, self.dataset_set_characters,
                           self.dataset_set_pos1, self.dataset_set_pos2,
                           self.dataset_set_bio_tags, self.dataset_set_ec_tags,
                           self.dataset_set_relations)

    # --- training ---
    self.nepochs = int(config_file.getProperty("nepochs"))
    self.optimizer = config_file.getProperty("optimizer")
    self.activation = config_file.getProperty("activation")
    self.learning_rate = float(config_file.getProperty("learning_rate"))
    self.gradientClipping = utils.strToBool(config_file.getProperty("gradientClipping"))
    self.nepoch_no_imprv = int(config_file.getProperty("nepoch_no_imprv"))
    self.use_dropout = utils.strToBool(config_file.getProperty("use_dropout"))
    self.ner_loss = config_file.getProperty("ner_loss")
    self.ner_classes = config_file.getProperty("ner_classes")
    self.use_chars = utils.strToBool(config_file.getProperty("use_chars"))
    self.use_adversarial = utils.strToBool(config_file.getProperty("use_adversarial"))
    self.initializer = config_file.getProperty("initializer")
    self.use_position = utils.strToBool(config_file.getProperty("use_position"))
    self.use_GRU = utils.strToBool(config_file.getProperty("use_GRU"))
    self.self_attention = utils.strToBool(config_file.getProperty("self_attention"))
    self.use_bias = utils.strToBool(config_file.getProperty("use_bias"))

    # --- hyperparameters ---
    self.dropout_embedding = float(config_file.getProperty("dropout_embedding"))
    self.dropout_lstm = float(config_file.getProperty("dropout_lstm"))
    self.dropout_lstm_output = float(config_file.getProperty("dropout_lstm_output"))
    self.dropout_fcl_ner = float(config_file.getProperty("dropout_fcl_ner"))
    self.dropout_fcl_rel = float(config_file.getProperty("dropout_fcl_rel"))
    self.gru_keep_prob = float(config_file.getProperty("gru_keep_prob"))
    self.hidden_size_lstm = int(config_file.getProperty("hidden_size_lstm"))
    self.hidden_size_n1 = int(config_file.getProperty("hidden_size_n1"))
    self.num_lstm_layers = int(config_file.getProperty("num_lstm_layers"))
    self.char_embeddings_size = int(config_file.getProperty("char_embeddings_size"))
    self.hidden_size_char = int(config_file.getProperty("hidden_size_char"))
    self.label_embeddings_size = int(config_file.getProperty("label_embeddings_size"))
    self.alpha = float(config_file.getProperty("alpha"))
    self.lmcost_lstm_gamma = float(config_file.getProperty("lmcost_lstm_gamma"))
    self.lmcost_joint_lstm_gamma = float(config_file.getProperty("lmcost_joint_lstm_gamma"))
    self.lmcost_hidden_layer_size = float(config_file.getProperty("lmcost_hidden_layer_size"))
    self.lmcost_max_vocab_size = int(config_file.getProperty("lmcost_max_vocab_size"))
    self.gru_size = int(config_file.getProperty("gru_size"))
    self.pos_num = int(config_file.getProperty("pos_num"))
    self.pos_size = int(config_file.getProperty("pos_size"))
    self.num_heads = int(config_file.getProperty("num_heads"))
    self.weight_b = float(config_file.getProperty("weight_b"))

    # --- evaluation ---
    self.evaluation_method = config_file.getProperty("evaluation_method")
    # BUG FIX: the original used bool(...) here, which is True for ANY
    # non-empty string (including "false"); parse it with the same
    # string-to-bool helper used by every other boolean option above.
    self.root_node = utils.strToBool(config_file.getProperty("root_node"))
    self.shuffle = False
    self.batchsize = 1
def __init__(self, fname):
    """Load configuration, optional pre-trained embeddings and the cached
    train/dev/test tweet datasets from the properties file *fname*.

    Builds torch ``DataLoader``s (batch size 1, no shuffling) for the
    three splits and exposes every config option as an attribute.
    """
    config_file = parsers.read_properties(fname)
    self.config_fname = fname

    # --- load data ---
    self.pretrained_embeddings = utils.strToBool(config_file.getProperty("pretrained_embeddings"))
    self.filename_embeddings = config_file.getProperty("filename_embeddings")
    name_of_embeddings = ""
    self.embeddings_size = int(config_file.getProperty("embeddings_size"))
    self.word_to_ix = {}
    if self.pretrained_embeddings:
        # The cache suffix encodes which embeddings file was used so that
        # dataset caches built with different embeddings do not collide.
        name_of_embeddings = "_" + os.path.basename(self.filename_embeddings)
        embeddings_cache = self.filename_embeddings + ".pkl"
        if not os.path.isfile(embeddings_cache):
            self.wordvectors, self.embeddings_size, self.word_to_ix = utils.readWordvectorsNumpy(
                self.filename_embeddings,
                isBinary=self.filename_embeddings.endswith(".bin"))
            joblib.dump((self.wordvectors, self.embeddings_size, self.word_to_ix),
                        embeddings_cache)
        else:
            # loading the pickle is faster than re-parsing the raw vectors
            self.wordvectors, self.embeddings_size, self.word_to_ix = joblib.load(embeddings_cache)

    self.filename_train = config_file.getProperty("filename_train")
    self.filename_dev = config_file.getProperty("filename_dev")
    self.filename_test = config_file.getProperty("filename_test")
    self.pad_length = int(config_file.getProperty("pad_length"))

    def _cache_path(filename):
        # One cache file per (split, embeddings, pad_length) combination.
        return filename + name_of_embeddings + "_" + str(self.pad_length) + "_tweet.pkl"

    def _load_or_build(filename, build):
        # Rebuild and cache the dataset only when no pickle exists yet;
        # loading an existing cache is faster.
        path = _cache_path(filename)
        if os.path.isfile(path):
            return joblib.load(path)
        dataset = build()
        joblib.dump(dataset, path)
        return dataset

    train = _load_or_build(self.filename_train, lambda: reader.BinDataset(
        self.filename_train, isTrain=True,
        pretrained_embeddings=self.pretrained_embeddings,
        word_to_ix=self.word_to_ix, pad_length=self.pad_length))
    # dev/test reuse the dictionaries computed on the training split
    self.word_to_ix, self.tag_to_ix, self.event_to_ix, self.ec_to_ix = train.getDictionaries()
    dev = _load_or_build(self.filename_dev, lambda: reader.BinDataset(
        self.filename_dev, isTrain=False, word_to_ix=self.word_to_ix,
        tag_to_ix=self.tag_to_ix, event_to_ix=self.event_to_ix,
        pad_length=self.pad_length))
    test = _load_or_build(self.filename_test, lambda: reader.BinDataset(
        self.filename_test, isTrain=False, word_to_ix=self.word_to_ix,
        tag_to_ix=self.tag_to_ix, event_to_ix=self.event_to_ix,
        pad_length=self.pad_length))

    print(train)
    self.train_loader = DataLoader(train, batch_size=1, shuffle=False)
    self.dev_loader = DataLoader(dev, batch_size=1, shuffle=False)
    self.test_loader = DataLoader(test, batch_size=1, shuffle=False)
    print()

    # --- training ---
    self.nepochs = int(config_file.getProperty("nepochs"))
    self.optimizer = config_file.getProperty("optimizer")
    self.learning_rate = float(config_file.getProperty("learning_rate"))
    self.use_dropout = utils.strToBool(config_file.getProperty("use_dropout"))
    self.use_BIO_LSTM = utils.strToBool(config_file.getProperty("use_BIO_LSTM"))
    self.ner_loss = config_file.getProperty("ner_loss")
    self.ner_classes = config_file.getProperty("ner_classes")
    self.bin_features = config_file.getProperty("bin_features").lower()
    self.tweet_representation = config_file.getProperty("tweet_representation").lower()
    self.non_linearity_bin_features = config_file.getProperty("non_linearity_bin_features").lower()
    try:
        self.threshold = float(config_file.getProperty("threshold"))
    except Exception:  # property missing or not numeric -> default to 0
        self.threshold = 0

    # --- hyperparameters ---
    self.n_filters = int(config_file.getProperty("n_filters"))
    self.filter_sizes = utils.strToLst(config_file.getProperty("filter_sizes"))
    self.batch_norm = utils.strToBool(config_file.getProperty("batch_norm"))
    self.cnn_pool = config_file.getProperty("cnn_pool").lower()
    self.dropout_cnn = float(config_file.getProperty("dropout_cnn"))
    self.bin_representation = config_file.getProperty("bin_representation").lower()
    self.dropout_lstm1_output = float(config_file.getProperty("dropout_lstm1_output"))
    self.dropout_embedding = float(config_file.getProperty("dropout_embedding"))
    self.dropout_lstm2_output = float(config_file.getProperty("dropout_lstm2_output"))
    self.dropout_fcl_ner = float(config_file.getProperty("dropout_fcl_ner"))
    self.dropout_fcl_rel = float(config_file.getProperty("dropout_fcl_rel"))
    self.hidden_dim = int(config_file.getProperty("hidden_dim"))
    self.num_lstm_layers = int(config_file.getProperty("num_lstm_layers"))

    # --- evaluation ---
    self.evaluation_method = config_file.getProperty("evaluation_method")
    self.shuffle = False
print "config:" for c in config: print str(c) + "\t" + str(config[c]) datafile = config["file"] fp = open(datafile + "_indexMapping", 'rb') sentId2newIndex2oldIndex = pickle.load(fp) fp.close() iterationSeed = -1 if "iterationSeed" in config: iterationSeed = int(config["iterationSeed"]) print "using " + str(iterationSeed) + " as seed for iteration scheme" pretrainedEmbeddings = False if "wordvectors" in config: wordvectorfile = config["wordvectors"] wordvectors, representationsize, words = readWordvectorsNumpy( wordvectorfile, isWord2vec=True) vocabsize = wordvectors.shape[0] pretrainedEmbeddings = True else: print "you have to specify a wordvector file in the config" exit() networkfile = config["net"] lrateOrig = float(config["lrate"]) print "using sgd with learning rate ", lrateOrig batch_size = int(config["batchsize"]) contextsize = int(config["contextsize"]) entitysize = int(config["entitysize"]) myLambda1 = 0 if "lambda1" in config: myLambda1 = float(config["lambda1"]) myLambda2 = 0
def __init__(self, fname):
    """Read the properties file *fname* and expose data files, label sets,
    cached word embeddings and all training/evaluation hyper-parameters
    as attributes."""
    config_file = parsers.read_properties(fname)
    self.config_fname = fname
    prop = config_file.getProperty

    # data files
    self.filename_embeddings = prop("filename_embeddings")
    self.filename_train = prop("filename_train")
    self.filename_test = prop("filename_test")
    self.filename_dev = prop("filename_dev")

    # lists of headIdDoc instances; dev/test are intentionally left empty here
    self.train_id_docs = parsers.readHeadFile(self.filename_train)
    self.dev_id_docs = []
    self.test_id_docs = []

    # label sets computed over the whole collection
    dataset_documents = self.train_id_docs + self.dev_id_docs + self.test_id_docs
    self.dataset_set_characters = utils.getCharsFromDocuments(dataset_documents)
    # BIO tags, e.g. 'B-AE', 'I-AE', 'O', 'B-D', 'I-D'
    self.dataset_set_bio_tags = utils.getEntitiesFromDocuments(dataset_documents)
    # relation labels, e.g. 'Kill'
    self.dataset_set_relations = utils.getRelationsFromDocuments(dataset_documents)
    # relation-ner combinations, e.g. 'Kill__B-Peop', 'Kill__I-Peop'
    self.dataset_set_bio_relation_ners = utils.getRelationNersFromDocuments(dataset_documents)

    cache = self.filename_embeddings + ".pkl"
    if os.path.isfile(cache):
        # loading the cached pickle is faster than re-reading the vectors
        self.wordvectors, self.representationsize, self.words, self.wordindices = joblib.load(cache)
    else:
        self.wordvectors, self.representationsize, self.words, self.wordindices = utils.readWordvectorsNumpy(
            dataset_documents, self.filename_embeddings,
            isBinary=self.filename_embeddings.endswith(".bin"))
        joblib.dump((self.wordvectors, self.representationsize,
                     self.words, self.wordindices), cache)

    # index-map tokens, characters and tags of each split in place
    for split in (self.train_id_docs, self.dev_id_docs, self.test_id_docs):
        parsers.preprocess(split, self.wordindices, self.dataset_set_characters,
                           self.dataset_set_bio_tags, self.dataset_set_relations,
                           self.dataset_set_bio_relation_ners)

    # training
    self.nepochs = int(prop("nepochs"))
    self.optimizer = prop("optimizer")
    self.activation = prop("activation")
    self.learning_rate = float(prop("learning_rate"))
    self.gradientClipping = utils.strToBool(prop("gradientClipping"))
    self.nepoch_no_imprv = int(prop("nepoch_no_imprv"))
    self.use_dropout = utils.strToBool(prop("use_dropout"))
    self.ner1_loss = prop("ner1_loss")
    self.ner2_loss = prop("ner2_loss")
    self.ner_classes = prop("ner_classes")
    self.use_chars = utils.strToBool(prop("use_chars"))
    self.use_adversarial = utils.strToBool(prop("use_adversarial"))

    # hyperparameters
    self.dropout_embedding = float(prop("dropout_embedding"))
    self.dropout_lstm = float(prop("dropout_lstm"))
    self.dropout_lstm_output = float(prop("dropout_lstm_output"))
    self.dropout_fcl_ner = float(prop("dropout_fcl_ner"))
    self.dropout_fcl_rel = float(prop("dropout_fcl_rel"))
    self.hidden_size_lstm = int(prop("hidden_size_lstm"))
    self.hidden_size_n1 = int(prop("hidden_size_n1"))
    self.num_lstm_layers = int(prop("num_lstm_layers"))
    self.num_heads = int(prop("num_heads"))
    self.char_embeddings_size = int(prop("char_embeddings_size"))
    self.hidden_size_char = int(prop("hidden_size_char"))
    self.label_embeddings_size = int(prop("label_embeddings_size"))
    self.attention_size = int(prop("attention_size"))
    self.alpha = float(prop("alpha"))

    # evaluation
    self.evaluation_method = prop("evaluation_method")
    self.root_node = utils.strToBool(prop("root_node"))
    self.shuffle = False
    self.batchsize = 1