def createDics(self, bin_dataframe, pretrained_embeddings):
    # bin_np = bin_dataframe.as_matrix()
    bin_np = bin_dataframe.to_numpy()
    if pretrained_embeddings == False:  # maybe not need this!
        self.word_to_ix["<unk>"] = len(self.word_to_ix)
    # initialize the event dictionary
    self.event_to_ix["non-event"] = len(self.event_to_ix)
    self.event_to_ix["event"] = len(self.event_to_ix)
    # initialize the tags dictionary
    self.tag_to_ix["B-Other"] = len(self.tag_to_ix)
    self.tag_to_ix["I-Other"] = len(self.tag_to_ix)
    for line in bin_np:
        if line[1] is not None:
            if pretrained_embeddings == True:
                continue
            else:
                for word in utils.strToLst(line[1]):
                    if word not in self.word_to_ix:
                        self.word_to_ix[word] = len(self.word_to_ix)
        else:
            tag = utils.strToLst(line[0])['corrected_tags']
            if tag not in self.tag_to_ix:
                self.tag_to_ix[tag] = len(self.tag_to_ix)
    self.BIOset, self.ECset = utils.getSortedTagsFromBIO(self.tag_to_ix)
    self.tag_to_ix = utils.getSegmentationDict(self.BIOset)
    self.ec_to_ix = utils.getSegmentationDict(self.ECset)
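# Illustration (assumption, not from the original code): getSortedTagsFromBIO and
# getSegmentationDict are assumed to turn the collected tag set into contiguous
# index dictionaries, so that tag_to_ix / ec_to_ix map every BIO tag (resp. entity
# class) to an integer id usable as a training target. A minimal sketch of the
# assumed getSegmentationDict behaviour:
def getSegmentationDict_sketch(labels):
    # e.g. ["B-Other", "I-Other", "O"] -> {"B-Other": 0, "I-Other": 1, "O": 2}
    return {label: idx for idx, label in enumerate(labels)}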
def __init__(self, file):
    docNr = -1
    self.head_docs = []
    tokens = headIdDoc("")
    for i in range(file.shape[0]):
        if '#doc' in file[i][0] or i == file.shape[0] - 1:  # append all docs including the last one
            if (i == file.shape[0] - 1):  # append last line
                tokens.append(int(file[i][0]), file[i][1], file[i][2],
                              utils.strToLst(file[i][3]), utils.strToLst(file[i][4]))  # append lines
            if (docNr != -1):
                self.head_docs.append(tokens)
            docNr += 1
            tokens = headIdDoc(file[i][0])
        else:
            tokens.append(int(file[i][0]), file[i][1], file[i][2],
                          utils.strToLst(file[i][3]), utils.strToLst(file[i][4]))  # append lines
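# Illustration (hypothetical rows; the real column semantics come only from the calls
# above): each non-header row supplies an int id, two string fields, and two
# dictionary-like strings parsed with utils.strToLst, while a row whose first column
# contains '#doc' opens a new document, e.g.:
#
#     ['#doc doc_1', '', '', '{}', '{}']
#     ['0', 'tok', 'tag', "{'a': 1}", "{'b': 2}"]
#
# headIdDoc("") is the running accumulator; completed documents are collected in
# self.head_docs.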
def __init__(self, fname):
    config_file = parsers.read_properties(fname)
    # print("\nConfiguration file {} loaded \n".format(fname))
    self.config_fname = fname

    # load data
    self.pretrained_embeddings = utils.strToBool(config_file.getProperty("pretrained_embeddings"))
    self.filename_embeddings = config_file.getProperty("filename_embeddings")
    # print(os.path.basename(self.filename_embeddings))
    name_of_embeddings = ""
    self.embeddings_size = int(config_file.getProperty("embeddings_size"))
    self.word_to_ix = {}
    if self.pretrained_embeddings == True:
        name_of_embeddings = "_" + os.path.basename(self.filename_embeddings)
        if os.path.isfile(self.filename_embeddings + ".pkl") == False:
            self.wordvectors, self.embeddings_size, self.word_to_ix = utils.readWordvectorsNumpy(
                self.filename_embeddings,
                isBinary=True if self.filename_embeddings.endswith(".bin") else False)
            joblib.dump((self.wordvectors, self.embeddings_size, self.word_to_ix),
                        self.filename_embeddings + ".pkl")
        else:
            self.wordvectors, self.embeddings_size, self.word_to_ix = joblib.load(
                self.filename_embeddings + ".pkl")  # loading is faster

    self.filename_train = config_file.getProperty("filename_train")
    self.filename_dev = config_file.getProperty("filename_dev")
    self.filename_test = config_file.getProperty("filename_test")
    self.pad_length = int(config_file.getProperty("pad_length"))
    '''
    train = reader.BinDataset(self.filename_train, isTrain=True, pretrained_embeddings=self.pretrained_embeddings,
                              word_to_ix=self.word_to_ix, pad_length=self.pad_length)
    self.word_to_ix, self.tag_to_ix, self.event_to_ix, self.ec_to_ix = train.getDictionaries()
    dev = reader.BinDataset(self.filename_dev, isTrain=False, word_to_ix=self.word_to_ix, tag_to_ix=self.tag_to_ix,
                            event_to_ix=self.event_to_ix, pad_length=self.pad_length)
    test = reader.BinDataset(self.filename_test, isTrain=False, word_to_ix=self.word_to_ix, tag_to_ix=self.tag_to_ix,
                             event_to_ix=self.event_to_ix, pad_length=self.pad_length)
    '''
    if os.path.isfile(self.filename_train + name_of_embeddings + "_" + str(self.pad_length) + "_tweet.pkl") == False:
        train = reader.BinDataset(self.filename_train, isTrain=True,
                                  pretrained_embeddings=self.pretrained_embeddings,
                                  word_to_ix=self.word_to_ix, pad_length=self.pad_length)
        joblib.dump(train, self.filename_train + name_of_embeddings + "_" + str(self.pad_length) + "_tweet.pkl")
    else:
        train = joblib.load(self.filename_train + name_of_embeddings + "_" + str(self.pad_length) + "_tweet.pkl")  # loading is faster
    self.word_to_ix, self.tag_to_ix, self.event_to_ix, self.ec_to_ix = train.getDictionaries()

    if os.path.isfile(self.filename_dev + name_of_embeddings + "_" + str(self.pad_length) + "_tweet.pkl") == False:
        dev = reader.BinDataset(self.filename_dev, isTrain=False, word_to_ix=self.word_to_ix,
                                tag_to_ix=self.tag_to_ix, event_to_ix=self.event_to_ix, pad_length=self.pad_length)
        joblib.dump(dev, self.filename_dev + name_of_embeddings + "_" + str(self.pad_length) + "_tweet.pkl")
    else:
        dev = joblib.load(self.filename_dev + name_of_embeddings + "_" + str(self.pad_length) + "_tweet.pkl")  # loading is faster

    if os.path.isfile(self.filename_test + name_of_embeddings + "_" + str(self.pad_length) + "_tweet.pkl") == False:
        test = reader.BinDataset(self.filename_test, isTrain=False, word_to_ix=self.word_to_ix,
                                 tag_to_ix=self.tag_to_ix, event_to_ix=self.event_to_ix, pad_length=self.pad_length)
        joblib.dump(test, self.filename_test + name_of_embeddings + "_" + str(self.pad_length) + "_tweet.pkl")
    else:
        test = joblib.load(self.filename_test + name_of_embeddings + "_" + str(self.pad_length) + "_tweet.pkl")  # loading is faster

    print(train)
    self.train_loader = DataLoader(train, batch_size=1, shuffle=False)
    self.dev_loader = DataLoader(dev, batch_size=1, shuffle=False)
    self.test_loader = DataLoader(test, batch_size=1, shuffle=False)
    print()

    # self.dev_id_docs = parsers.readHeadFile(self.filename_dev)
    # self.test_id_docs = parsers.readHeadFile(self.filename_test)
    # get labels for the whole collection
    # dataset_documents = []
    # dataset_documents.extend(self.train_id_docs)
    # dataset_documents.extend(self.dev_id_docs)
    # dataset_documents.extend(self.test_id_docs)
    # self.dataset_set_characters = utils.getCharsFromDocuments(dataset_documents)
    # self.dataset_set_bio_tags, self.dataset_set_ec_tags = utils.getEntitiesFromDocuments(dataset_documents)
    # self.dataset_set_relations = utils.getRelationsFromDocuments(dataset_documents)
    # print(len(self.dataset_set_characters))
    # print(len(self.dataset_set_bio_tags))
    # print((self.dataset_set_characters))
    # print((self.dataset_set_bio_tags))

    # training
    self.nepochs = int(config_file.getProperty("nepochs"))
    self.optimizer = config_file.getProperty("optimizer")
    # self.activation = config_file.getProperty("activation")
    self.learning_rate = float(config_file.getProperty("learning_rate"))
    # self.nepoch_no_imprv = int(config_file.getProperty("nepoch_no_imprv"))
    self.use_dropout = utils.strToBool(config_file.getProperty("use_dropout"))
    self.use_BIO_LSTM = utils.strToBool(config_file.getProperty("use_BIO_LSTM"))
    self.ner_loss = config_file.getProperty("ner_loss")
    self.ner_classes = config_file.getProperty("ner_classes")
    self.bin_features = config_file.getProperty("bin_features").lower()
    self.tweet_representation = config_file.getProperty("tweet_representation").lower()
    self.non_linearity_bin_features = config_file.getProperty("non_linearity_bin_features").lower()
    try:
        self.threshold = float(config_file.getProperty("threshold"))
    except:
        self.threshold = 0

    # hyperparameters
    self.n_filters = int(config_file.getProperty("n_filters"))
    self.filter_sizes = utils.strToLst(config_file.getProperty("filter_sizes"))
    self.batch_norm = utils.strToBool(config_file.getProperty("batch_norm"))
    self.cnn_pool = config_file.getProperty("cnn_pool").lower()
    self.dropout_cnn = float(config_file.getProperty("dropout_cnn"))
    self.bin_representation = config_file.getProperty("bin_representation").lower()
    self.dropout_lstm1_output = float(config_file.getProperty("dropout_lstm1_output"))
    self.dropout_embedding = float(config_file.getProperty("dropout_embedding"))
    # self.dropout_lstm = float(config_file.getProperty("dropout_lstm"))
    self.dropout_lstm2_output = float(config_file.getProperty("dropout_lstm2_output"))
    self.dropout_fcl_ner = float(config_file.getProperty("dropout_fcl_ner"))
    self.dropout_fcl_rel = float(config_file.getProperty("dropout_fcl_rel"))
    # self.hidden_size_lstm = int(config_file.getProperty("hidden_size_lstm"))
    self.hidden_dim = int(config_file.getProperty("hidden_dim"))
    # self.hidden_size_n2 = config_file.getProperty("hidden_size_n2")
    self.num_lstm_layers = int(config_file.getProperty("num_lstm_layers"))
    # self.char_embeddings_size = int(config_file.getProperty("char_embeddings_size"))
    # self.hidden_size_char = int(config_file.getProperty("hidden_size_char"))
    # self.label_embeddings_size = int(config_file.getProperty("label_embeddings_size"))
    # self.alpha = float(config_file.getProperty("alpha"))

    # evaluation
    self.evaluation_method = config_file.getProperty("evaluation_method")
    # self.root_node = bool(config_file.getProperty("root_node"))
    self.shuffle = False
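# Illustration (hypothetical values; the keys are taken from the getProperty calls
# above): the constructor expects a Java-style .properties file read by
# parsers.read_properties, roughly along the lines of:
#
#     pretrained_embeddings = true
#     filename_embeddings   = embeddings/twitter_vectors.bin
#     embeddings_size       = 200
#     filename_train        = data/train.csv
#     filename_dev          = data/dev.csv
#     filename_test         = data/test.csv
#     pad_length            = 30
#     nepochs               = 50
#     optimizer             = adam
#     learning_rate         = 0.001
#
# Every remaining hyperparameter (n_filters, filter_sizes, dropout_*, hidden_dim,
# num_lstm_layers, evaluation_method, ...) is read with the same getProperty pattern.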
def preprocess(self, bin_dataframe):
    # bin_np = bin_dataframe.as_matrix()
    bin_np = bin_dataframe.to_numpy()
    docNr = -1
    bin_tweets = []
    bin_tweet_lengths = []
    bin_tweets_text = []
    previous_match = ""
    match = []
    for i in range(bin_np.shape[0]):
        if bin_np[i][1] is None or i == bin_np.shape[0] - 1:  # append all docs including the last one
            if (i == bin_np.shape[0] - 1):  # append last line
                tweet_text = utils.lstToString(utils.strToLst(bin_np[i][1])).split()
                tweet, tweet_length = utils.prepare_sequence(tweet_text, self.word_to_ix, pad_length=self.pad_length)
                bin_tweets.append(tweet)
                bin_tweet_lengths.append(tweet_length)
                bin_tweets_text.append(tweet_text)
            if (docNr != -1):
                # bin_tweets = np.asarray(bin_tweets)
                try:
                    tag_id = self.tag_to_ix[target]
                    if target.startswith("B-") or target.startswith("I-"):
                        ec_id = self.ec_to_ix[target[2:]]
                    else:
                        ec_id = self.ec_to_ix[target]
                except:
                    # print(target)
                    if target.startswith("B-"):
                        tag_id = self.tag_to_ix["B-Other"]
                    elif target.startswith("I-"):
                        tag_id = self.tag_to_ix["I-Other"]
                    ec_id = self.ec_to_ix["Other"]
                if target == "O":
                    event_duration_idx = self.event_to_ix["non-event"]
                else:
                    event_duration_idx = self.event_to_ix["event"]
                if event_id == -1:
                    independent_event_idx = self.event_to_ix["non-event"]
                else:
                    independent_event_idx = self.event_to_ix["event"]
                # print(len(bin_tweets))
                # print(torch.stack(bin_tweets))
                match.append([torch.stack(bin_tweets), tag_id, ec_id, event_duration_idx,
                              independent_event_idx, event_type, event_id, bin_tweet_lengths])
                # print(utils.getDictionaryKeyByIdx(self.tag_to_ix, tag_id), utils.getDictionaryKeyByIdx(self.ec_to_ix, ec_id), utils.getDictionaryKeyByIdx(self.event_to_ix, event_id))
                # match = np.append(match, bin_tokens)
                # match['match_bins'].append(bin)
            docNr += 1
            if i != bin_np.shape[0] - 1:
                infoDict = utils.strToLst(bin_np[i][0])
                # print('infoDict', infoDict)
                if previous_match != infoDict['doc']:
                    # print(infoDict['doc'])
                    # match = {'match_bins': np.empty((0)), "match_name": infoDict['doc']}
                    previous_match = infoDict['doc']
                    # below two lines should be interchanged i think
                    match = []
                    self.matches.append(match)
                bin_tweets = []
                bin_tweet_lengths = []
                bin_tweets_text = []
                target = infoDict['corrected_tags']
                event_type = infoDict['event_type']
                event_id = infoDict['event_id']
                match_name = infoDict['doc']
                # {'bin': infoDict['bin'], 'targets': infoDict['corrected_tags'], 'tweets': [], 'timestamps': [], 'tokens': ""}
        else:
            # bin['tweets'].append(strToLst(bin_np[i][1]))
            # bin_tokens += " " + lstToString(strToLst(bin_np[i][1]))
            # bin['timestamps'].append(int(bin_np[i][0]))
            # print((lstToString(strToLst(bin_np[i][1])).split()))
            # print(bin_tokens)
            tweet_text = utils.lstToString(utils.strToLst(bin_np[i][1])).split()
            tweet, tweet_length = utils.prepare_sequence(tweet_text, self.word_to_ix, pad_length=self.pad_length)
            bin_tweets.append(tweet)
            bin_tweet_lengths.append(tweet_length)
            bin_tweets_text.append(tweet_text)
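# Illustration (assumption about the helper, not the original implementation):
# utils.prepare_sequence is used above to turn a tokenised tweet into a fixed-length
# index tensor plus its true length, which is what later allows torch.stack(bin_tweets)
# to stack all tweets of a bin into one (num_tweets, pad_length) tensor. A minimal
# sketch under that assumption:
import torch

def prepare_sequence_sketch(tokens, word_to_ix, pad_length):
    # map tokens to indices, falling back to "<unk>" (or index 0 if it is missing), then pad
    ids = [word_to_ix.get(tok, word_to_ix.get("<unk>", 0)) for tok in tokens[:pad_length]]
    true_length = len(ids)
    ids += [0] * (pad_length - true_length)
    return torch.tensor(ids, dtype=torch.long), true_length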