def getTestingSet(self):
    df_test = read_csv_(self.test_file, names=None, delimiter=self.configs.delimiter)
    df_testr = read_csv_(self.testr_file, names=None, delimiter=self.configs.delimiter)
    if len(list(df_test.columns)) == 2:
        df_test.columns = ["token", "label"]
        df_testr.columns = ["token", "label"]
        df_test = df_test[["token"]]
        df_testr = df_testr[["label"]]
    elif len(list(df_test.columns)) == 1:
        df_test.columns = ["token"]
        df_testr.columns = ["token", "label"]
        df_testr = df_testr[["label"]]
    # map tokens/rule labels to ids; NaN rows (sentence delimiters) are mapped to -1
    df_test["token_id"] = df_test.token.map(lambda x: self.mapFunc(x, self.token2id))
    df_testr["label_id"] = df_testr.label.map(lambda x: -1 if str(x) == str(np.nan) else self.rule2id[x])
    df_test["token"] = df_test.token.map(lambda x: -1 if str(x) == str(np.nan) else x)
    X_test_id, y_test_psyduo_label = self.prepare(df_test["token_id"], df_test["token_id"], return_psyduo_label=True)
    X_test_token, _ = self.prepare(df_test["token"], df_test["token"])
    _, r_test = self.prepare(df_test["token_id"], df_testr["label_id"])
    self.logger.info("\ntesting set size: %d\n" % (len(X_test_id)))
    return X_test_id, y_test_psyduo_label, X_test_token, r_test
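# A minimal usage sketch for getTestingSet above (the DataManager name and its
# constructor arguments are hypothetical; only the return tuple is taken from
# the code):
#
#   dm = DataManager(configs, logger)
#   X_test_id, y_pseudo, X_test_token, r_test = dm.getTestingSet()
#   # X_test_id    -> padded token-id matrix for prediction
#   # y_pseudo     -> placeholder labels (no gold labels at test time)
#   # X_test_token -> the raw tokens, aligned with X_test_id
#   # r_test       -> rule-label ids read from self.testr_file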
def getValidingSet(self):
    df_val = read_csv_(self.dev_file, names=["token", "label"], delimiter=self.configs.delimiter)
    df_valr = read_csv_(self.devr_file, names=["token", "label"], delimiter=self.configs.delimiter)
    df_val["token_id"] = df_val.token.map(lambda x: self.mapFunc(x, self.token2id))
    df_val["label_id"] = df_val.label.map(lambda x: -1 if str(x) == str(np.nan) else self.label2id[x])
    df_valr["label_id"] = df_valr.label.map(lambda x: -1 if str(x) == str(np.nan) else self.rule2id[x])
    X_val, y_val = self.prepare(df_val["token_id"], df_val["label_id"])
    _, r_val = self.prepare(df_val["token_id"], df_valr["label_id"])
    return X_val, y_val, r_val
def match_corr_diff(self):
    '''
    Match the corrected df against the predicted df and return every sentence
    that contains at least one corrected label, with all-NaN rows inserted as
    sentence delimiters.
    :param self.output_test_file: the originally predicted csv
    :param self.corr_file: the corrected csv
    '''
    # read predicted and corrected results
    df_pred_ori = read_csv_(self.output_test_file, names=['id', 'token', 'label'], delimiter=self.configs.delimiter)
    df_corr_ori = read_csv_(self.corr_file, names=['id', 'token', 'label'], delimiter=self.configs.delimiter)
    df_pred = df_pred_ori.copy()
    df_corr = df_corr_ori.copy()
    df_pred['index1'] = df_pred.index
    df_corr['index1'] = df_corr.index
    # extract the sentence id (the "id" column without the trailing token index)
    # in order to match whole sentences for retraining
    df_pred['url_sen_id'] = df_pred['id'].map(lambda x: x.rsplit('_', 1)[0] if str(x) != str(np.nan) else x)
    df_corr['url_sen_id'] = df_corr['id'].map(lambda x: x.rsplit('_', 1)[0] if str(x) != str(np.nan) else x)
    # rows whose predicted label differs from the corrected label
    df = df_corr[df_pred.label != df_corr.label]
    # indexes of all rows belonging to a sentence that contains a wrong prediction
    mat_id_sta_indexes = df_corr[df_corr.url_sen_id.isin(df.groupby("url_sen_id").first().index)].index
    df_corr_part = df_corr_ori.iloc[mat_id_sta_indexes].reset_index()[['id', 'token', 'label']]
    df_corr_part['url_sen_id'] = df_corr_part['id'].map(lambda x: x.rsplit('_', 1)[0] if str(x) != str(np.nan) else x)
    # rebuild the frame row by row, inserting an all-NaN row between sentences
    rows_num = len(df_corr_part)
    new_df_dict = {'id': [], 'token': [], 'label': []}
    for i in tqdm(range(rows_num), desc="Inserting sentence delimiter:"):
        # append the current row (including the last one, which the old loop dropped)
        new_df_dict['id'].append(df_corr_part['id'][i])
        new_df_dict['token'].append(df_corr_part['token'][i])
        new_df_dict['label'].append(df_corr_part['label'][i])
        # append a delimiter row whenever the sentence id changes
        if i < rows_num - 1 and df_corr_part['url_sen_id'][i] != df_corr_part['url_sen_id'][i + 1]:
            new_df_dict['id'].append(np.nan)
            new_df_dict['token'].append(np.nan)
            new_df_dict['label'].append(np.nan)
    df_fin = pd.DataFrame.from_dict(new_df_dict)
    return df_fin
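# An illustrative walk-through of match_corr_diff with made-up rows, assuming
# ids of the form "<url>_<sen>_<tok>":
#
#   predicted                  corrected
#   u1_s1_0  w1  B-PER         u1_s1_0  w1  B-ORG   <- label differs
#   u1_s1_1  w2  O             u1_s1_1  w2  O
#   u1_s2_0  w3  O             u1_s2_0  w3  O       <- sentence fully agrees
#
# Only sentence u1_s1 contains a disagreement, so both of its rows are copied
# into the result, followed by one all-NaN delimiter row; u1_s2 is dropped.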
def build_new_token(self):
    df_train = read_csv_(self.corr_file, names=["ord", "token", "label"], delimiter=self.configs.delimiter)
    # the new tokens should be a subset of, or no larger than, the original tokens
    new_tokens = list(set(df_train["token"][df_train["token"].notnull()]))
    if len(set(new_tokens) & set(self.token2id.keys())) < len(new_tokens):
        # there are genuinely new tokens: merge them into the existing vocabulary,
        # keeping PADDING/UNKNOWN out of the enumerated list so they are not
        # assigned duplicate ids below
        tokens = list(set(list(self.token2id.keys()) + new_tokens) - {self.PADDING, self.UNKNOWN})
    else:
        # every token is already known; nothing to rebuild
        return
    token2id = dict(zip(tokens, range(1, len(tokens) + 1)))
    id2token = dict(zip(range(1, len(tokens) + 1), tokens))
    id2token[0] = self.PADDING
    token2id[self.PADDING] = 0
    id2token[len(tokens) + 1] = self.UNKNOWN
    token2id[self.UNKNOWN] = len(tokens) + 1
    with open(self.token2id_file, "w", encoding='utf-8') as outfile:
        for idx in id2token:
            outfile.write(id2token[idx] + "\t" + str(idx) + "\n")
    return token2id
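# The token2id file written above holds one "token<TAB>id" pair per line, with
# id 0 reserved for self.PADDING and id len(tokens) + 1 for self.UNKNOWN. A
# sketch of its layout (placeholder strings; line order follows dict insertion
# order):
#
#   token_a  1
#   token_b  2
#   ...
#   <PAD>    0
#   <UNK>    N+1
#
# so an embedding table rebuilt against it needs len(tokens) + 2 rows.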
def getTrainingSet(self, train_val_ratio=0.9):
    df_train = read_csv_(self.train_file, names=["token", "label"], delimiter=self.configs.delimiter)
    # map the token and label into ids; NaN rows (sentence delimiters) become -1
    df_train["token_id"] = df_train.token.map(lambda x: -1 if str(x) == str(np.nan) else self.token2id[x])
    df_train["label_id"] = df_train.label.map(lambda x: -1 if str(x) == str(np.nan) else self.label2id[x])
    # convert the data into matrices
    X, y = self.prepare_train(df_train["token_id"], df_train["label_id"])
    # shuffle the samples
    num_samples = len(X)
    indexs = np.arange(num_samples)
    np.random.shuffle(indexs)
    X = X[indexs]
    y = y[indexs]
    if self.dev_file is not None:
        X_train = X
        y_train = y
        X_val, y_val = self.getValidingSet()
    else:
        # split the data into train and validation sets
        split = int(num_samples * train_val_ratio)
        X_train = X[:split]
        y_train = y[:split]
        X_val = X[split:]
        y_val = y[split:]
    self.logger.info("\ntraining set size: %d, validating set size: %d\n" % (len(X_train), len(X_val)))
    return X_train, y_train, X_val, y_val
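# Example of the split arithmetic above, assuming 1000 prepared samples and the
# default train_val_ratio=0.9 (when self.dev_file is set, the whole shuffled
# matrix is used for training instead):
#
#   split = int(1000 * 0.9)              # = 900
#   X_train, X_val = X[:split], X[split:]  # 900 / 100 samples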
def getTrainingSet(self, train_val_ratio=0.9):
    df_train = read_csv_(self.train_file, names=["token", "label1", "label2", "label3"], delimiter=self.configs.delimiter)
    # map the token and labels into ids; NaN rows (sentence delimiters) become -1
    df_train["token_id"] = df_train.token.map(lambda x: -1 if str(x) == str(np.nan) else self.token2id[x])
    df_train["label1_id"] = df_train.label1.map(lambda x: -1 if str(x) == str(np.nan) else self.label2id[x])
    df_train["label2_id"] = df_train.label2.map(lambda x: -1 if str(x) == str(np.nan) else self.label2id[x])
    df_train["label3_id"] = df_train.label3.map(lambda x: -1 if str(x) == str(np.nan) else self.label2id[x])
    # convert the data into matrices
    X, y1 = self.prepare(df_train["token_id"], df_train["label1_id"])
    _, y2 = self.prepare(df_train["token_id"], df_train["label2_id"])
    _, y3 = self.prepare(df_train["token_id"], df_train["label3_id"])
    # shuffle the samples
    num_samples = len(X)
    indexs = np.arange(num_samples)
    np.random.shuffle(indexs)
    X = X[indexs]
    y1 = y1[indexs]
    y2 = y2[indexs]
    y3 = y3[indexs]
    if self.dev_file is not None:
        X_train = X
        y_train1 = y1
        y_train2 = y2
        y_train3 = y3
        X_val, y_val1, y_val2, y_val3 = self.getValidingSet()
    else:
        # split the data into train and validation sets
        split = int(num_samples * train_val_ratio)
        X_train = X[:split]
        y_train1 = y1[:split]
        y_train2 = y2[:split]
        y_train3 = y3[:split]
        X_val = X[split:]
        y_val1 = y1[split:]
        y_val2 = y2[split:]
        y_val3 = y3[split:]
    self.logger.info("\ntraining set size: %d, validating set size: %d\n" % (len(X_train), len(X_val)))
    return X_train, y_train1, y_train2, y_train3, X_val, y_val1, y_val2, y_val3
def buildVocab(self, train_path, trainr_path):
    df_train = read_csv_(train_path, names=["token", "label"], delimiter=self.configs.delimiter)
    df_trainr = read_csv_(trainr_path, names=["token", "rule"], delimiter=self.configs.delimiter)
    tokens = list(set(df_train["token"][df_train["token"].notnull()]))
    labels = list(set(df_train["label"][df_train["label"].notnull()]))
    rules = list(set(df_trainr["rule"][df_trainr["rule"].notnull()]))
    rule2id = dict(zip(rules, range(1, len(rules) + 1)))
    token2id = dict(zip(tokens, range(1, len(tokens) + 1)))
    label2id = dict(zip(labels, range(1, len(labels) + 1)))
    id2rule = dict(zip(range(1, len(rules) + 1), rules))
    id2token = dict(zip(range(1, len(tokens) + 1), tokens))
    id2label = dict(zip(range(1, len(labels) + 1), labels))
    id2rule[0] = self.PADDING
    id2token[0] = self.PADDING
    id2label[0] = self.PADDING
    rule2id[self.PADDING] = 0
    token2id[self.PADDING] = 0
    label2id[self.PADDING] = 0
    id2token[len(tokens) + 1] = self.UNKNOWN
    token2id[self.UNKNOWN] = len(tokens) + 1
    self.saveVocab(id2token, id2label, id2rule)
    return token2id, id2token, label2id, id2label, rule2id, id2rule
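# A sketch of the id layout buildVocab produces (illustrative tokens only):
#
#   tokens = ["cat", "dog"]
#   token2id -> {self.PADDING: 0, "cat": 1, "dog": 2, self.UNKNOWN: 3}
#
# Index 0 is reserved for padding in every map; only the token map gets an
# UNKNOWN entry at len(tokens) + 1, since labels and rules outside the training
# file are not expected.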
def getTestingrealY_str(self):
    df_test = read_csv_(self.test_file, names=None, delimiter=self.configs.delimiter)
    if len(list(df_test.columns)) == 2:
        df_test.columns = ["token", "label"]
        df_test = df_test[["label"]]
    elif len(list(df_test.columns)) == 1:
        df_test.columns = ["label"]
    df_test["label_id"] = df_test.label.map(lambda x: self.mapFuncLable(x, self.label2id))
    df_test["label"] = df_test.label.map(lambda x: -1 if str(x) == str(np.nan) else x)
    Y_test_id, y_test_psyduo_label = self.prepare(df_test["label_id"], df_test["label_id"], return_psyduo_label=True)
    Y_test_label, _ = self.prepare(df_test["label"], df_test["label"])
    # self.logger.info("\ntesting set size: %d\n" % (len(Y_test_id)))
    return Y_test_id, y_test_psyduo_label, Y_test_label
def getValidingSet(self): df_val = read_csv_(self.dev_file, names=["token", "label1", "label2", "label3"], delimiter=self.configs.delimiter) df_val["token_id"] = df_val.token.map( lambda x: self.mapFunc(x, self.token2id)) df_val["label1_id"] = df_val.label1.map( lambda x: -1 if str(x) == str(np.nan) else self.label2id[x]) df_val["label2_id"] = df_val.label2.map( lambda x: -1 if str(x) == str(np.nan) else self.label2id[x]) df_val["label3_id"] = df_val.label3.map( lambda x: -1 if str(x) == str(np.nan) else self.label2id[x]) X_val, y_val1 = self.prepare(df_val["token_id"], df_val["label1_id"]) _, y_val2 = self.prepare(df_val["token_id"], df_val["label2_id"]) _, y_val3 = self.prepare(df_val["token_id"], df_val["label3_id"]) return X_val, y_val1, y_val2, y_val3
def getTestingSet(self):
    df_test = read_csv_(self.test_file, names=None, delimiter=self.configs.delimiter)
    if len(list(df_test.columns)) == 3:
        df_test.columns = ["id", "token", "label"]
    elif len(list(df_test.columns)) == 2:
        df_test.columns = ["id", "token"]
    ids = df_test['id'].tolist()
    df_test = df_test[["token"]]
    df_test["token_id"] = df_test.token.map(lambda x: self.mapFunc(x, self.token2id))
    df_test["token"] = df_test.token.map(lambda x: -1 if str(x) == str(np.nan) else x)
    # prepare() is assumed to return (hash_ids, X, y) for this id-carrying
    # variant; the hash ids from the first call duplicate those of the second
    # and are discarded here
    _, X_test_id, y_test_psyduo_label = self.prepare(ids, df_test["token_id"], df_test["token_id"], return_psyduo_label=True)
    hash_ids, X_test_token, _ = self.prepare(ids, df_test["token"], df_test["token"])
    self.logger.info("\ntesting set size: %d\n" % (len(X_test_id)))
    return hash_ids, X_test_id, y_test_psyduo_label, X_test_token
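# A minimal usage sketch for the id-carrying variant above (instance name is
# hypothetical):
#
#   hash_ids, X_id, y_pseudo, X_tok = dm.getTestingSet()
#
# hash_ids keeps the original "id" column aligned with each padded sentence so
# that predictions can be written back next to their source rows.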
def buildVocab(self, train_path):
    df_train = read_csv_(train_path, names=["token", "label1", "label2", "label3"], delimiter=self.configs.delimiter)
    tokens = list(set(df_train["token"][df_train["token"].notnull()]))
    labels1 = list(set(df_train["label1"][df_train["label1"].notnull()]))
    labels2 = list(set(df_train["label2"][df_train["label2"].notnull()]))
    labels3 = list(set(df_train["label3"][df_train["label3"].notnull()]))
    labels = list(set(labels1 + labels2 + labels3))
    token2id = dict(zip(tokens, range(1, len(tokens) + 1)))
    label2id = dict(zip(labels, range(1, len(labels) + 1)))
    id2token = dict(zip(range(1, len(tokens) + 1), tokens))
    id2label = dict(zip(range(1, len(labels) + 1), labels))
    id2token[0] = self.PADDING
    id2label[0] = self.PADDING
    token2id[self.PADDING] = 0
    label2id[self.PADDING] = 0
    id2token[len(tokens) + 1] = self.UNKNOWN
    token2id[self.UNKNOWN] = len(tokens) + 1
    self.saveVocab(id2token, id2label)
    return token2id, id2token, label2id, id2label