def preprocess(labeled_dir, save_path):
    data = DataLoader.get_all_data(labeled_dir)[2]
    result = {}
    for name, content in data.items():
        labeled_content = DataLoader.labeled_text(content)
        for sentences, label in labeled_content.items():
            # Binarize the label: "0" for sentences without annotations, "1" otherwise.
            if len(label) == 0:
                label = "0"
            else:
                label = "1"
            if label not in result:
                result[label] = []
            result[label].append(sentences)
    return result
def transfer2arff_simple(data, tfidfs):
    labeled_data = dataloader.labeled_text(data)
    arff_string = ""
    for sentence in labeled_data.keys():
        if len(sentence.strip()) == 0:
            continue
        features = analysis_single_sentences_simple(sentence, tfidfs)
        sentence_feature = ""
        for feature in features:
            # Flatten nested feature lists into a comma-separated string.
            if isinstance(feature, list):
                for value in feature:
                    sentence_feature += str(value) + ","
            else:
                sentence_feature += str(feature) + ","
        print(sentence_feature)
        # A non-empty label set marks the sentence as a positive example.
        if len(labeled_data[sentence]) > 0:
            arff_string += sentence_feature + "1\n"
        else:
            arff_string += sentence_feature + "0\n"
    # Strip brackets and parentheses left over from stringified feature values.
    for token, replacement in ((" (", ", "), (") [[", ", "), ("], [", ", "),
                               ("]] ", ", "), ("]]", ", "), ("[[", ", "),
                               ("(", ", "), (")", ", ")):
        arff_string = arff_string.replace(token, replacement)
    return arff_string
def tsne_for_plot(training_file, result_file, trusted_file, feature_path):
    import os

    with open(training_file) as f1:
        lines1 = f1.readlines()
    with open(result_file) as f2:
        lines2 = f2.readlines()
    with open(trusted_file) as f3:
        lines3 = f3.readlines()

    labels1 = []
    labels2 = []
    labels3 = []
    features = []
    ids = []
    for i in range(1, len(lines1)):
        id1 = lines1[i].split('\t')[0].replace('.jpg', '')
        id2 = lines2[i].split('\t')[0]
        label1 = lines1[i].split('\t')[-2]
        label2 = lines2[i].split('\t')[-2]
        if id1 != id2:
            print('error', lines1[i], '!=', lines2[i])
        # '是' means "yes" in the annotation files.
        labels1.append(1 if label1 == '是' else 0)
        labels2.append(1 if label2 == '是' else 0)
        features.append(
            DataLoader.get_feature(os.path.join(feature_path, id1 + '.npy')))
        ids.append(id1)

    trusted_num = len(lines3) - 1
    train_num = len(lines1) - 1
    for i in range(1, len(lines3)):
        id3 = lines3[i].split('\t')[0].replace('.jpg', '')
        label3 = lines3[i].split('\t')[-2]
        # Trusted items get labels 2/3 so they can be told apart from training items.
        labels3.append(3 if label3 == '是' else 2)
        features.append(
            DataLoader.get_feature(os.path.join(feature_path, id3 + '.npy')))
        ids.append(id3)

    features = np.array(features)
    tsne = TSNE(verbose=True)
    position = tsne.fit_transform(features)
    np.save('position.npy', position)
    np.save('feature.npy', features)
    np.save('ids.npy', np.array(ids))
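# A minimal plotting sketch for the arrays saved by tsne_for_plot. The label
# lists (labels1/labels3) are local to that function and not saved, so the
# color array below is a hypothetical placeholder that would be replaced by
# the recomputed or separately saved labels.
import numpy as np
import matplotlib.pyplot as plt

position = np.load('position.npy')
ids = np.load('ids.npy')
# Hypothetical: one integer class per point (e.g. labels1 + labels3 from above).
colors = np.zeros(len(ids), dtype=int)
plt.scatter(position[:, 0], position[:, 1], c=colors, cmap='tab10', s=5)
plt.title('t-SNE projection of training and trusted items')
plt.savefig('tsne_plot.png', dpi=200)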
def infer(model, fnImg, f):
    "recognize text in image provided by file path"
    import numpy as np
    img = SamplePreprocessor.preprocess(
        cv2.imread(fnImg, cv2.IMREAD_GRAYSCALE), Model.Model.imgSize)
    print(np.shape(img))
    batch = DataLoader.Batch(None, [img])
    (recognized, probability) = model.inferBatch(batch, True)
    # Save a copy of the input image named after the recognized text.
    out_name = 'new1/check%s' % FilePaths.index + '+%s.png' % recognized[0]
    img1 = cv2.imread(fnImg)
    cv2.imwrite(out_name, img1)
    print('Recognized:', '"' + recognized[0] + '"')
    f.write(recognized[0] + " ")
def transfer2_arff(data):
    labeled_data = dataloader.labeled_text(data)
    arff_string = "@relation summarization\n@attribute num\n@attribute avelen\n" \
                  "@attribute entrys\n@attribute num_noun\n@attribute num_verb\n" \
                  "@attribute tfidf\n@data\n"
    for sentence in labeled_data.keys():
        features = analysis_single_sentences(sentence)
        sentence_feature = ""
        for feature in features:
            # Flatten nested feature lists into a space-separated string.
            if isinstance(feature, list):
                for value in feature:
                    sentence_feature += str(value) + " "
            else:
                sentence_feature += str(feature) + " "
        # Emit one data row per label attached to the sentence.
        for label in labeled_data[sentence]:
            arff_string += sentence_feature + label + "\n"
    return arff_string
def simple_data(data, save_path=Dir.resourceDir + "corpus/combine.txt"):
    result = {}
    save_content = ""
    for name, content in data.items():
        labeled_content = dataloader.labeled_text(content)
        for sentences, label in labeled_content.items():
            # Binarize the label: "0" for unlabeled sentences, "1" otherwise.
            if len(label) == 0:
                label = "0"
            else:
                label = "1"
            if label not in result:
                result[label] = []
            sentences = sentences.strip()
            if sentences == "":
                continue
            result[label].append(sentences)
            save_content += sentences + "\t" + label + "\n"
    # Persist the accumulated "sentence<TAB>tag" rows.
    Tools.write(save_path, save_content)
    return result
def demo_analysis():
    dir_classic = Dir.resourceDir + "已标注文书-txt/基础案例299篇-已标注/"
    print(dir_classic)
    data = dataloader.get_all_data(dir_classic)[1]
    save_dir = Dir.projectDir + "/analysis_result/"
    for label in data.keys():
        print(label)
        result = analsis_single_label_data(data[label])
        content = " num,average,noun_num,verb_num\n"
        content += str(result[:-1]) + "\n"
        # The last element of result is a list of (word, score) pairs.
        for i in range(len(result[-1])):
            content += str(result[-1][i][0]) + "," + str(result[-1][i][1]) + "\n"
        # for word in result[-1]:
        #     content += str(word) + "," + str(result[-1][word]) + "\n"
        # Slashes are not allowed in file names, so replace them in the label.
        if "/" in label:
            label = label.replace("/", "-")
        save_file = save_dir + label + ".txt"
        save_result(content, save_file)
def demo_arff_simple(dir_classic, save_dir):
    print(dir_classic)
    tmp = dataloader.get_all_data(dir_classic)
    data = tmp[2]
    data2 = tmp[1]
    # Build a simple numeric header: 105 feature columns plus the class column.
    arff_title = ""
    for i in range(105):
        arff_title += "attr" + str(i) + ","
    arff_title += "class\n"
    arff_content = ""
    count = 0
    data2 = simple_data(data)
    tfidfs, possibile = build_tfidf(data2)
    for name in data.keys():
        print(count, "/", len(data))
        count += 1
        arff_content += transfer2arff_simple(data[name], tfidfs)
    arff_content = arff_title + arff_content
    Tools.write(save_dir, arff_content)
    print("transfer complete, saved in:", save_dir)
def save_as_txt(path, data):
    # Header and open() call inferred from the __main__ call below and from
    # ResultSaver.save_as_txt usage elsewhere; the write mode and utf-8
    # encoding are assumptions.
    f = open(path, 'w', encoding='utf-8')
    for item in data:
        if item['flag'] == 0:
            # '默认分类' means "default category"; written when flag == 0.
            f.write('\t'.join([
                item['id'], item['frame'], '默认分类', item['label_name'],
                item['other']
            ]) + '\n')
        else:
            # '是'/'否' mean "yes"/"no" in the output format.
            f.write('\t'.join([
                item['id'], item['frame'], item['label_name'],
                '是' if item['flag'] == 1 else '否', item['other']
            ]) + '\n')
    f.close()


def save_as_json(path, data):
    """
    # TODO add some description for this function
    :param path:
    :param data:
    :return:
    """
    # TODO finish the code for this function
    pass


if __name__ == '__main__':
    from src import DataLoader
    flag, dataset = DataLoader.load_from_txt("../data/data1.txt")
    save_as_txt("../result/result1.txt", dataset)
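# A minimal sketch of what save_as_json could look like, assuming `data` is the
# same list of item dicts that save_as_txt receives; the function name and the
# JSON layout are assumptions, not part of the original code.
import json

def save_as_json_sketch(path, data):
    # Dump the item dicts as-is, keeping non-ASCII label values readable.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)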
class DataProcessor:
    def __init__(self):
        self.data = DataLoader()
        self.data.initialize()

    # returns the current date
    def get_current_date(self):
        return self.get_list_of_dates()[0]

    # returns the list of dates in the range (current_date_time, 5_days_later_date_time) with a step of 3 hours
    def get_list_of_dates_and_time(self):
        date_list = []
        for i in range(0, len(self.data.city_info)):
            date_list.append(self.data.city_info["Warszawa"]["list"][i]['dt_txt'])
        return date_list

    def get_list_of_dates(self):
        date_list = []
        p_list = self.get_list_of_dates_and_time()
        previous = ""
        for datetime in p_list:
            d, t = datetime.split()
            if previous == "" or not previous == d:
                date_list.append(d)
            previous = d
        return list(set(date_list))

    # returns dictionary: {'2019-01-26 21:00:00': -6.07, '2019-01-27 00:00:00': -5.53, ...}
    def get_temperature_dict(self, city_name):
        temperature_list = {}
        for i in range(0, len(self.data.city_info)):
            temperature_list[self.data.city_info[city_name]["list"][i]['dt_txt']] = \
                self.data.city_info[city_name]["list"][i]['main']['temp']
        return temperature_list

    def get_temperature_list_for_date(self, date, city_name):
        tmp_list = []
        for k, v in self.get_temperature_dict(city_name).items():
            if date in k:
                tmp_list.append(v)
        return tmp_list

    # returns dictionary mapping 'dt_txt' timestamps to pressure values
    def get_pressure_dict(self, city_name):
        pressure_list = {}
        for i in range(0, len(self.data.city_info)):
            pressure_list[self.data.city_info[city_name]["list"][i]['dt_txt']] = \
                self.data.city_info[city_name]["list"][i]['main']['pressure']
        return pressure_list

    def get_pressure_for_date(self, date, city_name):
        tmp_list = []
        for k, v in self.get_pressure_dict(city_name).items():
            if date in k:
                tmp_list.append(v)
        return "{0:.2f}".format(mean(tmp_list))

    # returns dictionary mapping 'dt_txt' timestamps to sea-level pressure values
    def get_sea_level_dict(self, city_name):
        sea_level_list = {}
        for i in range(0, len(self.data.city_info)):
            sea_level_list[self.data.city_info[city_name]["list"][i]['dt_txt']] = \
                self.data.city_info[city_name]["list"][i]['main']['sea_level']
        return sea_level_list

    def get_sea_level_for_date(self, date, city_name):
        tmp_list = []
        for k, v in self.get_sea_level_dict(city_name).items():
            if date in k:
                tmp_list.append(v)
        return "{0:.2f}".format(mean(tmp_list))

    # returns dictionary: {'2019-01-26 21:00:00': 88, '2019-01-27 00:00:00': 92, '2019-01-27 03:00:00': 92, ...}
    def get_humidity_dict(self, city_name):
        humidity_list = {}
        for i in range(0, len(self.data.city_info)):
            humidity_list[self.data.city_info[city_name]["list"][i]['dt_txt']] = \
                self.data.city_info[city_name]["list"][i]['main']['humidity']
        return humidity_list

    def get_humidity_list_for_date(self, date, city_name):
        tmp_list = []
        for k, v in self.get_humidity_dict(city_name).items():
            if date in k:
                tmp_list.append(v)
        return tmp_list

    # returns dictionary: {'2019-01-26 21:00:00': {'speed': 2.22, 'deg': 126.001}, '2019-01-27 00:00:00': {'speed': 2.07, 'deg': 119.001}, ...}
    def get_wind_dict(self, city_name):
        wind_list = {}
        for i in range(0, len(self.data.city_info)):
            wind_list[self.data.city_info[city_name]["list"][i]['dt_txt']] = \
                self.data.city_info[city_name]["list"][i]['wind']
        return wind_list

    def get_wind_list(self, city_name):
        tmp_list = []
        for k, v in self.get_wind_dict(city_name).items():
            tmp_list.append(v['speed'])
        return tmp_list

    def get_wind_list_for_date(self, date, city_name):
        tmp_list = []
        for k, v in self.get_wind_dict(city_name).items():
            if date in k:
                tmp_list.append(v)
        return tmp_list

    def get_wind_strength_for_date(self, date, city_name):
        tmp_list = []
        for k, v in self.get_wind_dict(city_name).items():
            if date in k:
                tmp_list.append(v['speed'])
        return tmp_list

    # returns dictionary with pairs {'date_time': cloudiness_%}
    def get_cloudiness_dict(self, city_name):
        cloud_list = {}
        for i in range(0, len(self.data.city_info)):
            cloud_list[self.data.city_info[city_name]["list"][i]['dt_txt']] = \
                self.data.city_info[city_name]["list"][i]['clouds']['all']
        return cloud_list

    def get_cloudiness_list_for_date(self, date, city_name):
        tmp_list = []
        for k, v in self.get_cloudiness_dict(city_name).items():
            if date in k:
                tmp_list.append(v)
        return tmp_list

    # returns dictionary mapping 'dt_txt' timestamps to weather descriptions
    def get_weather_description_dict(self, city_name):
        desc_list = {}
        for i in range(0, len(self.data.city_info)):
            desc_list[self.data.city_info[city_name]["list"][i]['dt_txt']] = \
                self.data.city_info[city_name]["list"][i]['weather'][0]['description']
        return desc_list

    def get_weather_list_for_date(self, date, city_name):
        tmp_list = dict()
        for k, v in self.get_weather_description_dict(city_name).items():
            if date in k:
                tmp_list[k] = v
        return tmp_list

    def convert_to_float(self, list_to_convert):
        tmp_list = []
        for v in list(list_to_convert):
            tmp_list.append(float(v))
        return tmp_list

    def get_weather_stat_string_for_date(self, city_name, date):
        max_temp = max(self.convert_to_float(self.get_temperature_list_for_date(date, city_name)))
        min_temp = min(self.convert_to_float(self.get_temperature_list_for_date(date, city_name)))
        avg_temp = mean(self.convert_to_float(self.get_temperature_list_for_date(date, city_name)))
        wind = mean(self.convert_to_float(self.get_wind_strength_for_date(date, city_name)))
        gusts = mean(self.convert_to_float(self.get_wind_strength_for_date(date, city_name)))
        clouds = mean(self.convert_to_float(self.get_cloudiness_list_for_date(date, city_name)))
        a = ""
        a += "Date: " + date + "\n"
        a += "Maximum temperature: " + "{0:.1f}".format(max_temp) + "°C\n"
        a += "Minimum temperature: " + "{0:.1f}".format(min_temp) + "°C\n"
        a += "Average temperature: " + "{0:.1f}".format(avg_temp) + "°C\n"
        a += "Wind: " + "{0:.2f}".format(wind) + "m/s\n"
        a += "Gusts of wind: " + "{0:.2f}".format(gusts) + "m/s\n"
        a += "Cloudiness: " + "{0:.1f}".format(clouds) + "%\n"
        return a

    def get_xticks(self):
        tab = []
        for val in self.get_list_of_dates():
            k = val.split('-')
            # Format each tick label as "DD.MM".
            k = k[2] + "." + k[1]
            tab.append(k)
            # Add seven empty labels after each date, one per remaining 3-hour forecast step.
            for _ in range(7):
                tab.append("")
        return tab

    def get_additional_info_string(self, city_name, date):
        a = ""
        a += "Pressure: " + str(self.get_pressure_for_date(date, city_name)) + "hPa\n"
        a += "Sea level: " + str(self.get_sea_level_for_date(date, city_name)) + "hPa\n"
        a += "City info:\n"
        a += " City: " + str(self.data.city_info[city_name]['city']['name']) + "\n"
        a += " Country: " + str(self.data.city_info[city_name]['city']['country']) + "\n"
        a += " Latitude: " + str(self.data.city_info[city_name]['city']['coord']['lat']) + "\n"
        a += " Longitude: " + str(self.data.city_info[city_name]['city']['coord']['lon']) + "\n"
        return a

    def get_weather_desc_string_for_date(self, city_name, date):
        weather = ""
        for k, v in self.get_weather_list_for_date(date, city_name).items():
            # k is 'YYYY-MM-DD HH:MM:SS'; keep only the time part.
            weather += k.split()[1] + ": " + v + "\n"
        return weather
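# A small usage sketch for DataProcessor, assuming DataLoader.initialize() has
# populated city_info with a 5-day/3-hour forecast that includes "Warszawa"
# (the city name already referenced above); the city choice is illustrative.
if __name__ == '__main__':
    processor = DataProcessor()
    today = processor.get_current_date()
    print(processor.get_weather_stat_string_for_date("Warszawa", today))
    print(processor.get_additional_info_string("Warszawa", today))
    print(processor.get_weather_desc_string_for_date("Warszawa", today))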
def run_duti(dataset_path, trusted_item_path, feature_dir, result_path):
    """
    Run the duti algorithm to correct the labels of training items.

    :param dataset_path: the path of the training dataset.
    :param trusted_item_path: the path of the trusted items.
    :param feature_dir: the directory that contains the per-item feature .npy files.
    :param result_path: the path to save the result.
    :return: this function returns nothing.
    """
    # load data from a specified txt file.
    print('Loading dataset...')
    flag, dataset = DataLoader.load_from_txt(dataset_path)
    if flag != 'Train':
        print('warning: the ' + dataset_path + ' is not a train file.')
    print('Number of training data:', len(dataset))

    print('Loading trusted items...')
    flag, trusted_items = DataLoader.load_from_txt(trusted_item_path)
    if flag != 'Trusted':
        print('warning: the ' + trusted_item_path + ' is not a trusted file.')
    print('Number of trusted items:', len(trusted_items))

    print('Merging dataset and trusted items...')
    data, labelnames, flag = DataLoader.merge_dataset_and_trusted_items(
        dataset, trusted_items)

    # use duti to fix each dataset
    if flag:
        num_class = len(labelnames)
        duti = GreedyDUTI(num_class=num_class, max_iter=20)
        # init input for the duti algorithm
        print('Loading features for training data...')
        feature = [
            DataLoader.get_feature(os.path.join(feature_dir, id + '.npy'))
            for id in data['ids']
        ]
        print('Loading features for trusted items...')
        trusted_feature = [
            DataLoader.get_feature(os.path.join(feature_dir, id + '.npy'))
            for id in data['trusted_ids']
        ]
        confidence = np.ones_like(data['trusted_labels'])
        np.savez(dataset_path.replace('Train', 'Compressed').replace('.txt', '.npz'),
                 feature=np.array(feature),
                 label=np.array(data['labels']),
                 trusted_feature=np.array(trusted_feature),
                 trusted_label=np.array(data['trusted_labels']))
        print('Running the duti algorithm...')
        bugs, delta, rankings = duti.fit_transform(
            np.array(feature), np.array(data['labels']),
            np.array(trusted_feature), np.array(data['trusted_labels']),
            confidence)
        # Replace the labels flagged as bugs with the classes suggested by duti.
        y_debug = np.array(data['labels'], copy=True)
        clean_bug_y = np.argmax(delta[bugs, :], axis=1)
        y_debug[bugs] = clean_bug_y
        np.savez(result_path.replace('Result', 'Compressed').replace('.txt', '.npz'),
                 y_debug=np.array(y_debug),
                 delta=np.array(delta),
                 rankings=np.array(rankings))
        result = []
        for i, id in enumerate(data['ids']):
            if id in data['trusted_ids']:
                result.append({
                    'id': id,
                    'frame': data['frames'][i],
                    'label_name': labelnames[data['trusted_labels'][i]],
                    'flag': 0,
                    'other': data['others'][i]
                })
            else:
                result.append({
                    'id': id,
                    'frame': data['frames'][i],
                    'label_name': labelnames[y_debug[i]],
                    'flag': 0,
                    'other': data['others'][i]
                })
        print('Saving the result...')
        ResultSaver.save_as_txt(result_path, result)
    else:
        num_class = 2
        duti = GreedyDUTI(num_class=num_class, max_iter=20, method='decision tree')
        result = []
        for index, dataset in enumerate(data):
            # init input for the duti algorithm
            print('Loading features for training data...')
            feature = [
                DataLoader.get_feature(os.path.join(feature_dir, id + '.npy'))
                for id in dataset['ids']
            ]
            print('Loading features for trusted items...')
            trusted_feature = [
                DataLoader.get_feature(os.path.join(feature_dir, id + '.npy'))
                for id in dataset['trusted_ids']
            ]
            confidence = np.ones_like(dataset['trusted_labels'])
            np.savez(dataset_path.replace('Train', 'Compressed').replace(
                         '.txt', str(index) + '.npz'),
                     feature=np.array(feature),
                     label=np.array(dataset['labels']),
                     trusted_feature=np.array(trusted_feature),
                     trusted_label=np.array(dataset['trusted_labels']))
            print('Running the duti algorithm...')
            bugs, delta, rankings = duti.fit_transform(
                np.array(feature),
                np.array(dataset['labels']),
                np.array(trusted_feature), np.array(dataset['trusted_labels']),
                confidence)
            print('Corrected', bugs.sum(), 'bugs')
            y_debug = np.array(dataset['labels'], copy=True)
            clean_bug_y = np.argmax(delta[bugs, :], axis=1)
            y_debug[bugs] = clean_bug_y
            np.savez(result_path.replace('Result', 'Compressed').replace(
                         '.txt', str(index) + '.npz'),
                     y_debug=np.array(y_debug),
                     delta=np.array(delta),
                     rankings=np.array(rankings))
            # for bug in range(len(bugs)):
            #     if bugs[bug]:
            #         print(dataset['ids'][bug], dataset['labels'][bug], 'to', y_debug[bug])
            for i, id in enumerate(dataset['ids']):
                if id in dataset['trusted_ids']:
                    result.append({
                        'id': id,
                        'frame': dataset['frames'][i],
                        'label_name': labelnames[index],
                        'flag': dataset['trusted_labels'][dataset['trusted_ids'].index(id)] + 1,
                        'other': dataset['others'][i]
                    })
                else:
                    result.append({
                        'id': id,
                        'frame': dataset['frames'][i],
                        'label_name': labelnames[index],
                        'flag': y_debug[i] + 1,
                        'other': dataset['others'][i]
                    })
        print('Saving the result...')
        ResultSaver.save_as_txt(result_path, result)
    print('===finished correcting the dataset:', dataset_path, 'by',
          trusted_item_path, 'and saved the result in', result_path, '===')
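# A hypothetical invocation of run_duti; the file names below are placeholders.
# The training and result file names should contain 'Train' and 'Result'
# respectively, since the compressed .npz paths are derived by replacing those
# substrings.
if __name__ == '__main__':
    run_duti(dataset_path='../data/Train1.txt',
             trusted_item_path='../data/Trusted1.txt',
             feature_dir='../data/features',
             result_path='../result/Result1.txt')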
import os

import tensorflow as tf

from src.Constants import *
# DataLoader and network are assumed to be provided by the surrounding project.


def get_loss(y_pred, y_true):
    # Calculate the loss from digits being incorrect. Don't count loss from
    # digits that are in non-present plates.
    loss = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=tf.reshape(y_pred, [-1, CLASSES]),
        labels=tf.reshape(y_true, [-1, CLASSES]))
    loss = tf.reshape(loss, [-1, TOTAL_CHARS])
    loss = tf.reduce_sum(loss)
    return loss


dataDirectory = os.getcwd() + '\\..\\data'
dataLoader = DataLoader(dataDirectory)
training_data, validation_data, test_data = dataLoader.load_data()

# Split the training set into mini-batches of MINI_BATCH_SIZE examples.
mini_batchesX = [training_data[0][k:k + MINI_BATCH_SIZE]
                 for k in range(0, len(training_data[0]), MINI_BATCH_SIZE)]
mini_batchesY = [training_data[1][k:k + MINI_BATCH_SIZE]
                 for k in range(0, len(training_data[1]), MINI_BATCH_SIZE)]

x, y_pred, params = network.get_network()
y_true = tf.placeholder(tf.float32, [None, TOTAL_CHARS * CLASSES])
digits_loss = get_loss(y_pred, y_true)
train_step = tf.train.AdamOptimizer(1e-4).minimize(digits_loss)
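# A minimal training-loop sketch for the graph built above, using the TF 1.x
# session API that the placeholders and AdamOptimizer imply; NUM_EPOCHS is a
# hypothetical hyperparameter, not a project constant.
NUM_EPOCHS = 10

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(NUM_EPOCHS):
        epoch_loss = 0.0
        # Feed each mini-batch through the loss and apply one optimizer step.
        for batch_x, batch_y in zip(mini_batchesX, mini_batchesY):
            _, batch_loss = sess.run([train_step, digits_loss],
                                     feed_dict={x: batch_x, y_true: batch_y})
            epoch_loss += batch_loss
        print('epoch', epoch, 'loss', epoch_loss)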
    def load_data(self):
        # Load the training samples and their labels via the project's DataLoader.
        data_loader = DataLoader()
        train_data, label_data = data_loader.load_train_data()
        return train_data, label_data