Code example #1
def preprocess(labeled_dir, save_path):
    data = DataLoader.get_all_data(labeled_dir)[2]
    result = {}
    for name, content in data.items():
        labeled_content = DataLoader.labeled_text(content)
        for sentences, label in labeled_content.items():
            # Normalize the label: an empty label list means "not a summary
            # sentence". Use string labels for both classes consistently.
            label = "0" if len(label) == 0 else "1"
            if label not in result:
                result[label] = []
            result[label].append(sentences)
    return result
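
A hypothetical usage sketch (both paths are made up) that buckets sentences by label and prints the class balance:

result = preprocess("resources/labeled/", "resources/combined.txt")
for label, sentences in result.items():
    print("label", label, ":", len(sentences), "sentences")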
Code example #2
File: Analysis.py Project: panzovor/LawSummarizor
def transfer2arff_simple(data, tfidfs):
    labeled_data = dataloader.labeled_text(data)
    arff_string = ""
    for sentence in labeled_data.keys():
        if len(sentence.strip()) == 0:
            continue
        features = analysis_single_sentences_simple(sentence, tfidfs)
        sentence_feature = ""
        for feature in features:
            # Flatten nested feature lists into a comma-separated string.
            if isinstance(feature, list):
                for value in feature:
                    sentence_feature += str(value) + ","
            else:
                sentence_feature += str(feature) + ","
        print(sentence_feature)
        # A non-empty label list marks the sentence as part of the summary.
        if len(labeled_data[sentence]) > 0:
            arff_string += sentence_feature + "1\n"
        else:
            arff_string += sentence_feature + "0\n"

    # Strip the tuple/list punctuation that str()-formatted features leave behind.
    for old in (" (", ") [[", "], [", "]] ", "]]", "[[", "(", ")"):
        arff_string = arff_string.replace(old, ", ")
    return arff_string
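
The string surgery at the end of transfer2arff_simple exists only because nested feature lists are formatted with str(); a sketch of an alternative that flattens the features first (flatten_features is a hypothetical helper, assuming features are scalars or flat lists of scalars):

def flatten_features(features):
    # Yield scalar values from a mix of scalars and flat lists.
    for feature in features:
        if isinstance(feature, list):
            for value in feature:
                yield value
        else:
            yield feature

# e.g. ",".join(str(v) for v in flatten_features([1, [2.5, 3], 4])) == "1,2.5,3,4"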
Code example #3
File: Helper.py Project: Lcorvle/DadaDebugging
def tsne_for_plot(training_file, result_file, trusted_file, feature_path):
    import os
    with open(training_file) as f1:
        lines1 = f1.readlines()
    with open(result_file) as f2:
        lines2 = f2.readlines()
    with open(trusted_file) as f3:
        lines3 = f3.readlines()
    labels1 = []
    labels2 = []
    labels3 = []
    features = []
    ids = []
    for i in range(1, len(lines1)):
        id1 = lines1[i].split('\t')[0].replace('.jpg', '')
        id2 = lines2[i].split('\t')[0]
        label1 = lines1[i].split('\t')[-2]
        label2 = lines2[i].split('\t')[-2]
        if id1 != id2:
            print('error', lines1[i], '!=', lines2[i])
        # '是' means "yes"; encode training/result labels as 1/0.
        labels1.append(1 if label1 == '是' else 0)
        labels2.append(1 if label2 == '是' else 0)
        features.append(
            DataLoader.get_feature(os.path.join(feature_path, id1 + '.npy')))
        ids.append(id1)
    trusted_num = len(lines3) - 1
    train_num = len(lines1) - 1
    for i in range(1, len(lines3)):
        id3 = lines3[i].split('\t')[0].replace('.jpg', '')
        label3 = lines3[i].split('\t')[-2]
        # Trusted items get distinct values (2/3) so they stand out in a plot.
        labels3.append(3 if label3 == '是' else 2)
        features.append(
            DataLoader.get_feature(os.path.join(feature_path, id3 + '.npy')))
        ids.append(id3)

    features = np.array(features)
    tsne = TSNE(verbose=True)
    position = tsne.fit_transform(features)
    np.save('position.npy', position)
    np.save('feature.npy', features)
    np.save('ids.npy', np.array(ids))
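
tsne_for_plot persists the 2-D embedding instead of plotting it; a minimal plotting sketch over the saved arrays, assuming matplotlib is installed (the label lists are not saved, so the points are drawn unlabeled):

import numpy as np
import matplotlib.pyplot as plt

position = np.load('position.npy')
plt.scatter(position[:, 0], position[:, 1], s=4)
plt.title('t-SNE projection of image features')
plt.savefig('tsne.png')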
Code example #4
def infer(model, fnImg, f):
    "recognize text in image provided by file path"
    import numpy as np
    img = SamplePreprocessor.preprocess(cv2.imread(fnImg, cv2.IMREAD_GRAYSCALE), Model.Model.imgSize)
    print(np.shape(img))
    batch = DataLoader.Batch(None, [img])
    (recognized, probability) = model.inferBatch(batch, True)
    # Save a copy of the input image named after the recognized text.
    out_name = 'new1/check%s' % FilePaths.index + '+%s.png' % recognized[0]
    img1 = cv2.imread(fnImg)
    cv2.imwrite(out_name, img1)
    print('Recognized:', '"' + recognized[0] + '"')
    f.write(recognized[0] + " ")
Code example #5
File: Analysis.py Project: panzovor/LawSummarizor
def transfer2_arff(data):
    labeled_data = dataloader.labeled_text(data)
    # A well-formed ARFF header: @relation, one @attribute per feature, then @data.
    arff_string = "@relation summarization\n@attribute num numeric\n" \
                  "@attribute avelen numeric\n@attribute entrys numeric\n" \
                  "@attribute num_noun numeric\n@attribute num_verb numeric\n" \
                  "@attribute tfidf numeric\n@data\n"
    for sentence in labeled_data.keys():
        features = analysis_single_sentences(sentence)
        sentence_feature = ""
        for feature in features:
            # Flatten nested feature lists into the feature string.
            if isinstance(feature, list):
                for value in feature:
                    sentence_feature += str(value) + " "
            else:
                sentence_feature += str(feature) + " "
        for label in labeled_data[sentence]:
            # Write the feature vector (not the raw sentence) followed by its label.
            arff_string += sentence_feature + label + "\n"
    return arff_string
Code example #6
File: Analysis.py Project: panzovor/LawSummarizor
def simple_data(data, save_path=Dir.resourceDir + "corpus/combine.txt"):
    result = {}
    save_content = ""
    for name, content in data.items():
        labeled_content = dataloader.labeled_text(content)
        for sentences, label in labeled_content.items():
            # Normalize the label: an empty label list means "not a summary sentence".
            label = "0" if len(label) == 0 else "1"
            if label not in result:
                result[label] = []
            sentences = sentences.strip()
            if sentences == "":
                continue
            result[label].append(sentences)
            save_content += sentences + "\t" + label + "\n"
    # NOTE: save_content is accumulated above but never passed to this call.
    Tools.saveIntoCsv(save_path, "sentence", "tag")
    return result
Code example #7
File: Analysis.py Project: panzovor/LawSummarizor
def demo_analysis():
    dir_classic = Dir.resourceDir + "已标注文书-txt/基础案例299篇-已标注/"
    print(dir_classic)
    data = dataloader.get_all_data(dir_classic)[1]
    save_dir = Dir.projectDir + "/analysis_result/"
    for label in data.keys():
        print(label)
        result = analsis_single_label_data(data[label])
        content = " num,average,noun_num,verb_num\n"
        content += str(result[:-1]) + "\n"
        # result[-1] holds (word, count) pairs; write one pair per line.
        for i in range(len(result[-1])):
            content += str(result[-1][i][0]) + "," + str(result[-1][i][1]) + "\n"
        # Slashes are not allowed in file names.
        if "/" in label:
            label = label.replace("/", "-")
        save_file = save_dir + label + ".txt"
        save_result(content, save_file)
Code example #8
File: Analysis.py Project: panzovor/LawSummarizor
def demo_arff_simple(dir_classic, save_dir):
    print(dir_classic)
    tmp = dataloader.get_all_data(dir_classic)
    data = tmp[2]
    # 105 feature columns plus the class column.
    arff_title = ""
    for i in range(105):
        arff_title += "attr" + str(i) + ","
    arff_title += "class\n"
    arff_content = ""
    count = 0
    data2 = simple_data(data)
    tfidfs, possible = build_tfidf(data2)
    for name in data.keys():
        print(count, "/", len(data))
        count += 1
        arff_content += transfer2arff_simple(data[name], tfidfs)
    arff_content = arff_title + arff_content
    Tools.write(save_dir, arff_content)
    print("transfer complete, saved in:", save_dir)
Code example #9
def save_as_txt(path, data):
    # Signature inferred from the save_as_txt(path, data) call sites below and in run_duti.
    f = open(path, 'w', encoding='utf-8')
    for item in data:
        if item['flag'] == 0:
            # '默认分类' means "default category".
            f.write('\t'.join([
                item['id'], item['frame'], '默认分类', item['label_name'],
                item['other']
            ]) + '\n')
        else:
            # '是' / '否' mean yes / no.
            f.write('\t'.join([
                item['id'], item['frame'], item['label_name'],
                '是' if item['flag'] == 1 else '否', item['other']
            ]) + '\n')
    f.close()


def save_as_json(path, data):
    """
    # TODO add some description for this function
    :param path:
    :param data:
    :return:
    """
    # TODO finish the code for this function
    pass


if __name__ == '__main__':
    from src import DataLoader
    flag, dataset = DataLoader.load_from_txt("../data/data1.txt")
    save_as_txt("../result/result1.txt", dataset)
Code example #10
def __init__(self):
    self.data = DataLoader()
    self.data.initialize()
Code example #11
from statistics import mean  # needed by the mean() calls below


class DataProcessor:

    def __init__(self):
        self.data = DataLoader()
        self.data.initialize()

    # Returns the current (first) date in the forecast.
    def get_current_date(self):
        return self.get_list_of_dates()[0]

    # Returns the list of date-time strings in the forecast range
    # (now to 5 days ahead, one entry per 3 hours). The city is hard-coded.
    def get_list_of_dates_and_time(self):
        date_list = []
        # Iterate over the city's forecast entries, not over the city dict itself.
        for entry in self.data.city_info["Warszawa"]["list"]:
            date_list.append(entry['dt_txt'])
        return date_list

    def get_list_of_dates(self):
        date_list = []
        previous = ""
        for datetime in self.get_list_of_dates_and_time():
            d, t = datetime.split()
            # Entries are chronological, so comparing against the previous date
            # deduplicates while preserving order (set() would scramble it and
            # break get_current_date above).
            if previous != d:
                date_list.append(d)
                previous = d
        return date_list


    # Returns a dictionary such as {'2019-01-26 21:00:00': -6.07, '2019-01-27 00:00:00': -5.53, ...}
    def get_temperature_dict(self, city_name):
        temperature_dict = {}
        for entry in self.data.city_info[city_name]["list"]:
            temperature_dict[entry['dt_txt']] = entry['main']['temp']
        return temperature_dict

    def get_temperature_list_for_date(self, date, city_name):
        tmp_list = []
        for k, v in self.get_temperature_dict(city_name).items():
            if date in k:
                tmp_list.append(v)
        return tmp_list

    # Returns a dictionary of {'date_time': pressure_hPa} pairs.
    def get_pressure_dict(self, city_name):
        pressure_dict = {}
        for entry in self.data.city_info[city_name]["list"]:
            pressure_dict[entry['dt_txt']] = entry['main']['pressure']
        return pressure_dict

    def get_pressure_for_date(self, date, city_name):
        tmp_list = []
        for k, v in self.get_pressure_dict(city_name).items():
            if date in k:
                tmp_list.append(v)
        return "{0:.2f}".format(mean(tmp_list))

    # Returns a dictionary of {'date_time': sea_level_pressure_hPa} pairs.
    def get_sea_level_dict(self, city_name):
        sea_level_dict = {}
        for entry in self.data.city_info[city_name]["list"]:
            sea_level_dict[entry['dt_txt']] = entry['main']['sea_level']
        return sea_level_dict

    def get_sea_level_for_date(self, date, city_name):
        tmp_list = []
        for k, v in self.get_sea_level_dict(city_name).items():
            if date in k:
                tmp_list.append(v)
        return "{0:.2f}".format(mean(tmp_list))

    # Returns a dictionary such as {'2019-01-26 21:00:00': 88, '2019-01-27 00:00:00': 92, ...}
    def get_humidity_dict(self, city_name):
        humidity_dict = {}
        for entry in self.data.city_info[city_name]["list"]:
            humidity_dict[entry['dt_txt']] = entry['main']['humidity']
        return humidity_dict

    def get_humidity_list_for_date(self, date, city_name):
        tmp_list = []
        for k, v in self.get_humidity_dict(city_name).items():
            if date in k:
                tmp_list.append(v)
        return tmp_list


    # Returns a dictionary such as {'2019-01-26 21:00:00': {'speed': 2.22, 'deg': 126.001}, ...}
    def get_wind_dict(self, city_name):
        wind_dict = {}
        for entry in self.data.city_info[city_name]["list"]:
            wind_dict[entry['dt_txt']] = entry['wind']
        return wind_dict

    def get_wind_list(self, city_name):
        # Wind speeds only, across all forecast entries.
        return [v['speed'] for v in self.get_wind_dict(city_name).values()]

    def get_wind_list_for_date(self, date, city_name):
        tmp_list = []
        for k, v in self.get_wind_dict(city_name).items():
            if date in k:
                tmp_list.append(v)
        return tmp_list

    def get_wind_strength_for_date(self, date, city_name):
        tmp_list = []
        for k, v in self.get_wind_dict(city_name).items():
            if date in k:
                tmp_list.append(v['speed'])
        return tmp_list

    # Returns a dictionary of {'date_time': cloudiness_percent} pairs.
    def get_cloudiness_dict(self, city_name):
        cloud_dict = {}
        for entry in self.data.city_info[city_name]["list"]:
            cloud_dict[entry['dt_txt']] = entry['clouds']['all']
        return cloud_dict

    def get_cloudiness_list_for_date(self, date, city_name):
        tmp_list = []
        for k, v in self.get_cloudiness_dict(city_name).items():
            if date in k:
                tmp_list.append(v)
        return tmp_list

    # Returns a dictionary of {'date_time': weather_description} pairs.
    def get_weather_description_dict(self, city_name):
        desc_dict = {}
        for entry in self.data.city_info[city_name]["list"]:
            desc_dict[entry['dt_txt']] = entry['weather'][0]['description']
        return desc_dict

    def get_weather_list_for_date(self, date, city_name):
        # Returns a dict keyed by date-time, despite the "list" in the name.
        weather_dict = dict()
        for k, v in self.get_weather_description_dict(city_name).items():
            if date in k:
                weather_dict[k] = v
        return weather_dict

    def convert_to_float(self, list_to_convert):
        tmp_list = []
        for v in list(list_to_convert):
            tmp_list.append(float(v))
        return tmp_list

    def get_weather_stat_string_for_date(self, city_name, date):
        temps = self.convert_to_float(self.get_temperature_list_for_date(date, city_name))
        max_temp = max(temps)
        min_temp = min(temps)
        avg_temp = mean(temps)
        wind = mean(self.convert_to_float(self.get_wind_strength_for_date(date, city_name)))
        # No gust field is read anywhere in this class, so gusts reuse the wind speed.
        gusts = wind
        clouds = mean(self.convert_to_float(self.get_cloudiness_list_for_date(date, city_name)))

        a = ""
        a += "Date: " + date + "\n"
        a += "Maximum temperature:   " + "{0:.1f}".format(max_temp) + "°C\n"
        a += "Minimum temperature:    " + "{0:.1f}".format(min_temp) + "°C\n"
        a += "Average temperature:    " + "{0:.1f}".format(avg_temp) + "°C\n"
        a += "Wind:                               " + "{0:.2f}".format(wind) + "m/s\n"
        a += "Gusts of wind:                 " + "{0:.2f}".format(gusts) + "m/s\n"
        a += "Cloudiness:                      " + "{0:.1f}".format(clouds) + "%\n"

        return a

    def get_xticks(self):
        tab = []
        for val in self.get_list_of_dates():
            k = val.split('-')
            # One "dd.mm" label per day, followed by seven blanks to cover
            # the remaining 3-hour readings of that day.
            tab.append(k[2] + "." + k[1])
            tab.extend([""] * 7)
        return tab

    def get_additional_info_string(self, city_name, date):
        a = ""
        a += "Pressure:         " + str(self.get_pressure_for_date(date, city_name)) + "hPa\n"
        a += "Sea level:        " + str(self.get_sea_level_for_date(date, city_name)) + "hPa\n"
        a += "City info:\n"
        a += "     City:            " + str(self.data.city_info[city_name]['city']['name']) + "\n"
        a += "     Country:      " + str(self.data.city_info[city_name]['city']['country']) + "\n"
        a += "     Latitude:      " + str(self.data.city_info[city_name]['city']['coord']['lat']) + "\n"
        a += "     Longtitude:  " + str(self.data.city_info[city_name]['city']['coord']['lon']) + "\n"
        return a

    def get_weather_desc_string_for_date(self, city_name, date):
        weather = ""
        for k, v in self.get_weather_list_for_date(date, city_name).items():
            weather += k.split()[1] + ": " + v + "\n"
        return weather
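
A hypothetical usage sketch, assuming DataLoader.initialize() fills city_info with OpenWeatherMap-style 5-day/3-hour forecast JSON per city:

processor = DataProcessor()
today = processor.get_current_date()
print(processor.get_weather_stat_string_for_date("Warszawa", today))
print(processor.get_additional_info_string("Warszawa", today))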
Code example #12
File: Main.py Project: Lcorvle/DadaDebugging
def run_duti(dataset_path, trusted_item_path, feature_dir, result_path):
    """
    Run the duti algorithm to correct the label of training items.
    :param dataset_path: the path of the training dataset.
    :param trusted_item_path: the path of the trusted items.
    :param result_path: the path to save the result.
    :return: this function return nothing.
    """
    # load data from a specified txt file.
    print('Loading dataset...')
    flag, dataset = DataLoader.load_from_txt(dataset_path)
    if flag != 'Train':
        print('warning: the ' + dataset_path + ' is not a train file.')
    print('Number of training data:', len(dataset))
    print('Loading trusted items...')
    flag, trusted_items = DataLoader.load_from_txt(trusted_item_path)
    if flag != 'Trusted':
        print('warning: the ' + trusted_item_path + ' is not a trusted file.')
    print('Number of trusted items:', len(trusted_items))
    print('Merging dataset and trusted items...')
    data, labelnames, flag = DataLoader.merge_dataset_and_trusted_items(
        dataset, trusted_items)

    # use duti to fix each dataset
    if flag:
        num_class = len(labelnames)
        duti = GreedyDUTI(num_class=num_class, max_iter=20)

        # init input for the duti algorithm
        print('Loading features for training data...')
        feature = [
            DataLoader.get_feature(os.path.join(feature_dir, id + '.npy'))
            for id in data['ids']
        ]
        print('Loading features for trusted items...')
        trusted_feature = [
            DataLoader.get_feature(os.path.join(feature_dir, id + '.npy'))
            for id in data['trusted_ids']
        ]
        confidence = np.ones_like(data['trusted_labels'])
        np.savez(dataset_path.replace('Train',
                                      'Compressed').replace('.txt', '.npz'),
                 feature=np.array(feature),
                 label=np.array(data['labels']),
                 trusted_feature=np.array(trusted_feature),
                 trusted_label=np.array(data['trusted_labels']))
        print('Running the duti algorithm...')
        bugs, delta, rankings = duti.fit_transform(
            np.array(feature), np.array(data['labels']),
            np.array(trusted_feature), np.array(data['trusted_labels']),
            confidence)
        y_debug = np.array(data['labels'], copy=True)
        clean_bug_y = np.argmax(delta[bugs, :], axis=1)
        y_debug[bugs] = clean_bug_y
        np.savez(result_path.replace('Result',
                                     'Compressed').replace('.txt', '.npz'),
                 y_debug=np.array(y_debug),
                 delta=np.array(delta),
                 rankings=np.array(rankings))

        result = []
        for i, id in enumerate(data['ids']):
            if id in data['trusted_ids']:
                result.append({
                    'id': id,
                    'frame': data['frames'][i],
                    'label_name': labelnames[data['trusted_labels'][i]],
                    'flag': 0,
                    'other': data['others'][i]
                })
            else:
                result.append({
                    'id': id,
                    'frame': data['frames'][i],
                    'label_name': labelnames[y_debug[i]],
                    'flag': 0,
                    'other': data['others'][i]
                })

        print('Saving the result...')
        ResultSaver.save_as_txt(result_path, result)
    else:
        num_class = 2
        duti = GreedyDUTI(num_class=num_class,
                          max_iter=20,
                          method='decision tree')
        result = []
        for index, dataset in enumerate(data):
            # init input for the duti algorithm
            print('Loading features for training data...')
            feature = [
                DataLoader.get_feature(os.path.join(feature_dir, id + '.npy'))
                for id in dataset['ids']
            ]
            print('Loading features for trusted items...')
            trusted_feature = [
                DataLoader.get_feature(os.path.join(feature_dir, id + '.npy'))
                for id in dataset['trusted_ids']
            ]
            confidence = np.ones_like(dataset['trusted_labels'])
            np.savez(dataset_path.replace('Train', 'Compressed').replace(
                '.txt',
                str(index) + '.npz'),
                     feature=np.array(feature),
                     label=np.array(dataset['labels']),
                     trusted_feature=np.array(trusted_feature),
                     trusted_label=np.array(dataset['trusted_labels']))
            print('Running the duti algorithm...')
            bugs, delta, rankings = duti.fit_transform(
                np.array(feature), np.array(dataset['labels']),
                np.array(trusted_feature), np.array(dataset['trusted_labels']),
                confidence)

            print('Corrected', bugs.sum(), 'bugs')

            y_debug = np.array(dataset['labels'], copy=True)
            clean_bug_y = np.argmax(delta[bugs, :], axis=1)
            y_debug[bugs] = clean_bug_y
            np.savez(result_path.replace('Result', 'Compressed').replace(
                '.txt',
                str(index) + '.npz'),
                     y_debug=np.array(y_debug),
                     delta=np.array(delta),
                     rankings=np.array(rankings))

            # for bug in range(len(bugs)):
            #     if bugs[bug]:
            #         print(dataset['ids'][bug], dataset['labels'][bug], 'to', y_debug[bug])
            for i, id in enumerate(dataset['ids']):
                if id in dataset['trusted_ids']:
                    result.append({
                        'id': id,
                        'frame': dataset['frames'][i],
                        'label_name': labelnames[index],
                        'flag': dataset['trusted_labels'][dataset['trusted_ids'].index(id)] + 1,
                        'other': dataset['others'][i]
                    })
                else:
                    result.append({
                        'id': id,
                        'frame': dataset['frames'][i],
                        'label_name': labelnames[index],
                        'flag': y_debug[i] + 1,
                        'other': dataset['others'][i]
                    })
        print('Saving the result...')
        ResultSaver.save_as_txt(result_path, result)
    print('=== finished correcting the dataset:', dataset_path, 'by',
          trusted_item_path, 'and saved the result in', result_path, '===')
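
A hypothetical invocation (all paths made up); the 'Train' and 'Result' substrings in the paths matter, since run_duti derives the .npz dump names from them:

run_duti('data/Train_batch1.txt', 'data/Trusted_batch1.txt',
         'features/', 'result/Result_batch1.txt')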
Code example #13
import os

import tensorflow as tf

# DataLoader and network are project-local modules; their imports are assumed
# to sit alongside the constants (CLASSES, TOTAL_CHARS, MINI_BATCH_SIZE) below.
from src.Constants import *


def get_loss(y_pred, y_true):
    # Calculate the loss from digits being incorrect.  Don't count loss from
    # digits that are in non-present plates.
    loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                                          logits=tf.reshape(y_pred, [-1, CLASSES]),
                                          labels=tf.reshape(y_true, [-1, CLASSES]))
    loss = tf.reshape(loss, [-1, TOTAL_CHARS])
    loss = tf.reduce_sum(loss)
    return loss


dataDirectory = os.path.join(os.getcwd(), '..', 'data')  # portable path instead of hard-coded '\\' separators
dataLoader = DataLoader(dataDirectory)

training_data, validation_data, test_data = dataLoader.load_data()


mini_batchesX = [training_data[0][k:k + MINI_BATCH_SIZE] for k in range(0, len(training_data[0]), MINI_BATCH_SIZE)]
mini_batchesY = [training_data[1][k:k + MINI_BATCH_SIZE] for k in range(0, len(training_data[1]), MINI_BATCH_SIZE)]


x, y_pred, params = network.get_network()

y_true = tf.placeholder(tf.float32, [None, TOTAL_CHARS * CLASSES])

digits_loss = get_loss(y_pred, y_true)
train_step = tf.train.AdamOptimizer(1e-4).minimize(digits_loss)
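
The snippet stops after defining train_step; a minimal TF1-style training-loop sketch over the mini-batches built above (the epoch count is made up):

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(10):  # hypothetical epoch count
        for bx, by in zip(mini_batchesX, mini_batchesY):
            sess.run(train_step, feed_dict={x: bx, y_true: by})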
Code example #14
def load_data(self):
    data_loader = DataLoader()
    train_data, label_data = data_loader.load_train_data()
    return train_data, label_data