def training_set(events_file: str, no_event_time_shift: int, table_name: str, directory):
    """Build and write a balanced CO2 training set using the cached attribute pipeline.

    :param events_file: path to the file with annotated (window) events
    :param no_event_time_shift: time shift applied when generating no-event records
    :param table_name: database table the measurements are read from
    :param directory: output directory for the generated CSV files
    """
    logging.info('start')

    # download data
    con = ConnectionUtil.create_con()
    storage = Storage(events_file, no_event_time_shift, table_name)
    d = storage.load_data(con, 0, 0, 'co2_in_ppm')
    # lazy %-args: formatting is skipped entirely when INFO is disabled
    logging.info('downloaded events: %d', len(d))

    # apply filters to the events
    filtered = FilterUtil.only_valid_events(d)

    # for travis: cap the amount of data processed on CI systems
    no_ev_records = no_events_records
    if ConnectionUtil.is_testable_system():
        filtered = filtered[:ConnectionUtil.MAX_TESTABLE_EVENTS]
        no_ev_records = no_events_records[:ConnectionUtil.MAX_TESTABLE_EVENTS]
    logging.info('events after applying the filter: %d', len(filtered))

    # row selector for the data
    row_selector = CachedDiffRowWithIntervalSelector(con, table_name, 0, 0)
    interval_selector = None

    # training set
    logging.info('start computing of training set')
    training, tr_events = AttributeUtil.cached_training_data(
        con, table_name, filtered, func, row_selector, interval_selector,
        'open', '{0}/training_cached.csv'.format(directory))
    count = len(training)
    # each event yields two records, hence integer halving for the event count
    logging.info('training set contains %d events (%d records)', count // 2, count)

    GraphUtil.gen_duration_histogram(tr_events, 'save', ['png'],
                                     'Histogram dlzok vetrania',
                                     list(range(5, 60, 5)), 1)

    training2 = AttributeUtil.additional_training_set(
        con, table_name, no_ev_records, func, row_selector, interval_selector)
    count2 = len(training2)
    logging.info('additional training set contains %d records', count2)
    logging.info('end computing of training set')

    logging.info('start preparing file of training set')
    # balance positive/negative records before writing the final CSV
    balanced = AttributeUtil.balance_set(training, training2)
    CSVUtil.create_csv_file(balanced, '{0}/training.csv'.format(directory))
    logging.info('end preparing file of training set')
def training_set(events_file: str, no_event_time_shift: int, table_name: str):
    """Build and write a balanced training set based on specific humidity.

    :param events_file: path to the file with annotated (window) events
    :param no_event_time_shift: time shift applied when generating no-event records
    :param table_name: database table the measurements are read from
    """
    logging.info('start')

    # download data
    con = ConnectionUtil.create_con()
    storage = Storage(events_file, no_event_time_shift, table_name)
    d = storage.load_data(con, 0, 0, 'rh_in2_specific_g_kg')
    # lazy %-args: formatting is skipped entirely when INFO is disabled
    logging.info('downloaded events: %d', len(d))

    # apply filters to data
    filtered = FilterUtil.only_valid_events(d)
    # experiment variants kept for reference:
    # filtered = FilterUtil.temperature_diff(filtered, 5, 100)
    # filtered = FilterUtil.temperature_out_max(filtered, 15)
    # filtered = FilterUtil.humidity(filtered, 6, 1.6, 100)

    # for travis: cap the amount of data processed on CI systems
    no_ev_records = no_events_records
    if ConnectionUtil.is_testable_system():
        filtered = filtered[:ConnectionUtil.MAX_TESTABLE_EVENTS]
        no_ev_records = no_events_records[:ConnectionUtil.MAX_TESTABLE_EVENTS]
    logging.info('events after applying the filter: %d', len(filtered))

    row_selector = CachedDiffRowWithIntervalSelector(con, table_name, 0, 0)
    interval_selector = SimpleIntervalSelector(con, table_name)

    logging.info('start computing of training set')
    training, tr_events = AttributeUtil.training_data(
        con, table_name, filtered, func, row_selector, interval_selector, 'open')
    count = len(training)
    # each event yields two records, hence integer halving for the event count
    logging.info('training set contains %d events (%d records)', count // 2, count)

    training2 = AttributeUtil.additional_training_set(
        con, table_name, no_ev_records, func, row_selector, interval_selector)
    count2 = len(training2)
    logging.info('additional training set contains %d records', count2)
    logging.info('end computing of training set')

    logging.info('start preparing file of training set')
    # balance positive/negative records before writing the final CSV
    balanced = AttributeUtil.balance_set(training, training2)
    CSVUtil.create_csv_file(balanced, 'training.csv')
    logging.info('end preparing file of training set')
def testing_set(table_name: str, start, end, filename):
    """Compute the testing set over the [start, end] interval and write it to *filename*.

    :param table_name: database table the measurements are read from
    :param start: start of the interval to generate testing records for
    :param end: end of the interval to generate testing records for
    :param filename: output CSV file path
    """
    logging.info('start')

    con = ConnectionUtil.create_con()

    logging.info('start computing of testing set')
    # 30 is the sampling step between generated testing records
    length = AttributeUtil.testing_data_with_write(
        con, table_name, start, end, 30, func, None, None, 'open', filename)
    # lazy %-args: formatting is skipped entirely when INFO is disabled
    logging.info('testing set contains %d records', length)
    logging.info('end computing of testing set')
    logging.info('end')
def training_set(events_file: str, no_event_time_shift: int, table_name: str):
    """Build a shuffled CO2 data set (without opposite records) and write it to data.csv.

    :param events_file: path to the file with annotated (window) events
    :param no_event_time_shift: time shift applied when generating no-event records
    :param table_name: database table the measurements are read from
    """
    logging.info('start')

    # download data
    con = ConnectionUtil.create_con()
    storage = Storage(events_file, no_event_time_shift, table_name)
    d = storage.load_data(con, 0, 0, 'co2_in_ppm')
    # lazy %-args: formatting is skipped entirely when INFO is disabled
    logging.info('downloaded events: %d', len(d))

    # apply filters to the events
    filtered = FilterUtil.only_valid_events(d)

    # for travis: cap the amount of data processed on CI systems
    if ConnectionUtil.is_testable_system():
        filtered = filtered[:ConnectionUtil.MAX_TESTABLE_EVENTS]
    logging.info('events after applying the filter: %d', len(filtered))

    # row selector for the data
    row_selector = CachedDiffRowWithIntervalSelector(con, table_name, 0, 0)
    interval_selector = None

    # data set
    logging.info('start computing of data set')
    data = AttributeUtil.training_data_without_opposite(
        con, table_name, filtered, func, row_selector, interval_selector)
    logging.info('data set contains %d events', len(data))
    logging.info('end computing of data set')

    # generate output files
    logging.info('start preparing file of training and testing set')
    # deterministic shuffle: seeding from the data size makes runs reproducible
    random.seed(len(data) // 2)
    random.shuffle(data)
    CSVUtil.create_csv_file(data, 'data.csv')
    logging.info('end preparing file of training and testing set')
    logging.info('end')
def main(events_file: str, no_event_time_shift: int):
    """Build a humidity data set, split it 70/30 and run all distance-attribute experiments.

    :param events_file: path to the file with annotated (window) events
    :param no_event_time_shift: time shift applied when generating no-event records
    """
    logging.info('start')

    table_name = 'measured_klarka'

    # download data
    con = ConnectionUtil.create_con()
    storage = Storage(events_file, no_event_time_shift, table_name)
    d = storage.load_data(con, 0, 0, 'rh_in2_specific_g_kg')
    # lazy %-args: formatting is skipped entirely when INFO is disabled
    logging.info('downloaded events: %d', len(d))

    # apply filters to data
    filtered = FilterUtil.only_valid_events(d)
    # experiment variants kept for reference:
    # filtered = FilterUtil.temperature_diff(filtered, 5, 17.5)
    # filtered = FilterUtil.temperature_diff(filtered, 17.5, 30)
    # filtered = FilterUtil.temperature_diff(filtered, 5, 13.3)
    # filtered = FilterUtil.temperature_diff(filtered, 13.3, 21.6)
    # filtered = FilterUtil.temperature_diff(filtered, 21.6, 30)
    # filtered = FilterUtil.temperature_diff(filtered, 10, 15)
    # filtered = FilterUtil.temperature_diff(filtered, 15, 20)
    # filtered = FilterUtil.temperature_diff(filtered, 20, 25)
    logging.info('events after applying the filter: %d', len(filtered))

    row_selector = CachedDiffRowWithIntervalSelector(con, table_name, 0, 0)
    interval_selector = SimpleIntervalSelector(con, table_name)

    # data set
    logging.info('start computing of data set')
    data = AttributeUtil.training_data_without_opposite(
        con, table_name, filtered, func, row_selector, interval_selector)
    logging.info('data set contains %d events', len(data))
    logging.info('end computing of data set')

    # split data set into training and testing set
    # deterministic shuffle: seeding from the data size makes runs reproducible
    random.seed(len(data) // 2)
    random.shuffle(data)
    training, testing, minimum = training_testing_data(data, 0.7)
    logging.info('training set contains %d records, each %d-krat', len(training), minimum)
    logging.info('testing set contains %d records', len(testing))

    # experiment matrix: (runner, experiment index, slope-line factory, file prefix);
    # factories (not instances) so each run gets a fresh line object, as before
    experiments = [
        (training_testing_data_with_distance, 0, CenterLineSlope, "trendline_"),
        (training_testing_data_with_distance, 1, PolyfitLineAvgSlope, "polyfit_"),
        (training_testing_data_with_distance, 2, CenterLineSlope, "center_"),
        (training_testing_data_only_distance, 3, CenterLineSlope, "trendline_"),
        (training_testing_data_only_distance, 4, PolyfitLineAvgSlope, "polyfit_"),
        (training_testing_data_only_distance, 5, CenterLineSlope, "center_"),
        (training_testing_data_without_distance, 6, CenterLineSlope, "trendline_"),
    ]
    for run, index, line_factory, prefix in experiments:
        # deep copies so every experiment sees the pristine training/testing sets
        run(copy.deepcopy(training), copy.deepcopy(testing), index, line_factory(),
            prefix, False, False, False, False)

    logging.info('end')