def load(batch_size, batch_index):
    """Return summarized data for a batch of tables, e.g. a list of point
    summaries per numeric column.

    Ignores numeric values containing $, %, ", etc., i.e. only keeps the
    ones which can be parsed by ``locale.atof`` (checked via ``is_numeric``).
    """
    def _base(name):
        # BUG FIX: str.rstrip('.json') strips any run of the *characters*
        # '.', 'j', 's', 'o', 'n' (e.g. 'jackson.json' -> 'jack'); strip the
        # literal suffix instead.
        return name[:-len('.json')] if name.endswith('.json') else name

    batch_files = training_files[batch_size * batch_index:
                                 batch_size * (batch_index + 1)]
    batch_files_nst = [_base(f) + '_nst.csv' for f in batch_files]
    batch_files_wordlist = [_base(f) + '_wordlist.csv' for f in batch_files]

    # Row 0 of each *_nst.csv holds the per-column NST encoding.
    all_nst = [
        list(map(to_int,
                 np.genfromtxt(os.path.join(training_data_dir, f),
                               delimiter=',')[0]))
        for f in batch_files_nst
    ]
    targets = np.array([
        np.genfromtxt(os.path.join(training_data_dir, f), delimiter=',')
        for f in batch_files_wordlist
    ])

    # NST encodings that mark a column as numeric (optionally with symbols);
    # hoisted out of the per-table loop since they are loop-invariant.
    numeric_only = nst_encoding([True, False, False])
    numeric_and_symbol = nst_encoding([True, True, False])

    results = []
    for i, nst in enumerate(all_nst):
        result = []
        # Context manager so the JSON file handle is closed promptly.
        with open(os.path.join(training_data_dir, batch_files[i])) as fp:
            table = Table(json.load(fp))
        attributes = table.get_attributes()
        column_num = len(attributes)
        target = targets[i]
        # Label index per column (position of the 1 in the one-hot row);
        # -1 for padding columns beyond the real column count.
        target_transformed = [
            index_of(list(map(to_int, row)), 1) if idx < column_num else -1
            for idx, row in enumerate(target.transpose())
        ]
        # Only the first 10 columns carry NST information.
        for j in range(min(column_num, 10)):
            if nst[j] in (numeric_only, numeric_and_symbol):
                attribute = attributes[j]
                # Keep only columns whose every value is parseable as a
                # number or is an explicit missing-value marker.
                if all(is_numeric(n) or n.upper() in ['', 'NA', 'N/A']
                       for n in attribute):
                    result.append(summary(target_transformed[j], attribute))
        results.append(result)
    return results
def load_data_12k_with_raw(batch_size, batch_index=0):
    """Load one batch of 12k training data from file.

    Takes ``batch_size`` tables starting at batch ``batch_index`` and returns
    three arrays: raw tables (parsed JSON), per-column bit-encoded NER inputs,
    and per-column target label indices. Padding columns are encoded as -1.
    """
    def _base(name):
        # BUG FIX: str.rstrip('.json') strips any trailing '.json'
        # *characters* (e.g. 'jackson.json' -> 'jack'); strip the literal
        # suffix instead.
        return name[:-len('.json')] if name.endswith('.json') else name

    def _load_json(name):
        # Context manager closes the handle; explicit encoding matches the
        # original raw-table read.
        with open(os.path.join(training_data_12k_dir, name),
                  encoding='utf-8') as fp:
            return json.load(fp)

    batch_files = training_files_12k[batch_size * batch_index:
                                     batch_size * (batch_index + 1)]
    batch_files_ner = [_base(f) + '_ner.csv' for f in batch_files]
    batch_files_wordlist = [_base(f) + '_wordlist.csv' for f in batch_files]

    raws = numpy.array([_load_json(f) for f in batch_files])
    inputs = numpy.array([
        numpy.genfromtxt(os.path.join(training_data_12k_dir, f),
                         delimiter=',')
        for f in batch_files_ner
    ])
    targets = numpy.array([
        numpy.genfromtxt(os.path.join(training_data_12k_dir, f),
                         delimiter=',')
        for f in batch_files_wordlist
    ])

    inputs_transformed = []
    targets_transformed = []
    # Use one-hot encoding per column, collapsed to an integer bitmask.
    for i in range(len(inputs)):
        table = Table(_load_json(batch_files[i]))
        column_num = len(table.get_header())
        ner_input = inputs[i]  # renamed from `input` (shadowed the builtin)
        target = targets[i]
        assert len(ner_input) == len(tag_to_index)
        # Bit k of the mask is set when tag k is present in the column;
        # -1 marks padding columns past the table's real column count.
        inputs_transformed.append(numpy.array([
            int(round(sum((2 ** k) * num for k, num in enumerate(row))))
            if idx < column_num else -1
            for idx, row in enumerate(ner_input.transpose())
        ]).transpose())
        # Target is the index of the 1 in each one-hot row.
        targets_transformed.append(numpy.array([
            index_of([int(round(num)) for num in row], 1)
            if idx < column_num else -1
            for idx, row in enumerate(target.transpose())
        ]).transpose())
    return (numpy.array(raws), numpy.array(inputs_transformed),
            numpy.array(targets_transformed))
def load_nst_majo(batch_size, batch_index):
    """Load NST features and targets for one batch of training tables.

    Combines the majority and overall NST rows of each table's ``*_nst.csv``
    into ``major * 4 + overall`` per column, and returns that alongside the
    per-column target label indices (-1 for padding columns).
    """
    def _base(name):
        # BUG FIX: str.rstrip('.json') strips any trailing '.json'
        # *characters* (e.g. 'jackson.json' -> 'jack'); strip the literal
        # suffix instead.
        return name[:-len('.json')] if name.endswith('.json') else name

    batch_files = training_files[batch_size * batch_index:
                                 batch_size * (batch_index + 1)]
    batch_files_nst = [_base(f) + '_nst.csv' for f in batch_files]
    batch_files_wordlist = [_base(f) + '_wordlist.csv' for f in batch_files]

    # Parse each *_nst.csv ONCE (the original re-read every file three
    # times); row 0 = majority encoding, row 1 = max, row 2 = overall.
    nst_tables = [
        numpy.genfromtxt(os.path.join(training_data_dir, f), delimiter=',')
        for f in batch_files_nst
    ]
    inputs_major = [list(map(to_int, t[0])) for t in nst_tables]
    inputs_overall = [list(map(to_int, t[2])) for t in nst_tables]
    # NOTE(review): row 1 (max) was loaded by the original but never used in
    # the returned encoding, so it is not extracted here.

    targets = numpy.array([
        numpy.genfromtxt(os.path.join(training_data_dir, f), delimiter=',')
        for f in batch_files_wordlist
    ])

    targets_transformed = []
    for i in range(len(targets)):
        # Context manager closes the JSON handle promptly.
        with open(os.path.join(training_data_dir, batch_files[i])) as fp:
            table = Table(json.load(fp))
        column_num = len(table.get_header())
        target = targets[i]
        # Label index per column (position of the 1 in the one-hot row);
        # -1 for padding columns.
        targets_transformed.append(numpy.array([
            index_of([int(round(num)) for num in row], 1)
            if idx < column_num else -1
            for idx, row in enumerate(target.transpose())
        ]).transpose())

    combined = numpy.array([
        [major * 4 + overall for major, overall in zip(majors, overalls)]
        for majors, overalls in zip(inputs_major, inputs_overall)
    ])
    return combined, numpy.array(targets_transformed)
def load_sample_random_label(sample_index, batch_size, batch_index):
    """Load one batch of testing data of a sample with random labels.

    Takes ``batch_size`` tables starting at batch ``batch_index`` of sample
    ``sample_index`` and returns a list of ``[input, target, activate]``
    triples, each a per-column list with -1 marking padding columns.
    """
    def _base(name):
        # BUG FIX: str.rstrip('.json') strips any trailing '.json'
        # *characters* (e.g. 'jackson.json' -> 'jack'); strip the literal
        # suffix instead.
        return name[:-len('.json')] if name.endswith('.json') else name

    result = []
    batch_files = testing_files_random_label[sample_index][
        batch_size * batch_index:batch_size * (batch_index + 1)]
    for batch_file in batch_files:
        # Context managers so every JSON handle is closed promptly.
        with open(os.path.join(testing_data_random_label_dir,
                               batch_file)) as fp:
            table = Table(json.load(fp))
        column_num = len(table.get_header())
        base = _base(batch_file)  # compute the stem once, not three times
        ner_matrix = numpy.genfromtxt(
            os.path.join(testing_data_random_label_dir, base + '_ner.csv'),
            delimiter=',').transpose()
        target_matrix = numpy.genfromtxt(
            os.path.join(testing_data_random_label_dir,
                         base + '_wordlist.csv'),
            delimiter=',').transpose()
        with open(os.path.join(activate_data_random_label_dir,
                               base + '_activate.json')) as fp:
            activate = json.load(fp)

        # Collapse each one-hot NER row into an integer bitmask (bit k set
        # when tag k is present); -1 marks padding columns.
        input_transformed = [
            int(round(sum((2 ** k) * num for k, num in enumerate(row))))
            if idx < column_num else -1
            for idx, row in enumerate(ner_matrix)
        ]
        # Target is the index of the 1 in each one-hot row.
        target_transformed = [
            index_of([int(round(num)) for num in row], 1)
            if idx < column_num else -1
            for idx, row in enumerate(target_matrix)
        ]
        activate_transformed = [
            num if idx < column_num else -1
            for idx, num in enumerate(activate)
        ]
        result.append(
            [input_transformed, target_transformed, activate_transformed])
    return result
def load_data(batch_size, batch_index=0):
    """Load one batch of training data from file.

    Takes ``batch_size`` tables starting at batch ``batch_index`` and returns
    two arrays: per-column bit-encoded feature inputs (NER, NST, date,
    is_numeric, is_float, is_ordered) and per-column target label indices.
    Empty/padding columns are encoded as -1 in both.
    """
    def _base(name):
        # BUG FIX: str.rstrip('.json') strips any trailing '.json'
        # *characters* (e.g. 'jackson.json' -> 'jack'); strip the literal
        # suffix instead.
        return name[:-len('.json')] if name.endswith('.json') else name

    batch_files = training_files[batch_size * batch_index:
                                 batch_size * (batch_index + 1)]
    batch_files_ner = [_base(f) + '_ner.csv' for f in batch_files]
    batch_files_nst = [_base(f) + '_nst.csv' for f in batch_files]
    batch_files_date = [_base(f) + '_date.csv' for f in batch_files]
    batch_files_wordlist = [_base(f) + '_wordlist.csv' for f in batch_files]

    ner_inputs = numpy.array([
        numpy.genfromtxt(os.path.join(training_data_dir, f), delimiter=',')
        for f in batch_files_ner
    ])
    # Row 0 of each *_nst.csv holds the per-column NST encoding.
    nst_inputs = numpy.array([
        list(map(to_int,
                 numpy.genfromtxt(os.path.join(training_data_dir, f),
                                  delimiter=',')[0]))
        for f in batch_files_nst
    ])
    date_inputs = numpy.array([
        list(map(to_int,
                 numpy.genfromtxt(os.path.join(training_data_dir, f),
                                  delimiter=',')))
        for f in batch_files_date
    ])
    targets = numpy.array([
        numpy.genfromtxt(os.path.join(training_data_dir, f), delimiter=',')
        for f in batch_files_wordlist
    ])

    inputs_transformed = []
    targets_transformed = []
    assert len(ner_inputs) == len(nst_inputs)
    assert len(ner_inputs) == len(date_inputs)
    # locale.atof below needs this locale; set it once per call instead of
    # once per numeric column (setlocale is a process-global call).
    locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
    for i in range(len(ner_inputs)):
        # Context manager closes the JSON handle promptly.
        with open(os.path.join(training_data_dir, batch_files[i])) as fp:
            table = Table(json.load(fp))
        column_num = len(table.get_header())
        attributes = table.get_attributes()
        ner_input = ner_inputs[i]
        nst_input = nst_inputs[i]
        date_input = date_inputs[i]
        target = targets[i]
        assert len(ner_input) == len(tag_to_index)

        # Encode 3-class NER into bits 3..5 of the per-column bitmask
        # (4:location, 5:person, 6:organization in 1-based feature numbering);
        # -1 marks padding columns.
        new_input_transformed = numpy.array([
            int(round(sum((2 ** (k + 3)) * num
                          for k, num in enumerate(ner_row))))
            if idx < column_num else -1
            for idx, ner_row in enumerate(ner_input.transpose())
        ]).transpose()

        # Add encoded NST and date (1:text, 2:symbol, 3:number, 7:date).
        new_input_transformed = (new_input_transformed +
                                 numpy.array(nst_input) +
                                 numpy.array(date_input) * (2 ** 6))

        # Derived numeric features for the first 10 columns
        # (8:is_numeric, 9:is_float, 10:is_ordered); -1 for padding.
        is_numeric_input = [-1] * 10
        is_float_input = [-1] * 10
        is_ordered_input = [-1] * 10
        for idx in range(min(column_num, 10)):
            is_numeric_input[idx] = 0
            is_float_input[idx] = 0
            is_ordered_input[idx] = 0
            # Numeric-only or numeric-with-symbol NST columns only.
            if nst_input[idx] in (nst_encoding([True, False, False]),
                                  nst_encoding([True, True, False])):
                attribute = attributes[idx]
                if all(is_numeric(n) or n.upper() in ['', 'NA', 'N/A']
                       for n in attribute):
                    is_numeric_input[idx] = 1
                    is_float_input[idx] = int('.' in ''.join(attribute))
                    values = numpy.array(
                        list(map(locale.atof,
                                 filter(is_numeric, attribute))))
                    # 0: random, 1: desc, 2: asc — compute diffs once.
                    diffs = numpy.diff(values)
                    is_ordered_input[idx] = (
                        2 if numpy.all(diffs > 0) else
                        1 if numpy.all(diffs < 0) else 0)

        new_input_transformed = (
            new_input_transformed +
            numpy.array(is_numeric_input) * (2 ** 7) +
            numpy.array(is_float_input) * (2 ** 8) +
            numpy.array(is_ordered_input) * (2 ** 9))

        # Change all negative values to -1 (empty column).
        new_input_transformed = numpy.array(
            [x if x >= 0 else -1 for x in new_input_transformed])
        inputs_transformed.append(new_input_transformed)

        # Target is the index of the 1 in each one-hot row; -1 for padding.
        targets_transformed.append(numpy.array([
            index_of([int(round(num)) for num in row], 1)
            if idx < column_num else -1
            for idx, row in enumerate(target.transpose())
        ]).transpose())

    return numpy.array(inputs_transformed), numpy.array(targets_transformed)