Example #1
def load(batch_size, batch_index):
    """
    return summarized data for a table, e.g. a list of points summary.
    ignore numeric values with $, %, ", etc., i.e., only take the ones which can be parsed by locale.atof
    """
    batch_files = training_files[batch_size * batch_index:batch_size *
                                 (batch_index + 1)]
    # Replace the '.json' suffix explicitly (rstrip strips characters, not a suffix).
    batch_files_nst = list(
        map(lambda batch_file: batch_file[:-len('.json')] + '_nst.csv',
            batch_files))
    batch_files_wordlist = list(
        map(lambda batch_file: batch_file[:-len('.json')] + '_wordlist.csv',
            batch_files))
    all_nst = [
        list(
            map(
                to_int,
                np.genfromtxt(os.path.join(training_data_dir, batch_file_nst),
                              delimiter=',')[0]))
        for batch_file_nst in batch_files_nst
    ]

    targets = np.array([
        np.genfromtxt(os.path.join(training_data_dir, batch_file_wordlist),
                      delimiter=',')
        for batch_file_wordlist in batch_files_wordlist
    ])

    results = []
    for i in range(len(all_nst)):
        result = []
        table = Table(
            json.load(open(os.path.join(training_data_dir, batch_files[i]))))
        attributes = table.get_attributes()
        column_num = len(attributes)

        target = targets[i]
        nst = all_nst[i]

        target_transformed = [
            index_of(list(map(lambda num: to_int(num), row)), 1)
            if idx < column_num else -1
            for idx, row in enumerate(target.transpose())
        ]

        for j in range(column_num):
            if j >= 10:
                break
            if nst[j] == nst_encoding([
                    True, False, False
            ]) or nst[j] == nst_encoding([True, True, False]):
                attribute = attributes[j]
                if all(
                        list(
                            map(
                                lambda n: is_numeric(n) or n.upper() in
                                ['', 'NA', 'N/A'], attribute))):
                    result.append(summary(target_transformed[j], attribute))
        results.append(result)
    return results
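
The docstring of load above defers the numeric check to locale.atof. A minimal standalone sketch of that check, assuming the project's is_numeric helper simply wraps locale.atof (the helper itself is not shown in this listing, so is_numeric_sketch below is a hypothetical stand-in):

# Sketch of the locale-based numeric check assumed by load.
import locale

# Assumes the en_US.UTF-8 locale is available, as load_data does later in this listing.
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

def is_numeric_sketch(value):
    try:
        locale.atof(value)
        return True
    except ValueError:
        return False

print(is_numeric_sketch('1,234.5'))   # True: atof handles the thousands separator
print(is_numeric_sketch('$1,234.5'))  # False: currency symbols are rejected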
Example #2
def load_data_12k_with_raw(batch_size, batch_index=0):
    # Load training data from file.
    # Put batch_size items into one array, starting from batch number batch_index.
    # Return three arrays: raw, input, target.
    batch_files = training_files_12k[batch_size * batch_index:batch_size *
                                     (batch_index + 1)]
    batch_files_ner = list(
        map(lambda batch_file: batch_file[:-len('.json')] + '_ner.csv',
            batch_files))
    batch_files_wordlist = list(
        map(lambda batch_file: batch_file[:-len('.json')] + '_wordlist.csv',
            batch_files))
    raws = numpy.array([
        json.load(
            open(os.path.join(training_data_12k_dir, batch_file),
                 encoding='utf-8')) for batch_file in batch_files
    ])
    inputs = numpy.array([
        numpy.genfromtxt(os.path.join(training_data_12k_dir, batch_file_ner),
                         delimiter=',') for batch_file_ner in batch_files_ner
    ])
    targets = numpy.array([
        numpy.genfromtxt(os.path.join(training_data_12k_dir,
                                      batch_file_wordlist),
                         delimiter=',')
        for batch_file_wordlist in batch_files_wordlist
    ])

    inputs_transformed = []
    targets_transformed = []

    # Use One Hot Encoding
    for i in range(len(inputs)):
        table = Table(
            json.load(open(os.path.join(training_data_12k_dir,
                                        batch_files[i]))))
        column_num = len(table.get_header())
        input = inputs[i]
        target = targets[i]
        assert len(input) == len(tag_to_index)

        inputs_transformed.append(
            numpy.array([
                int(
                    round(
                        sum(
                            numpy.array([(2**i) * num
                                         for (i, num) in enumerate(row)]))))
                if idx < column_num else -1
                for idx, row in enumerate(input.transpose())
            ]).transpose())
        targets_transformed.append(
            numpy.array([
                index_of(list(map(lambda num: int(round(num)), row)), 1)
                if idx < column_num else -1
                for idx, row in enumerate(target.transpose())
            ]).transpose())
    return numpy.array(raws), numpy.array(inputs_transformed), numpy.array(
        targets_transformed)
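
The inputs_transformed step above collapses each one-hot NER column into a single integer by treating the column as a little-endian bit vector. A minimal sketch of just that packing step:

# Sketch of the bit-packing used in inputs_transformed: a one-hot (or
# multi-hot) column becomes sum(2**i * bit_i) over its entries.
def pack_bits(column):
    return int(round(sum((2 ** i) * num for i, num in enumerate(column))))

print(pack_bits([1, 0, 1]))  # 5: bits 0 and 2 are set (1 + 4)
print(pack_bits([0, 0, 0]))  # 0: no tag fired for this column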
Example #3
def load_nst_majo(batch_size, batch_index):
    # Load NST inputs (the majority / max / overall rows of each *_nst.csv) and
    # word-list targets for one batch, starting from batch number batch_index.
    batch_files = training_files[batch_size * batch_index:batch_size * (batch_index + 1)]
    batch_files_nst = list(map(lambda batch_file: batch_file[:-len('.json')] + '_nst.csv', batch_files))
    batch_files_wordlist = list(map(lambda batch_file: batch_file[:-len('.json')] + '_wordlist.csv', batch_files))
    inputs_major = [
        list(map(to_int, numpy.genfromtxt(os.path.join(training_data_dir, batch_file_nst), delimiter=',')[0])) for
        batch_file_nst in batch_files_nst]
    inputs_max = [
        list(map(to_int, numpy.genfromtxt(os.path.join(training_data_dir, batch_file_nst), delimiter=',')[1])) for
        batch_file_nst in batch_files_nst]
    inputs_overall = [
        list(map(to_int, numpy.genfromtxt(os.path.join(training_data_dir, batch_file_nst), delimiter=',')[2])) for
        batch_file_nst in batch_files_nst]
    targets = numpy.array(
        [numpy.genfromtxt(os.path.join(training_data_dir, batch_file_wordlist), delimiter=',') for batch_file_wordlist
         in batch_files_wordlist])

    targets_transformed = []

    for i in range(len(targets)):
        table = Table(json.load(open(os.path.join(training_data_dir, batch_files[i]))))
        column_num = len(table.get_header())
        target = targets[i]

        targets_transformed.append(
            numpy.array([index_of(list(map(lambda num: int(round(num)), row)), 1) if idx < column_num else -1 for
                         idx, row in enumerate(target.transpose())]).transpose())
    # Combine the majority (weight 4) and overall (weight 1) NST codes per column;
    # the max row is zipped in but not used here.
    return numpy.array([[x[0] * 4 + x[2] * 1 for x in zip(inputs[0], inputs[1], inputs[2])] for inputs in
                        zip(inputs_major, inputs_max, inputs_overall)]), numpy.array(targets_transformed)
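
load_nst_majo leans on two small helpers, to_int and index_of, that are not part of this listing. Plausible definitions, given here only as assumptions about their behaviour:

# Assumed helpers (not shown in this listing): to_int rounds a float cell read
# by numpy.genfromtxt to an int; index_of returns the position of a value in a
# list, or -1 when the value is absent.
def to_int(num):
    return int(round(num))

def index_of(lst, value):
    return lst.index(value) if value in lst else -1

print(to_int(2.7))                # 3
print(index_of([0, 0, 1, 0], 1))  # 2
print(index_of([0, 0, 0, 0], 1))  # -1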
Example #4
def load_sample_random_label(sample_index, batch_size, batch_index):
    # Load testing data for the given sample with random labels.
    # Put batch_size items into one array, starting from batch number batch_index.
    result = []
    batch_files = testing_files_random_label[sample_index][
        batch_size * batch_index:batch_size * (batch_index + 1)]
    for batch_file in batch_files:
        table = Table(
            json.load(
                open(os.path.join(testing_data_random_label_dir, batch_file))))
        column_num = len(table.get_header())
        batch_file_ner = batch_file[:-len('.json')] + '_ner.csv'
        batch_file_wordlist = batch_file[:-len('.json')] + '_wordlist.csv'
        batch_file_activate = batch_file[:-len('.json')] + '_activate.json'
        input = numpy.genfromtxt(os.path.join(testing_data_random_label_dir,
                                              batch_file_ner),
                                 delimiter=',').transpose()
        target = numpy.genfromtxt(os.path.join(testing_data_random_label_dir,
                                               batch_file_wordlist),
                                  delimiter=',').transpose()
        activate = json.load(
            open(
                os.path.join(activate_data_random_label_dir,
                             batch_file_activate)))

        input_transformed = [
            int(
                round(
                    sum(
                        numpy.array([(2**i) * num
                                     for (i, num) in enumerate(row)]))))
            if idx < column_num else -1 for idx, row in enumerate(input)
        ]
        target_transformed = [
            index_of(list(map(lambda num: int(round(num)), row)), 1)
            if idx < column_num else -1 for idx, row in enumerate(target)
        ]
        activate_transformed = [
            num if idx < column_num else -1 for idx, num in enumerate(activate)
        ]

        result.append(
            [input_transformed, target_transformed, activate_transformed])
    return result
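
A hedged usage sketch for the loader above; the sample and batch arguments are illustrative, not values used by the project:

# Illustrative call: fetch the second batch of 32 random-label tables for
# sample 0 and unpack one (input, target, activate) triple per table.
batch = load_sample_random_label(sample_index=0, batch_size=32, batch_index=1)
for input_t, target_t, activate_t in batch:
    print(input_t[:5], target_t[:5], activate_t[:5])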
Example #5
def load_data(batch_size, batch_index=0):
    # Load training data from file.
    # Put batch_size items into one array, starting from batch number batch_index.
    # Return two arrays: input, target.
    batch_files = training_files[batch_size * batch_index:batch_size * (batch_index + 1)]
    batch_files_ner = list(map(lambda batch_file: batch_file[:-len('.json')] + '_ner.csv', batch_files))
    batch_files_nst = list(map(lambda batch_file: batch_file[:-len('.json')] + '_nst.csv', batch_files))
    batch_files_date = list(map(lambda batch_file: batch_file[:-len('.json')] + '_date.csv', batch_files))
    batch_files_wordlist = list(map(lambda batch_file: batch_file[:-len('.json')] + '_wordlist.csv', batch_files))
    ner_inputs = numpy.array(
        [numpy.genfromtxt(os.path.join(training_data_dir, batch_file_ner), delimiter=',') for batch_file_ner in
         batch_files_ner])
    nst_inputs = numpy.array(
        [list(map(to_int, numpy.genfromtxt(os.path.join(training_data_dir, batch_file_nst), delimiter=',')[0])) for
         batch_file_nst in batch_files_nst])
    date_inputs = numpy.array(
        [list(map(to_int, numpy.genfromtxt(os.path.join(training_data_dir, batch_file_date), delimiter=','))) for
         batch_file_date in batch_files_date])
    targets = numpy.array(
        [numpy.genfromtxt(os.path.join(training_data_dir, batch_file_wordlist), delimiter=',') for batch_file_wordlist
         in batch_files_wordlist])

    inputs_transformed = []
    targets_transformed = []

    assert len(ner_inputs) == len(nst_inputs)
    assert len(ner_inputs) == len(date_inputs)
    for i in range(len(ner_inputs)):
        # print(batch_files[i])
        table = Table(json.load(open(os.path.join(training_data_dir, batch_files[i]))))
        column_num = len(table.get_header())
        attributes = table.get_attributes()
        ner_input = ner_inputs[i]
        nst_input = nst_inputs[i]
        date_input = date_inputs[i]
        target = targets[i]
        assert len(ner_input) == len(tag_to_index)

        # Encode 3 class NER (4:location, 5:person, 6:organization)
        new_input_transformed = numpy.array([int(round(sum([(2 ** (i + 3)) * num for (i, num) in enumerate(ner_row)])))
                                             if idx < column_num else -1 for idx, ner_row in
                                             enumerate(ner_input.transpose())]).transpose()
        # print('ner', new_input_transformed)
        # Add encoded NST and date (1:text, 2:symbol, 3:number, 7:date)
        new_input_transformed = new_input_transformed + numpy.array(nst_input) + numpy.array(date_input) * (2 ** 6)
        # print('nst', numpy.array(nst_input))
        # print('date', numpy.array(date_input) * (2 ** 6))

        # Check is_numeric, is_float and is_ordered (8:is_numeric, 9:is_float, 10:is_ordered)
        is_numeric_input = [-1] * 10
        is_float_input = [-1] * 10
        is_ordered_input = [-1] * 10
        for idx in range(min(column_num, 10)):
            is_numeric_input[idx] = 0
            is_float_input[idx] = 0
            is_ordered_input[idx] = 0
            if nst_input[idx] == nst_encoding([True, False, False]) or \
                    nst_input[idx] == nst_encoding([True, True, False]):
                attribute = attributes[idx]
                if all(list(map(lambda n: is_numeric(n) or n.upper() in ['', 'NA', 'N/A'], attribute))):
                    is_numeric_input[idx] = 1
                    is_float_input[idx] = int('.' in ''.join(attribute))
                    locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
                    values = numpy.array(list(map(locale.atof, filter(is_numeric, attribute))))
                    # 0: random, 1: desc, 2: asc
                    is_ordered_input[idx] = 2 if numpy.all(numpy.diff(values) > 0) else \
                        1 if numpy.all(numpy.diff(values) < 0) else 0

        new_input_transformed = new_input_transformed + \
                                numpy.array(is_numeric_input) * (2 ** 7) + \
                                numpy.array(is_float_input) * (2 ** 8) + \
                                numpy.array(is_ordered_input) * (2 ** 9)
        # print('is_numeric', numpy.array(is_numeric_input) * (2 ** 7))
        # print('is_float', numpy.array(is_float_input) * (2 ** 8))
        # print('is_ordered', numpy.array(is_ordered_input) * (2 ** 9))

        # Change all negative values to -1 (empty column)
        new_input_transformed = numpy.array([x if x >= 0 else -1 for x in new_input_transformed])
        # print('overall', new_input_transformed)

        inputs_transformed.append(new_input_transformed)

        targets_transformed.append(
            numpy.array([index_of(list(map(lambda num: int(round(num)), row)), 1) if idx < column_num else -1 for
                         idx, row in enumerate(target.transpose())]).transpose())
    return numpy.array(inputs_transformed), numpy.array(targets_transformed)
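
load_data packs several per-column signals into bit positions of a single integer code. A small sketch that unpacks such a code, with the field layout taken from the comments inside load_data (bit positions are 0-indexed here, while those comments count from 1); this decoder is not part of the original project:

# Sketch: unpack the per-column integer code built in load_data.
# Layout: bits 0-2 NST, bits 3-5 NER (location / person / organization),
# bit 6 date, bit 7 is_numeric, bit 8 is_float, bits 9+ is_ordered
# (0: random, 1: desc, 2: asc).
def unpack_column_code(code):
    if code < 0:
        return None  # -1 marks an empty or out-of-range column
    return {
        'nst': code & 0b111,
        'ner': (code >> 3) & 0b111,
        'date': (code >> 6) & 1,
        'is_numeric': (code >> 7) & 1,
        'is_float': (code >> 8) & 1,
        'is_ordered': code >> 9,
    }

# A numeric, float, ascending column with NST code 3 and no NER or date flags:
print(unpack_column_code(3 + (1 << 7) + (1 << 8) + (2 << 9)))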