Code Example #1
File: test_dataset.py  Project: pep8speaks/dave
def test_apply_filters(s, t, c, list, min_value, max_value):
    dataset1 = DataSet(s)
    dataset1.add_table(t, [c])
    for v in list:
        dataset1.tables[t].columns[c].add_value(v)

    filter = dict()
    filter["table"] = t
    filter["column"] = c
    filter["from"] = min_value
    filter["to"] = max_value

    filtered_dataset = dataset1.apply_filters([filter, filter])
    schema = filtered_dataset.get_schema()

    table_and_column_in_schema = t in schema and schema[t] and c in schema[t]
    column_has_values_inside_range = (
        schema[t][c]["count"] > 0 and schema[t][c]["min_value"] >= min_value
        and schema[t][c]["max_value"] <= max_value)
    column_is_empty = schema[t][c]["count"] == 0
    wrong_filter_range = min_value > max_value

    assert table_and_column_in_schema and (column_has_values_inside_range
                                           or column_is_empty
                                           or wrong_filter_range)
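
The bare parameters (s, t, c, list, min_value, max_value) suggest these dave tests are property-based tests whose decorators were stripped when the snippets were extracted. The sketch below shows how such a test might be driven with Hypothesis; the strategy choices and imports are assumptions, not the project's actual fixtures.

# Hypothetical reconstruction of the stripped decorator; the strategies
# below are assumptions and not taken from the dave repository.
from hypothesis import given
from hypothesis import strategies as st

@given(s=st.text(min_size=1),
       t=st.text(min_size=1),
       c=st.text(min_size=1),
       list=st.lists(st.floats(allow_nan=False, allow_infinity=False)),
       min_value=st.floats(allow_nan=False, allow_infinity=False),
       max_value=st.floats(allow_nan=False, allow_infinity=False))
def test_apply_filters(s, t, c, list, min_value, max_value):
    ...  # body as in the example above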
Code Example #2
File: test_dataset.py  Project: StingraySoftware/dave
def test_apply_filters(s, t, c, list, min_value, max_value):
    dataset1 = DataSet(s)
    dataset1.add_table(t, [c])
    for v in list:
        dataset1.tables[t].columns[c].add_value(v, v)

    filter = dict()
    filter["table"] = t
    filter["column"] = c
    filter["from"] = min_value
    filter["to"] = max_value

    filtered_dataset = dataset1.apply_filters([filter, filter])
    schema = filtered_dataset.get_schema()

    assert t in schema
    assert schema[t]
    assert c in schema[t]
    assert "count" in schema[t][c]

    filteredItemsCount = schema[t][c]["count"]
    if filteredItemsCount > 0 and min_value <= max_value:
        assert schema[t][c]["min_value"] >= min_value
        assert schema[t][c]["max_value"] <= max_value

    elif filteredItemsCount == 0:
        assert schema[t][c]["count"] == 0

    else:
        assert schema[t][c]["count"] == len(list)
Code Example #3
def detect_classifier(ce_matrix):
	cbn = load_xml_to_cbn (os.path.join (src_path, '../data/adult/adult.xml'))

	A1 = cbn.v['age']
	A2 = cbn.v['education']
	S = cbn.v['sex']
	M1 = cbn.v['workclass']
	M2 = cbn.v['marital-status']
	N = cbn.v['hours']
	Y = cbn.v['income']

	for i in [0, 1, 2, 3]:  # two datasets generated by two methods

		test = DataSet (pd.read_csv ('temp/adult_binary_test_prediction%d.csv' % i))
		for j, label in enumerate (['LR', 'SVM']):  # two classifiers

			# modify cpt of label before detect
			for a1, a2, n, m1, m2, s, y in product (A1.domains.get_all (), A2.domains.get_all (), N.domains.get_all (), M1.domains.get_all (), M2.domains.get_all (),
													S.domains.get_all (), Y.domains.get_all ()):
				cbn.set_conditional_prob (Event ({Y: y}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}),
										  test.get_conditional_prob (Event ({label: y}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s})))

			cbn.build_joint_table ()
			for k, (a1prime, a2prime, m1prime, m2prime) in enumerate (product ([0, 1], [0, 1], [0, 1], [0, 1])):
				p_u, p_l = detect_after_remove (cbn=cbn, s=spos, sprime=sneg, y=1, a1prime=a1prime, a2prime=a2prime, m1prime=m1prime, m2prime=m2prime)
				p = detect_after_remove (cbn=cbn, s=sneg, sprime=sneg, y=1, a1prime=a1prime, a2prime=a2prime, m1prime=m1prime, m2prime=m2prime)
				ce_matrix.iloc[j * 32 + k, 2 * i:2 * i + 2] = [p_u - p, p_l - p]

			for k, (a1prime, a2prime, m1prime, m2prime) in enumerate (product ([0, 1], [0, 1], [0, 1], [0, 1])):
				p_u, p_l = detect_after_remove (cbn=cbn, s=sneg, sprime=spos, y=1, a1prime=a1prime, a2prime=a2prime, m1prime=m1prime, m2prime=m2prime)
				p = detect_after_remove (cbn=cbn, s=spos, sprime=spos, y=1, a1prime=a1prime, a2prime=a2prime, m1prime=m1prime, m2prime=m2prime)
				ce_matrix.iloc[j * 32 + k + 16, 2 * i:2 * i + 2] = [p_u - p, p_l - p]
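
The indexing ce_matrix.iloc[j * 32 + k, 2 * i:2 * i + 2] implies that the caller allocates ce_matrix with 64 rows (2 classifiers x 32 counterfactual settings) and 8 columns (4 prediction files x upper/lower bound). A minimal allocation sketch, assuming a plain pandas DataFrame is sufficient:

import numpy as np
import pandas as pd

# Sketch only: 2 classifiers x (16 + 16) settings = 64 rows,
# 4 prediction files x (upper-bound delta, lower-bound delta) = 8 columns.
ce_matrix = pd.DataFrame(np.zeros((2 * 32, 4 * 2)))
detect_classifier(ce_matrix)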
Code Example #4
File: test_dataset.py  Project: ayush1999/dave
def test_apply_filters(s, t, c, list, min_value, max_value):
    dataset1 = DataSet(s)
    dataset1.add_table(t, [c])
    for v in list:
        dataset1.tables[t].columns[c].add_value(v, v)

    filter = dict()
    filter["table"] = t
    filter["column"] = c
    filter["from"] = min_value
    filter["to"] = max_value

    filtered_dataset = dataset1.apply_filters([filter, filter])
    schema = filtered_dataset.get_schema()

    assert t in schema
    assert schema[t]
    assert c in schema[t]
    assert "count" in schema[t][c]

    filteredItemsCount = schema[t][c]["count"]
    if filteredItemsCount > 0 and min_value <= max_value:
        assert schema[t][c]["min_value"] >= min_value
        assert schema[t][c]["max_value"] <= max_value

    elif filteredItemsCount == 0:
        assert schema[t][c]["count"] == 0

    else:
        assert schema[t][c]["count"] == len(list)
Code Example #5
def get_fits_dataset(destination, dsId, table_ids):
    hdulist = fits.open(destination)
    dataset = DataSet(dsId)

    for t in range(len(hdulist)):

        if isinstance(hdulist[t], fits.hdu.table.BinTableHDU):
            table_id = table_ids[t]
            header_names = hdulist[t].columns.names
            tbdata = hdulist[t].data
            dataset.add_table(table_id, header_names)

            for i in range(len(header_names)):
                header_name = header_names[i]
                dataset.tables[table_id].columns[
                    header_name].values = np.append([], tbdata.field(i))

        else:
            logging.debug("No valid data on: %s" % t)
            logging.debug("Type of Data: %s" % type(hdulist[t]))

    hdulist.close()

    logging.debug("Read fits file successfully: %s" % destination)

    return dataset
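
A hedged usage sketch for this reader; the file name and table ids below are placeholders. Because the function indexes table_ids by HDU number, the list must provide one entry per HDU in the file, not just per binary table.

# Hypothetical usage; "events.fits" and the table ids are placeholders.
ds = get_fits_dataset("events.fits", "my_dataset", ["PRIMARY", "EVENTS", "GTI"])
print(ds.tables.keys())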
Code Example #6
File: test_dataset.py  Project: pep8speaks/dave
def test_get_shema(s, t, c, v):
    dataset = DataSet(s)
    dataset.add_table(t, [c])
    dataset.tables[t].columns[c].add_value(v)
    schema = dataset.get_schema()

    assert (t in schema and schema[t] and c in schema[t]
            and schema[t][c]["id"] == c
            and "count" in schema[t][c]
            and schema[t][c]["count"] == 1)
Code Example #7
File: test_dataset.py  Project: StingraySoftware/dave
def test_clone(s, t, c, v, e):
    dataset1 = DataSet(s)
    dataset1.add_table(t, [c])
    dataset1.tables[t].columns[c].add_value(v, e)
    schema1 = dataset1.get_schema()
    dataset2 = dataset1.clone()
    schema2 = dataset2.get_schema()
    assert schema1 == schema2
Code Example #8
File: test_dataset.py  Project: StingraySoftware/dave
def test_get_schema(s, t, c, v, e):
    dataset = DataSet(s)
    dataset.add_table(t, [c])
    dataset.tables[t].columns[c].add_value(v, e)
    schema = dataset.get_schema()

    assert t in schema
    assert schema[t]
    assert c in schema[t]
    assert schema[t][c]["id"] == c
    assert "count" in schema[t][c]
    assert schema[t][c]["count"] == 1
Code Example #9
File: dave_reader.py  Project: pbalm/dave
def get_txt_dataset(destination, table_id, header_names):

    data = np.loadtxt(destination)

    dataset = DataSet( table_id )
    dataset.add_table( table_id, header_names )

    for i in range(len(header_names)):
        header_name = header_names[i]
        dataset.tables[table_id].columns[header_name].values = data[0:len(data),i]

    logging.debug("Read txt file successfully: %s" % destination)

    return dataset
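
A hedged usage sketch: np.loadtxt expects a whitespace-separated numeric text file with one column per header name; the file name and headers below are placeholders.

# Hypothetical usage; "curve.txt" is assumed to contain two numeric columns.
ds = get_txt_dataset("curve.txt", "RATE", ["TIME", "RATE"])
print(ds.tables["RATE"].columns["TIME"].values[:5])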
Code Example #10
File: dave_reader.py  Project: pbalm/dave
def get_fits_dataset(destination, table_id):
    hdulist = fits.open(destination)
    tbdata = hdulist[1].data

    header_names = hdulist[1].columns.names
    dataset = DataSet( table_id )
    dataset.add_table( table_id, header_names )

    for i in range(len(header_names)):
        header_name = header_names[i]
        dataset.tables[table_id].columns[header_name].values = tbdata.field(i)

    logging.debug("Read lc file successfully: %s" % destination)

    return dataset
Code Example #11
File: dave_reader.py  Project: pep8speaks/dave
def get_fits_dataset(destination, table_id):
    hdulist = fits.open(destination)
    tbdata = hdulist[1].data

    header_names = hdulist[1].columns.names
    dataset = DataSet(table_id)
    dataset.add_table(table_id, header_names)

    for i in range(len(header_names)):
        header_name = header_names[i]
        dataset.tables[table_id].columns[header_name].values = tbdata.field(i)

    logging.debug("Read lc file successfully: %s" % destination)

    return dataset
Code Example #12
File: dave_reader.py  Project: pep8speaks/dave
def get_txt_dataset(destination, table_id, header_names):

    data = np.loadtxt(destination)

    dataset = DataSet(table_id)
    dataset.add_table(table_id, header_names)

    for i in range(len(header_names)):
        header_name = header_names[i]
        dataset.tables[table_id].columns[header_name].values = data[
            0:len(data), i]

    logging.debug("Read txt file successfully: %s" % destination)

    return dataset
Code Example #13
File: app.py  Project: suaraksara/data-exploration
def getmenu():
    if not os.path.isfile(paths.DATASETS_JSON):
        dataset_info = []

        # Read in CSV
        datasets_csv = pd.read_csv(paths.DATASETS)

        # Get title, filename, id, and label for each data set and add it to the collection
        for i, row in datasets_csv.iterrows():
            dataset_title = row["Title"]
            dataset_filename = row["FileName"]
            dataset_id = row["ID"]
            dataset_label = row["Label"]
            dataset = DataSet(dataset_filename, dataset_title, dataset_id,
                              dataset_label)
            dataset_info.append(dataset)

        # Save the collection as JSON and return it
        datasets = DataSets(dataset_info=dataset_info)
        datasets_json = jsonpickle.encode(datasets)

        # Save the serialized JSON to a file
        with open(paths.DATASETS_JSON, 'w') as file:
            file.write(datasets_json)
    else:
        with open(paths.DATASETS_JSON, 'r') as serialized_file:
            json_str = serialized_file.read()
            # Note: this branch yields the decoded object, whereas the branch
            # above returns the encoded JSON string.
            datasets_json = jsonpickle.decode(json_str)

    return datasets_json
Code Example #14
File: test_dataset.py  Project: ayush1999/dave
def test_join(s, t, c, v0, e0, v1, e1):
    dataset1 = DataSet(s)
    dataset1.add_table(t, [c])
    dataset1.tables[t].columns[c].add_value(v0, e0)
    dataset2 = DataSet(s)
    dataset2.add_table(t, [c])
    dataset2.tables[t].columns[c].add_value(v1, e0)

    dataset1 = dataset1.join(dataset2)
    schema = dataset1.get_schema()

    assert t in schema
    assert schema[t]
    assert c in schema[t]
    assert "count" in schema[t][c]
    assert schema[t][c]["count"] == 2
Code Example #15
def get_txt_dataset(destination, table_id, header_names):

    data = np.loadtxt(destination)

    dataset = DataSet(table_id)
    dataset.add_table(table_id, header_names)

    # Column1, Column1Err, Column2, Column2Err .. header order expected
    for i in range(len(header_names)):
        header_name = header_names[i]
        column = dataset.tables[table_id].columns[header_name]
        column.values = data[0:len(data), i * 2]
        column.error_values = data[0:len(data), (i * 2) + 1]

    logging.debug("Read txt file successfully: %s" % destination)

    return dataset
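
This variant expects value and error columns interleaved (Column1, Column1Err, Column2, Column2Err, ...). A small sketch that writes such a file with np.savetxt and reads it back; the data and file name are made up.

import numpy as np

# Sketch: value/error pairs interleaved, matching the expected header order.
time, time_err = np.arange(5.0), np.full(5, 0.1)
rate, rate_err = np.linspace(1.0, 2.0, 5), np.full(5, 0.01)
np.savetxt("curve_with_errors.txt",
           np.column_stack([time, time_err, rate, rate_err]))

ds = get_txt_dataset("curve_with_errors.txt", "RATE", ["TIME", "RATE"])
print(ds.tables["RATE"].columns["TIME"].error_values)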
Code Example #16
File: test_dataset.py  Project: ayush1999/dave
def test_clone(s, t, c, v, e):
    dataset1 = DataSet(s)
    dataset1.add_table(t, [c])
    dataset1.tables[t].columns[c].add_value(v, e)
    schema1 = dataset1.get_schema()
    dataset2 = dataset1.clone()
    schema2 = dataset2.get_schema()
    assert schema1 == schema2
Code Example #17
File: test_dataset.py  Project: StingraySoftware/dave
def test_join(s, t, c, v0, e0, v1, e1):
    dataset1 = DataSet(s)
    dataset1.add_table(t, [c])
    dataset1.tables[t].columns[c].add_value(v0, e0)
    dataset2 = DataSet(s)
    dataset2.add_table(t, [c])
    dataset2.tables[t].columns[c].add_value(v1, e0)

    dataset1 = dataset1.join(dataset2)
    schema = dataset1.get_schema()

    assert t in schema
    assert schema[t]
    assert c in schema[t]
    assert "count" in schema[t][c]
    assert schema[t][c]["count"] == 2
Code Example #18
def get_fits_dataset_with_stingray(destination,
                                   dsId='FITS',
                                   hduname='EVENTS',
                                   column='TIME'):

    # Gets columns from fits hdu table
    columns = get_fits_table_column_names(destination, hduname)

    # Prepares additional_columns
    additional_columns = []
    for i in range(len(columns)):
        if columns[i] != column:
            additional_columns = np.append(additional_columns, columns[i])

    # Reads fits data
    fits_data = load_events_and_gtis(destination,
                                     additional_columns=additional_columns)

    # Creates the dataset
    dataset = DataSet(dsId)

    #Fills Hdu table
    dataset.add_table(hduname, columns)
    dataset.tables[hduname].columns[column].add_values(fits_data.ev_list)
    for i in range(len(additional_columns)):
        column = additional_columns[i]
        dataset.tables[hduname].columns[column].add_values(
            fits_data.additional_data[column])

    #Fills Gtis table
    gti_columns = ["START", "STOP"]
    gti_start = fits_data.gti_list[:, 0]
    gti_end = fits_data.gti_list[:, 1]
    dataset.add_table("GTI", gti_columns)
    dataset.tables["GTI"].columns[gti_columns[0]].add_values(gti_start)
    dataset.tables["GTI"].columns[gti_columns[1]].add_values(gti_end)

    logging.debug("Read fits with stingray file successfully: %s" %
                  destination)

    return dataset
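
A hedged usage sketch: the helper builds an EVENTS table plus a GTI table from an event-list FITS file via Stingray's load_events_and_gtis; the file name below is a placeholder.

# Hypothetical usage; "events.fits" is a placeholder event-list file.
ds = get_fits_dataset_with_stingray("events.fits", dsId="obs1",
                                    hduname="EVENTS", column="TIME")
print(ds.tables["GTI"].columns["START"].values)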
Code Example #19
File: test_dataset.py  Project: ayush1999/dave
def test_add_table(s, t, c):
    dataset = DataSet(s)
    dataset.add_table(t, [c])
    assert len(dataset.tables) == 1
Code Example #20
def detect_classifier(ce_matrix):
    cbn = load_xml_to_cbn(
        os.path.join(src_path,
                     '../data/synthetic/ProbabilisticBayesianModel.xml'))
    A = cbn.v['A']
    S = cbn.v['S']
    N = cbn.v['N']
    M = cbn.v['M']
    Y = cbn.v['Y']

    for i in [0, 1, 2, 3]:  # two datasets generated by two methods
        test = DataSet(pd.read_csv('temp/synthetic_test_prediction%d.csv' % i))
        for j, label in enumerate(['LR', 'SVM']):  # two classifiers
            # modify cpt of label before detect
            for a, n, m, s, y in product(A.domains.get_all(),
                                         N.domains.get_all(),
                                         M.domains.get_all(),
                                         S.domains.get_all(),
                                         Y.domains.get_all()):
                cbn.set_conditional_prob(
                    Event({Y: y}), Event({
                        A: a,
                        M: m,
                        N: n,
                        S: s
                    }),
                    test.get_conditional_prob(
                        Event({label: y}),
                        Event({
                            'A': a,
                            'M': m,
                            'N': n,
                            'S': s
                        })))
            cbn.build_joint_table()

            for k, (aprime, mprime) in enumerate(product([0, 1], [0, 1])):
                p_u, p_l = detect_after_remove(cbn=cbn,
                                               s=spos,
                                               sprime=sneg,
                                               y=1,
                                               aprime=aprime,
                                               mprime=mprime)
                p = detect_after_remove(cbn=cbn,
                                        s=sneg,
                                        sprime=sneg,
                                        y=1,
                                        aprime=aprime,
                                        mprime=mprime)
                ce_matrix.iloc[j * 8 + k, 3 * i:3 * i + 2] = [p_u - p, p_l - p]

            for k, (aprime, mprime) in enumerate(product([0, 1], [0, 1])):
                p_u, p_l = detect_after_remove(cbn=cbn,
                                               s=sneg,
                                               sprime=spos,
                                               y=1,
                                               aprime=aprime,
                                               mprime=mprime)
                p = detect_after_remove(cbn=cbn,
                                        s=spos,
                                        sprime=spos,
                                        y=1,
                                        aprime=aprime,
                                        mprime=mprime)
                ce_matrix.iloc[j * 8 + k + 4,
                               3 * i:3 * i + 2] = [p_u - p, p_l - p]
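
Analogous to the sketch after Code Example #3, the indexing here (ce_matrix.iloc[j * 8 + k, 3 * i:3 * i + 2]) implies 16 rows (2 classifiers x 8 settings) and 12 columns (4 prediction files x 3), with column 3 * i + 2 left for the Pearl-style estimate filled in by pearl_detect_classifier (Code Example #22). A minimal allocation sketch, again assuming a plain DataFrame:

import numpy as np
import pandas as pd

# Sketch only: the third column of each triple is filled by pearl_detect_classifier.
ce_matrix = pd.DataFrame(np.zeros((2 * 8, 4 * 3)))
detect_classifier(ce_matrix)
pearl_detect_classifier(ce_matrix)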
Code Example #21
File: test_dataset.py  Project: ayush1999/dave
def test_init(s):
    dataset = DataSet(s)
    assert dataset
    assert dataset.id == s
Code Example #22
def pearl_detect_classifier(ce_matrix):
    cbn = load_xml_to_cbn(
        os.path.join(src_path,
                     '../data/synthetic/DeterministicBayesianModel.xml'))
    UA = cbn.v['UA']
    UN = cbn.v['UN']
    UM = cbn.v['UM']
    US = cbn.v['US']
    UY = cbn.v['UY']
    A = cbn.v['A']
    S = cbn.v['S']
    N = cbn.v['N']
    M = cbn.v['M']
    Y = cbn.v['Y']

    cbn.build_joint_table()
    event = cbn.jpt.groupby(
        Event({
            UA: 1,
            UN: 1,
            UM: 1,
            US: 1,
            A: 1,
            M: 1,
            S: 1
        }).keys())
    condition = cbn.jpt.groupby(Event({A: 1, M: 1, S: 1}).keys())

    def pearl_after_remove_(s, sprime, y, aprime, mprime):
        p = 0.0
        for ua, un, um, us in product(UA.domains.get_all(),
                                      UN.domains.get_all(),
                                      UM.domains.get_all(),
                                      US.domains.get_all()):
            e = Event({
                UA: ua,
                UN: un,
                UM: um,
                US: us,
                A: aprime,
                M: mprime,
                S: sprime
            })
            c = Event({A: aprime, M: mprime, S: sprime})
            ps = event.get_group(tuple(
                e.values()))['prob'].sum() / condition.get_group(
                    tuple(c.values()))['prob'].sum()

            for a, n, m in product(A.domains.get_all(), N.domains.get_all(),
                                   M.domains.get_all()):
                p += cbn.find_prob (Event ({A: a}), Event ({UA: ua})) * \
                  cbn.find_prob (Event ({M: m}), Event ({S: s, A: a, UM: um})) * \
                  cbn.find_prob (Event ({N: n}), Event ({S: s, A: a, UN: un})) * \
                  cbn.find_prob (Event ({Y: y}), Event ({S: s, A: a, N: n, M: m, UY: 1})) * \
                  ps
        return p

    for i in [0, 1, 2, 3]:  # two datasets generated by two methods
        test = DataSet(pd.read_csv('temp/synthetic_test_prediction%d.csv' % i))
        for j, label in enumerate(['LR', 'SVM']):  # two classifiers
            # modify cpt of label before detect
            for a, n, m, s, y in product(A.domains.get_all(),
                                         N.domains.get_all(),
                                         M.domains.get_all(),
                                         S.domains.get_all(),
                                         Y.domains.get_all()):
                cbn.set_conditional_prob(
                    Event({Y: y}), Event({
                        A: a,
                        M: m,
                        N: n,
                        S: s,
                        UY: 1
                    }),
                    test.get_conditional_prob(
                        Event({label: y}),
                        Event({
                            'A': a,
                            'M': m,
                            'N': n,
                            'S': s
                        })))

            for k, (aprime, mprime) in enumerate(product([0, 1], [0, 1])):
                ce = pearl_after_remove_ (s=spos, sprime=sneg, y=1, aprime=aprime, mprime=mprime) - \
                  pearl_after_remove_ (s=sneg, sprime=sneg, y=1, aprime=aprime, mprime=mprime)
                ce_matrix.iloc[j * 8 + k, 3 * i + 2] = ce

            for k, (aprime, mprime) in enumerate(product([0, 1], [0, 1])):
                ce = pearl_after_remove_ (s=sneg, sprime=spos, y=1, aprime=aprime, mprime=mprime) - \
                  pearl_after_remove_ (s=spos, sprime=spos, y=1, aprime=aprime, mprime=mprime)
                ce_matrix.iloc[j * 8 + k + 4, 3 * i + 2] = ce
Code Example #23
def method3(acc_matrix):
	df_train = pd.read_csv ('temp/adult_binary_train_prediction0.csv')
	# df_train = pd.concat ([df_train] * 10, ignore_index=True)
	train = DataSet (df_train)
	df_test = pd.read_csv ('temp/adult_binary_test_prediction0.csv')
	df_test = pd.concat ([df_test] * 3, ignore_index=True)
	test = DataSet (df_test)
	acc = []

	for name in ['LR', 'SVM']:
		probabilistic_cbn = load_xml_to_cbn (os.path.join (src_path, '../data/adult/adult.xml'))

		def find_condition_prob(e, t):
			return probabilistic_cbn.find_prob (e, t)

		def get_loc(e):
			return probabilistic_cbn.get_loc (e)

		A1 = probabilistic_cbn.v['age']
		A2 = probabilistic_cbn.v['education']
		S = probabilistic_cbn.v['sex']
		M1 = probabilistic_cbn.v['workclass']
		M2 = probabilistic_cbn.v['marital-status']
		N = probabilistic_cbn.v['hours']
		Y = probabilistic_cbn.v['income']

		YH = Variable (name=name, index=Y.index + 1, domains=Y.domains)
		probabilistic_cbn.v[(YH.index, YH.name)] = YH

		YT = Variable (name=name + "M", index=Y.index + 2, domains=Y.domains)
		probabilistic_cbn.v[(YT.index, YT.name)] = YT

		# build linear loss function
		C_vector = np.zeros ((2 ** 8 + 2 ** 8 // 4, 1))
		for a1, a2, n, m1, m2, s in product (A1.domains.get_all (), A2.domains.get_all (), N.domains.get_all (), M1.domains.get_all (), M2.domains.get_all (),
											 S.domains.get_all ()):
			p_x_s = train.get_marginal_prob (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))

			p_yh_1_y = p_x_s * train.count (Event ({Y: 0, YH: 0}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}), 'notequal')
			loc = get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: 0, YT: 0}))
			C_vector[loc] = p_yh_1_y * train.get_conditional_prob (Event ({YH: 0}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))
			loc = get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: 1, YT: 1}))
			C_vector[loc] = p_yh_1_y * train.get_conditional_prob (Event ({YH: 1}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))

			p_yh__y = p_x_s * train.count (Event ({Y: 0, YH: 0}), Event ({A1: a1, A2: a2, M1: m1, M2: m2, N: n, S: s}), 'equal')
			loc = get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: 0, YT: 1}))
			C_vector[loc] = p_yh__y * train.get_conditional_prob (Event ({YH: 0}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))
			loc = get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: 1, YT: 0}))
			C_vector[loc] = p_yh__y * train.get_conditional_prob (Event ({YH: 1}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))

		# the inequality of max and min
		G_matrix_1 = np.zeros ((2 ** 8, 2 ** 8 + 2 ** 8 // 4))
		h_1 = np.zeros (2 ** 8)
		# max
		i = 0
		for a1, a2, n, s, yt in product (A1.domains.get_all (), A2.domains.get_all (), N.domains.get_all (), S.domains.get_all (), YT.domains.get_all ()):
			for m1, m2 in product (M1.domains.get_all (), M2.domains.get_all ()):
				for yh in YH.domains.get_all ():
					loc = get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: yh, YT: yt}))
					G_matrix_1[i, loc] = train.get_conditional_prob (Event ({YH: yh}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))
				loc = get_loc (Event ({A1: a1, A2: a2, N: n, S: s, YT: yt}))
				G_matrix_1[i, 2 ** 8 + loc] = -1
				i += 1
		# min
		assert i == 2 ** 8 // 2
		for a1, a2, n, s, yt in product (A1.domains.get_all (), A2.domains.get_all (), N.domains.get_all (), S.domains.get_all (), YT.domains.get_all ()):
			for m1, m2 in product (M1.domains.get_all (), M2.domains.get_all ()):
				for yh in YH.domains.get_all ():
					loc = get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: yh, YT: yt}))
					G_matrix_1[i, loc] = -train.get_conditional_prob (Event ({YH: yh}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}))
				loc = get_loc (Event ({A1: a1, A2: a2, N: n, S: s, YT: yt}))
				G_matrix_1[i, 2 ** 8 + 2 ** 8 // 8 + loc] = 1
				i += 1

		# build counterfactual fairness constraints
		G_matrix_2 = np.zeros ((2 ** 4 * 2, 2 ** 8 + 2 ** 8 // 4))
		h_2 = np.ones (2 ** 4 * 2) * tau

		i = 0
		for a1, a2, m1, m2 in product (A1.domains.get_all (), A2.domains.get_all (), M1.domains.get_all (), M2.domains.get_all ()):
			for n in N.domains.get_all ():
				loc = get_loc (Event ({A1: a1, A2: a2, N: n, S: spos, YT: yt_pos}))
				G_matrix_2[i, 2 ** 8 + loc] = find_condition_prob (Event ({N: n}), Event ({A1: a1, A2: a2, M1: m1, M2: m2, S: spos}))

				for yh in YH.domains.get_all ():
					loc = get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: sneg, YH: yh, YT: yt_pos}))
					G_matrix_2[i, loc] = -find_condition_prob (Event ({N: n}), Event ({A1: a1, A2: a2, M1: m1, M2: m2, S: sneg})) \
										 * train.get_conditional_prob (Event ({YH: yh}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: sneg}))
			i += 1

		assert i == 2 ** 4
		for a1, a2, m1, m2 in product (A1.domains.get_all (), A2.domains.get_all (), M1.domains.get_all (), M2.domains.get_all ()):
			for n in N.domains.get_all ():
				loc = get_loc (Event ({A1: a1, A2: a2, N: n, S: spos, YT: yt_pos}))
				G_matrix_2[i, 2 ** 8 + 2 ** 8 // 8 + loc] = -find_condition_prob (Event ({N: n}), Event ({A1: a1, A2: a2, M1: m1, M2: m2, S: spos}))

				for yh in YH.domains.get_all ():
					loc = get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: sneg, YH: yh, YT: yt_pos}))
					G_matrix_2[i, loc] = find_condition_prob (Event ({N: n}), Event ({A1: a1, A2: a2, M1: m1, M2: m2, S: sneg})) \
										 * train.get_conditional_prob (Event ({YH: yh}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: sneg}))
			i += 1

		###########

		# mapping in [0, 1]
		G_matrix_3 = np.zeros ((2 * (2 ** 8 + 2 ** 8 // 4), 2 ** 8 + 2 ** 8 // 4))
		h_3 = np.zeros (2 * (2 ** 8 + 2 ** 8 // 4))

		for i in range (2 ** 8 + 2 ** 8 // 4):
			G_matrix_3[i, i] = 1
			h_3[i] = 1

			G_matrix_3[2 ** 8 + 2 ** 8 // 4 + i, i] = -1
			h_3[2 ** 8 + 2 ** 8 // 4 + i] = 0

		# sum = 1
		A_matrix = np.zeros ((2 ** 8 // 2, 2 ** 8 + 2 ** 8 // 4))
		b = np.ones (2 ** 8 // 2)

		i = 0
		for a1, a2, n, m1, m2, s, yh in product (A1.domains.get_all (), A2.domains.get_all (), N.domains.get_all (), M1.domains.get_all (), M2.domains.get_all (),
												 S.domains.get_all (),
												 YH.domains.get_all ()):
			for yt in YT.domains.get_all ():
				A_matrix[i, get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: yh, YT: yt}))] = 1
			i += 1

		assert i == 2 ** 8 // 2

		# combine the inequality constraints
		G_matrix = np.vstack ([G_matrix_1, G_matrix_2, G_matrix_3])
		h = np.hstack ([h_1, h_2, h_3])

		# Test
		# print (np.linalg.matrix_rank (A_matrix), A_matrix.shape[0])
		# print (np.linalg.matrix_rank (np.vstack ([A_matrix, G_matrix])), A_matrix.shape[1])

		# def check():
		# 	sol = np.zeros (2 ** 8 + 2 ** 8 // 4)
		# 	for a1, a2, n, m1, m2, s, yh, yt in product (A1.domains.get_all (), A2.domains.get_all (), N.domains.get_all (), M1.domains.get_all (), M2.domains.get_all (),
		# 												 S.domains.get_all (), YH.domains.get_all (), YT.domains.get_all ()):
		# 		if yh.name == yt.name:
		# 			sol[get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: yh, YT: yt}))] = 1.0
		# 		else:
		# 			sol[get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: yh, YT: yt}))] = 0.0
		#
		# 	for a1, a2, n, s, yt in product (A1.domains.get_all (), A2.domains.get_all (), N.domains.get_all (), S.domains.get_all (), YT.domains.get_all ()):
		# 		p_min = 1
		# 		p_max = 0
		# 		for m1, m2 in product (M1.domains.get_all (), M2.domains.get_all ()):
		# 			p = 0.0
		# 			for yh in YH.domains.get_all ():
		# 				p = train.get_conditional_prob (Event ({YH: yh}), Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s})) \
		# 					* sol[get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: yh, YT: yt}))]
		# 			if p < p_min:
		# 				p_min = p
		# 			if p > p_max:
		# 				p_max = p
		# 		loc = get_loc (Event ({A1: a1, A2: a2, N: n, S: s, YT: yt}))
		# 		sol[2 ** 8 + loc] = p_max
		# 		sol[2 ** 8 + 2 ** 8 // 8 + loc] = p_min
		#
		# 	np.dot (G_matrix_2, sol)

		# check ()

		# solver
		solvers.options['show_progress'] = False
		sol = solvers.lp (c=matrix (C_vector),
						  G=matrix (G_matrix),
						  h=matrix (h),
						  A=matrix (A_matrix),
						  b=matrix (b),
						  solver=solvers
						  )
		mapping = np.array (sol['x'])

		# build the post-processing result in training and testing
		train.df.loc[:, name + 'M'] = train.df[name]
		test.df[name + 'M'] = test.df[name]
		for a1, a2, n, m1, m2, s, yh, yt in product (A1.domains.get_all (), A2.domains.get_all (), N.domains.get_all (), M1.domains.get_all (), M2.domains.get_all (),
													 S.domains.get_all (), YH.domains.get_all (), YT.domains.get_all ()):
			if yh.name != yt.name:
				p = mapping[get_loc (Event ({A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s, YH: yh, YT: yt})), 0]
				train.random_assign (Event ({YH: yh, A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}), Event ({YT: yt}), p)
				test.random_assign (Event ({YH: yh, A1: a1, A2: a2, N: n, M1: m1, M2: m2, S: s}), Event ({YT: yt}), p)

		train.df[name] = train.df[name + 'M']
		train.df = train.df.drop ([name + 'M'], axis=1)  # drop the temporary column
		test.df[name] = test.df[name + 'M']
		test.df = test.df.drop ([name + 'M'], axis=1)  # drop the temporary column
		acc.append (accuracy_score (train.df[name], train.df[Y.name]))
		acc.append (accuracy_score (test.df[name], test.df[Y.name]))

	acc_matrix.iloc[:, 3] = acc
	train.df.to_csv ('temp/adult_binary_train_prediction3.csv', index=False)
	test.df.to_csv ('temp/adult_binary_test_prediction3.csv', index=False)
Code Example #24
from model.dataset import DataSet
import time


file_folder = '_'.join([str(i) for i in time.localtime(time.time())][:3])
dir_list = ['2020_7_9']
dataset = DataSet(dir_list)
attr_value = dataset.get_date_stock("2020_7_9", "光大证券", "现手")  # stock "光大证券" (Everbright Securities), attribute "现手" (current traded lots)
print("len:", attr_value)

# Cluster all stocks on a given day by their buy/sell trade ratio
Code Example #25
File: test_dataset.py  Project: StingraySoftware/dave
def test_add_table(s, t, c):
    dataset = DataSet(s)
    dataset.add_table(t, [c])
    assert len(dataset.tables) == 1
Code Example #26
def method3(acc_matrix):
    df_train = pd.read_csv('temp/synthetic_train_prediction0.csv')
    train = DataSet(df_train)
    df_test = pd.read_csv('temp/synthetic_test_prediction0.csv')
    test = DataSet(df_test)
    acc = []

    for name in ['LR', 'SVM']:
        probabilistic_cbn = load_xml_to_cbn(
            os.path.join(src_path,
                         '../data/synthetic/ProbabilisticBayesianModel.xml'))

        def find_condition_prob(e, t):
            return probabilistic_cbn.find_prob(e, t)

        def get_loc(e):
            return probabilistic_cbn.get_loc(e)

        A = probabilistic_cbn.v['A']
        S = probabilistic_cbn.v['S']
        N = probabilistic_cbn.v['N']
        M = probabilistic_cbn.v['M']
        Y = probabilistic_cbn.v['Y']

        YH = Variable(name='YH', index=Y.index + 1, domains=Y.domains)
        probabilistic_cbn.v[(YH.index, YH.name)] = YH

        YT = Variable(name='YT', index=Y.index + 2, domains=Y.domains)
        probabilistic_cbn.v[(YT.index, YT.name)] = YT

        # build linear loss function
        C_vector = np.zeros((2**6 + 2**6 // 2, 1))
        for a, n, m, s in product(A.domains.get_all(), N.domains.get_all(),
                                  M.domains.get_all(), S.domains.get_all()):
            p_x_s = train.get_marginal_prob(
                Event({
                    'A': a,
                    'M': m,
                    'N': n,
                    'S': s
                }))

            p_yh_1_y = p_x_s * train.count(
                Event({
                    'Y': 0,
                    name: 0
                }), Event({
                    'A': a,
                    'M': m,
                    'N': n,
                    'S': s
                }), 'notequal')
            loc = get_loc(Event({A: a, M: m, N: n, S: s, YH: 0, YT: 0}))
            C_vector[loc] = p_yh_1_y * train.get_conditional_prob(
                Event({name: 0}), Event({
                    'A': a,
                    'M': m,
                    'N': n,
                    'S': s
                }))
            loc = get_loc(Event({A: a, M: m, N: n, S: s, YH: 1, YT: 1}))
            C_vector[loc] = p_yh_1_y * train.get_conditional_prob(
                Event({name: 1}), Event({
                    'A': a,
                    'M': m,
                    'N': n,
                    'S': s
                }))

            p_yh__y = p_x_s * train.count(
                Event({
                    'Y': 0,
                    name: 0
                }), Event({
                    'A': a,
                    'M': m,
                    'N': n,
                    'S': s
                }), 'equal')
            loc = get_loc(Event({A: a, M: m, N: n, S: s, YH: 0, YT: 1}))
            C_vector[loc] = p_yh__y * train.get_conditional_prob(
                Event({name: 0}), Event({
                    'A': a,
                    'M': m,
                    'N': n,
                    'S': s
                }))
            loc = get_loc(Event({A: a, M: m, N: n, S: s, YH: 1, YT: 0}))
            C_vector[loc] = p_yh__y * train.get_conditional_prob(
                Event({name: 1}), Event({
                    'A': a,
                    'M': m,
                    'N': n,
                    'S': s
                }))

        # the inequality of max and min
        G_matrix_1 = np.zeros((2**6, 2**6 + 2**6 // 2))
        h_1 = np.zeros(2**6)
        # max
        i = 0
        for a, n, s, yt in product(A.domains.get_all(), N.domains.get_all(),
                                   S.domains.get_all(), YT.domains.get_all()):
            for m in M.domains.get_all():
                for yh in YH.domains.get_all():
                    loc = get_loc(
                        Event({
                            A: a,
                            M: m,
                            N: n,
                            S: s,
                            YH: yh,
                            YT: yt
                        }))
                    G_matrix_1[i, loc] = train.get_conditional_prob(
                        Event({name: yh}),
                        Event({
                            'A': a,
                            'M': m,
                            'N': n,
                            'S': s
                        }))
                loc = get_loc(Event({A: a, N: n, S: s, YT: yt}))
                G_matrix_1[i, 2**6 + loc] = -1
                i += 1
        # min
        assert i == 2**6 // 2
        for a, n, s, yt in product(A.domains.get_all(), N.domains.get_all(),
                                   S.domains.get_all(), YT.domains.get_all()):
            for m in M.domains.get_all():
                for yh in YH.domains.get_all():
                    loc = get_loc(
                        Event({
                            A: a,
                            M: m,
                            N: n,
                            S: s,
                            YH: yh,
                            YT: yt
                        }))
                    G_matrix_1[i, loc] = -train.get_conditional_prob(
                        Event({name: yh}),
                        Event({
                            'A': a,
                            'M': m,
                            'N': n,
                            'S': s
                        }))
                loc = get_loc(Event({A: a, N: n, S: s, YT: yt}))
                G_matrix_1[i, 2**6 + 2**6 // 4 + loc] = 1
                i += 1

        # build counterfactual fairness constraints
        G_matrix_2 = np.zeros((2**2 * 2, 2**6 + 2**6 // 2))
        h_2 = np.ones(2**2 * 2) * tau

        i = 0
        for a, m in product(A.domains.get_all(), M.domains.get_all()):
            for n in N.domains.get_all():
                loc = get_loc(Event({A: a, N: n, S: spos, YT: yt_pos}))
                G_matrix_2[i, 2**6 + loc] = find_condition_prob(
                    Event({N: n}), Event({
                        A: a,
                        S: spos
                    }))

                for yh in YH.domains.get_all():
                    loc = get_loc(
                        Event({
                            A: a,
                            M: m,
                            N: n,
                            S: sneg,
                            YH: yh,
                            YT: yt_pos
                        }))
                    G_matrix_2[i, loc] = -find_condition_prob (Event ({N: n}), Event ({A: a, S: sneg})) \
                          * train.get_conditional_prob (Event ({name: yh}), Event ({'A': a, 'M': m, 'N': n, 'S': sneg}))
            i += 1

        assert i == 2**2
        for a, m in product(A.domains.get_all(), M.domains.get_all()):
            for n in N.domains.get_all():
                loc = get_loc(Event({A: a, N: n, S: spos, YT: yt_pos}))
                G_matrix_2[i, 2**6 + 2**6 // 4 + loc] = -find_condition_prob(
                    Event({N: n}), Event({
                        A: a,
                        S: spos
                    }))

                for yh in YH.domains.get_all():
                    loc = get_loc(
                        Event({
                            A: a,
                            M: m,
                            N: n,
                            S: sneg,
                            YH: yh,
                            YT: yt_pos
                        }))
                    G_matrix_2[i, loc] = find_condition_prob (Event ({N: n}), Event ({A: a, S: sneg})) \
                          * train.get_conditional_prob (Event ({name: yh}), Event ({'A': a, 'M': m, 'N': n, 'S': sneg}))
            i += 1

        ###########

        # mapping in [0, 1]
        G_matrix_3 = np.zeros(((2**6 + 2**6 // 2) * 2, 2**6 + 2**6 // 2))
        h_3 = np.zeros((2**6 + 2**6 // 2) * 2)

        for i in range(2**6 + 2**6 // 2):
            G_matrix_3[i, i] = 1
            h_3[i] = 1

            G_matrix_3[2**6 + 2**6 // 2 + i, i] = -1
            h_3[2**6 + 2**6 // 2 + i] = 0

        # sum = 1
        A_matrix = np.zeros((2**6 // 2, 2**6 + 2**6 // 2))
        b = np.ones(2**6 // 2)

        i = 0
        for a, n, m, s, yh in product(A.domains.get_all(), N.domains.get_all(),
                                      M.domains.get_all(), S.domains.get_all(),
                                      YH.domains.get_all()):
            for yt in YT.domains.get_all():
                A_matrix[
                    i,
                    get_loc(Event({
                        A: a,
                        M: m,
                        N: n,
                        S: s,
                        YH: yh,
                        YT: yt
                    }))] = 1
            i += 1

        assert i == 2**6 // 2

        # combine the inequality constraints
        G_matrix = np.vstack([G_matrix_1, G_matrix_2, G_matrix_3])
        h = np.hstack([h_1, h_2, h_3])

        # solver
        solvers.options['show_progress'] = False
        sol = solvers.lp(c=matrix(C_vector),
                         G=matrix(G_matrix),
                         h=matrix(h),
                         A=matrix(A_matrix),
                         b=matrix(b),
                         solver=solvers)
        mapping = np.array(sol['x'])

        # build the post-processing result in training and testing
        train.df[name + '1'] = train.df[name]
        test.df[name + '1'] = test.df[name]
        for a, n, m, s, yh, yt in product(A.domains.get_all(),
                                          N.domains.get_all(),
                                          M.domains.get_all(),
                                          S.domains.get_all(),
                                          YH.domains.get_all(),
                                          YT.domains.get_all()):
            if yh.name != yt.name:
                p = mapping[
                    get_loc(Event({
                        A: a,
                        M: m,
                        N: n,
                        S: s,
                        YH: yh,
                        YT: yt
                    })), 0]
                train.random_assign(
                    Event({
                        name: yh,
                        'A': a,
                        'M': m,
                        'N': n,
                        'S': s
                    }), Event({name + '1': yt}), p)
                test.random_assign(
                    Event({
                        name: yh,
                        'A': a,
                        'M': m,
                        'N': n,
                        'S': s
                    }), Event({name + '1': yt}), p)

        train.df[name] = train.df[name + '1']
        train.df = train.df.drop([name + '1'], axis=1)  # drop the temporary column
        test.df[name] = test.df[name + '1']
        test.df = test.df.drop([name + '1'], axis=1)  # drop the temporary column
        acc.append(accuracy_score(train.df['Y'], train.df[name]))
        acc.append(accuracy_score(test.df['Y'], test.df[name]))

    acc_matrix.iloc[:, 3] = acc

    train.df.to_csv('temp/synthetic_train_prediction3.csv', index=False)
    test.df.to_csv('temp/synthetic_test_prediction3.csv', index=False)
Code Example #27
File: test_dataset.py  Project: swapsha96/dave
def test_init(s):
    dataset = DataSet(s)
    assert dataset
    assert len(dataset.id) > len(s)
Code Example #28
def analyse_vocab(rouge, datasets=None, topics=None):
    if datasets is None:
        return

    concept_type = ("parse", "ngrams")
    embedding_variants = ("google.neg.300d", "glove.6B.300d", "tudarmstadt_german")
    # concept_type = None
    topic_details = []
    concept_details = []
    # else:
    #     embeddings = load_w2v_embeddings(embeddings_path, language, oracle)
    token_details = []
    embeddings_path = path.normpath(path.join(args.iobasedir, "embeddings"))

    for dataset, concept_type, embedding_variant in itertools.product(datasets, concept_type, embedding_variants):
        print("running analysis for ", dataset, concept_type, embedding_variant,
              "--------------------------------------")
        i = 0
        ds = resolve_against_iobase(dataset, args.iobasedir)
        d = DataSet(ds)
        language = d.get_language()
        embeddings = load_w2v_by_name(embeddings_path, variant=embedding_variant)
        for topic in d.get_topics():
            # if i > 2:
            #     continue
            sumewrap = SumeWrap(language=language)
            i += 1
            docs = topic.get_docs()
            summaries = topic.get_models()

            parse_info = topic.get_parse_info(0)

            sf = SimulatedFeedback(language, rouge, embeddings=embeddings, docs=docs, models=summaries,
                               summary_length=100, oracle_type="active_learning", ub_score=(1, 1, 1),
                               ub_summary=" ", parser_type=concept_type)
            # sf.run_full_simulation(max_iteration_count=0)

            doc_sentences = sf.summarizer.sentences

            summaries_parse_info = [list(topic.get_models(parsed=True)), list(topic.get_models(parsed=True))]
            if concept_type == "parse":
                sumewrap.s.sentences = sumewrap.load_sume_sentences(summaries, parse_type=concept_type,
                                                                    parse_info=list(summaries_parse_info))
                sumewrap.s.extract_ngrams2(concept_type="phrase")
            else:
                sumewrap.s.sentences = sumewrap.load_sume_sentences(summaries)
                sumewrap.s.extract_ngrams2()
            sumewrap.s.compute_document_frequency()
            model_sentences = sumewrap.s.sentences

            #
            #  token_details
            #
            for s in doc_sentences:
                sentence_pos = s.position
                doc_id = s.doc_id
                token_from_summary = False
                token_from_document = True
                for concept in s.concepts:
                    ngrams = concept.split(' ')
                    for token in ngrams:
                        pos = "UNK"
                        try:
                            word, pos = s.tokens_pos[token].split('::')
                        except:
                            token = re.sub(r'[-\.](\s|$)', r'\1', token)
                            try:
                                word, pos = s.tokens_pos[concept].split('::')
                            except:
                                word, pos = token, 'NN'
                        token_details.append({
                            "sentence_pos": sentence_pos,
                            "doc_id": doc_id,
                            "topic": topic.get_name(),
                            "dataset": d.get_name(),
                            "language": d.get_language(),
                            "token": token,
                            "word": word,
                            "pos_tag": pos,
                            "from_summary": token_from_summary,
                            "from_document": token_from_document,
                            "concept_type": concept_type,
                            "embedding_variant": embedding_variant,
                            "token_has_embedding": embeddings.isKnown(token),
                            "word_has_embedding": embeddings.isKnown(word)
                        })
            for s in model_sentences:
                sentence_pos = s.position
                doc_id = s.doc_id
                token_from_summary = True
                token_from_document = False
                for concept in s.concepts:
                    ngrams = concept.split(' ')
                    for token in ngrams:
                        pos = "UNK"
                        try:
                            word, pos = s.tokens_pos[token].split('::')
                        except:
                            token = re.sub(r'[-\.](\s|$)', r'\1', token)
                            try:
                                word, pos = s.tokens_pos[concept].split('::')
                            except:
                                word, pos = token, 'NN'

                        token_details.append({
                            "sentence_pos": sentence_pos,
                            "doc_id": doc_id,
                            "topic": topic.get_name(),
                            "dataset": d.get_name(),
                            "language": d.get_language(),
                            "token": token,
                            "word": word,
                            "pos_tag": pos,
                            "from_summary": token_from_summary,
                            "from_document": token_from_document,
                            "concept_type": concept_type,
                            "embedding_variant": embedding_variant,
                            "token_has_embedding": embeddings.isKnown(token),
                            "word_has_embedding": embeddings.isKnown(word)
                        })

    # post-process token details
    token_df = pd.DataFrame(token_details)
    # token_df.groupby("dataset")
    # print(token_df.head())
    filename = "C:\\Users\\hatieke\\.ukpsummarizer\\tmp\\tokens_new.csv"
    print("saving token_df to ", filename)
    token_df.to_csv(filename, encoding="UTF-8")
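
The commented-out token_df.groupby("dataset") hints at the intended post-processing. A hedged sketch of one such aggregation, computing embedding-coverage rates from the columns assembled in token_details above; the input file name is a placeholder for wherever the CSV was saved.

import pandas as pd

# Sketch: reload the dumped token table and compute embedding-coverage rates;
# the column names match those built in token_details above.
token_df = pd.read_csv("tokens_new.csv", encoding="UTF-8")
coverage = (token_df
            .groupby(["dataset", "concept_type", "embedding_variant"])
            [["token_has_embedding", "word_has_embedding"]]
            .mean())
print(coverage)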
Code Example #29
        if args.pickleout is None:
            pickleout = None
        else:
            pickleout = resolve_filename(args.pickleout.replace("\"",""), base=iobasedir)

        runner.single_iteration(picklein=picklein, pickleout=pickleout,
                                feedbacks=js)

    elif args.command == 'summarize':

        # check if the path refers to a dataset, a topic or a sole model:
        queue = []
        f = utils.reader.resolve_against_iobase(args.file, iobasedir)
        if path.exists(path.join(f, "index.json")):
            # is_dataset
            d = DataSet(f)
            # unroll to get topics
            for t in d.get_topics():
                for (mf, mt) in t.get_models():
                    mf = path.normpath(mf)
                    pref = path.commonprefix([mf, iobasedir])
                    tn = mf[len(pref) + 1:]
                    print("shortened:", tn)
                    queue.append(mf)

                    # topics.append([t.get_name for t in d.get_topics()])

        elif path.exists(path.join(f, "task.json")):
            # is topic
            t = Topic(f)
            for (mf, mt) in t.get_models():