Example #1
import os

import pandas as pd

# `u` (utility module providing load_rdf and negative_sampling_target) and
# `lb_measures` (list of LargeBio matcher score columns) are defined elsewhere
# in the source file.


def read_rdf(ont1, ont2):

    largebio_data_processed_path = 'data/df_largebio_{}_{}.csv'.format(
        ont1, ont2)
    largebio_ref_processed_path = 'data/df_largebio_{}_{}_ref.csv'.format(
        ont1, ont2)

    # Build and cache the processed csv files once; on later runs load them from disk.
    if not os.path.isfile(largebio_data_processed_path):
        # Specify path for the alignments and reference alignments
        res_dir = os.path.join("data", "largebio-results-2019")
        ref_path = os.path.join(
            "data", "oaei2019_umls_flagged_reference",
            "oaei_{}_{}_mappings_with_flagged_repairs.rdf".format(ont1, ont2))
        # Load rdf data
        df_data, df_ref = u.load_rdf('largebio', res_dir, ref_path, ont1, ont2)

        # Negative sampling
        df_data = u.negative_sampling_target(lb_measures, df_data, df_ref)

        # Save results to csv
        df_data.to_csv(largebio_data_processed_path, index=False)
        df_ref.to_csv(largebio_ref_processed_path, index=False)

    else:
        print('File already exists')
        df_data = pd.read_csv(largebio_data_processed_path)
        df_ref = pd.read_csv(largebio_ref_processed_path)

    return df_data, df_ref
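
A minimal usage sketch; the ontology pair names below are illustrative assumptions, not taken from the source:

# Hypothetical call with assumed LargeBio ontology pair names.
df_data, df_ref = read_rdf('fma', 'nci')
print(df_data.shape, df_ref.shape)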
Example #2
import itertools
import os

import pandas as pd

# `u` (utility module) and `cf_ontologies` (list of conference-track ontology
# names) are defined elsewhere in the source file.
cf_measures = [
    'measure_logmap', 'measure_logmaplt', 'measure_ontmat1', 'measure_sanom',
    'measure_wiktionary'
]

conference_data_processed_path = 'data/df_conference.csv'
res_dir = os.path.join('data', 'conference-data')

if not os.path.isfile(conference_data_processed_path):
    dfs_data, dfs_refs = [], []
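    # Build one alignment DataFrame per ontology pair; concatenated after the loop.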
    for ont1, ont2 in itertools.combinations(cf_ontologies, 2):
        ref_path = os.path.join(
            "data",
            "conference-ref-data",
            "{}-{}.rdf".format(ont1, ont2),
        )
        df_data, df_ref = u.load_rdf('conference', res_dir, ref_path, ont1,
                                     ont2)
        df_data = u.negative_sampling_target(cf_measures, df_data, df_ref)
        df_data["ontologies"] = f"{ont1}-{ont2}"
        dfs_data.append(df_data)
        dfs_refs.append(df_ref)

    df_conf = pd.concat(dfs_data, ignore_index=True)
    df_conf.to_csv(conference_data_processed_path, index=False)
else:
    df_conf = pd.read_csv(conference_data_processed_path)

# Features: the matcher similarity scores; target: the match/non-match label.
X_cf, y_cf = df_conf[cf_measures], df_conf['label']

# Fill missing values with 0
X_cf = X_cf.fillna(0)
# Binary features
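
As an illustration only, and not the source's continuation, one common way to turn the similarity scores into binary features is to threshold them; the cutoff here is an assumption:

# Hypothetical sketch: treat any nonzero matcher score as a positive indicator.
X_cf_bin = (X_cf > 0).astype(int)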