Example n. 1
    def find_pair(
        self, left_df: pd.DataFrame, right_df: pd.DataFrame
    ) -> typing.Tuple[pd.DataFrame, typing.List[tuple]]:
        class left(rltk.AutoGeneratedRecord):
            pass

        class right(rltk.AutoGeneratedRecord):
            pass

        left_df['id'] = left_df.index.astype(str)
        right_df['id'] = right_df.index.astype(str)
        if 'Unnamed: 0' in right_df.columns:
            right_df = right_df.drop(columns=['Unnamed: 0'])
        ds1 = rltk.Dataset(rltk.DataFrameReader(left_df), record_class=left)
        ds2 = rltk.Dataset(rltk.DataFrameReader(right_df), record_class=right)

        bg = rltk.HashBlockGenerator()
        block = bg.generate(
            bg.block(ds1, property_=self.join_target_column_names[0]),
            bg.block(ds2, property_=self.join_target_column_names[1]))

        pairs = rltk.get_record_pairs(ds1, ds2, block=block)

        pairs_list = []
        pairs_column = [[] for _ in range(right_df.shape[0])]
        for r1, r2 in pairs:
            pairs_column[int(r2.id)].append(int(r1.id))
            pairs_list.append((r1.id, r2.id))

        right_df["joining_pairs"] = pairs_column
        return right_df, pairs_list
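
The same pattern (string ids derived from the index, one AutoGeneratedRecord class per side, hash blocking on a join column) can be tried stand-alone. Below is a minimal sketch on two toy DataFrames; the record classes, values, and the 'city' blocking column are illustrative assumptions, not data from the example above.

import pandas as pd
import rltk

left_df = pd.DataFrame({'name': ['alice', 'bob'], 'city': ['LA', 'NY']})
right_df = pd.DataFrame({'name': ['alice k.', 'carol'], 'city': ['LA', 'SF']})
left_df['id'] = left_df.index.astype(str)
right_df['id'] = right_df.index.astype(str)


class ToyLeft(rltk.AutoGeneratedRecord):
    pass


class ToyRight(rltk.AutoGeneratedRecord):
    pass


ds1 = rltk.Dataset(rltk.DataFrameReader(left_df), record_class=ToyLeft)
ds2 = rltk.Dataset(rltk.DataFrameReader(right_df), record_class=ToyRight)

bg = rltk.HashBlockGenerator()
block = bg.generate(bg.block(ds1, property_='city'),
                    bg.block(ds2, property_='city'))

# only records that share the same blocking value are paired
for r1, r2 in rltk.get_record_pairs(ds1, ds2, block=block):
    print(r1.id, r1.name, '<->', r2.id, r2.name)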
Example n. 2
    def join(self, left_df: pd.DataFrame, right_df: pd.DataFrame,
             left_columns: typing.List[typing.List[int]],
             right_columns: typing.List[typing.List[int]], left_metadata: dict,
             right_metadata: dict) -> JoinResult:
        class left(rltk.AutoGeneratedRecord):
            pass

        class right(rltk.AutoGeneratedRecord):
            pass

        left_df['id'] = left_df.index.astype(str)
        right_df['id'] = right_df.index.astype(str)
        if 'Unnamed: 0' in right_df.columns:
            right_df = right_df.drop(columns=['Unnamed: 0'])
        ds1 = rltk.Dataset(rltk.DataFrameReader(left_df), record_class=left)
        ds2 = rltk.Dataset(rltk.DataFrameReader(right_df), record_class=right)

        bg = rltk.HashBlockGenerator()
        block = bg.generate(
            bg.block(ds1, property_=self.join_target_column_names[0]),
            bg.block(ds2, property_=self.join_target_column_names[1]))
        left_df = left_df.set_index('id')
        right_df = right_df.set_index('id')

        pairs = rltk.get_record_pairs(ds1, ds2, block=block)

        df_joined = pd.DataFrame()

        column_names_to_join = None
        for r1, r2 in pairs:
            left_res = left_df.loc[r1.id]
            right_res = right_df.loc[r2.id]
            if column_names_to_join is None:
                column_names_to_join = right_res.index.difference(
                    left_res.index)
                matched_rows = right_res.index.intersection(left_res.index)
                columns_new = left_res.index.tolist()
                columns_new.extend(column_names_to_join.tolist())
            new = pd.concat([left_res, right_res[column_names_to_join]])
            df_joined = df_joined.append(new, ignore_index=True)
        # keep the original left-dataframe columns first, then the newly joined right columns
        df_joined = df_joined[columns_new]

        return JoinResult(df_joined, matched_rows)
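
Note that `DataFrame.append` inside the pair loop grows the frame one row at a time and has been removed in recent pandas releases. Below is a hedged sketch of the same accumulation written with a plain list and a single concatenation at the end; it is a drop-in alternative for the loop above (same variable names), not extra code to run after it.

rows = []
column_names_to_join = None
for r1, r2 in pairs:
    left_res = left_df.loc[r1.id]
    right_res = right_df.loc[r2.id]
    if column_names_to_join is None:
        column_names_to_join = right_res.index.difference(left_res.index)
        matched_rows = right_res.index.intersection(left_res.index)
        columns_new = left_res.index.tolist() + column_names_to_join.tolist()
    rows.append(pd.concat([left_res, right_res[column_names_to_join]]))

# build the joined frame in one shot; left columns first, then the new right columns
df_joined = pd.DataFrame(rows).reset_index(drop=True)
df_joined = df_joined[columns_new]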
Example n. 3
import pandas as pd
import rltk

print('from dataframe...')

df = pd.read_csv('ds1.csv', encoding='latin-1')
df['id'] = df['doc_id'].astype('str')


class DFRecord(rltk.AutoGeneratedRecord):
    pass


ds = rltk.Dataset(rltk.DataFrameReader(df), record_class=DFRecord)
for r in ds:
    print(r.id, r.doc_id, r.doc_value)

print('set id column...')


@rltk.set_id('col1', function_=lambda x: str(x), keep_original=True)
class DFRecord2(rltk.AutoGeneratedRecord):
    pass


df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
ds = rltk.Dataset(reader=rltk.DataFrameReader(df), record_class=DFRecord2)
for r in ds:
    print(r.id, r.col1, r.col2)
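
A Dataset can also be backed by an explicit key-value adapter and queried by record id, as later examples in this listing do. The sketch below reuses the same decorator and DataFrame; DFRecord3 and ds_kv are illustrative names, and the lookup key '1' is the string id that set_id derives from col1.

@rltk.set_id('col1', function_=lambda x: str(x), keep_original=True)
class DFRecord3(rltk.AutoGeneratedRecord):
    pass


ds_kv = rltk.Dataset(reader=rltk.DataFrameReader(df),
                     record_class=DFRecord3,
                     adapter=rltk.MemoryKeyValueAdapter())
print(ds_kv.get_record('1').col2)  # fetch a single record by its string id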
Example n. 4
        return self.raw_object['id']

    @property
    def address(self):
        return self.raw_object['Address']

    @property
    def phone(self):
        return self.raw_object['Phone']

    @property
    def cuisine(self):
        return self.raw_object['Cuisine']


ds1 = rltk.Dataset(reader=rltk.DataFrameReader(ds1),
                   record_class=Record1,
                   adapter=rltk.MemoryKeyValueAdapter())
ds2 = rltk.Dataset(reader=rltk.DataFrameReader(ds2),
                   record_class=Record2,
                   adapter=rltk.MemoryKeyValueAdapter())
'''bg = rltk.HashBlockGenerator()
blocks = bg.generate(bg.block(ds1, property_='cuisine'), bg.block(ds2, property_='cuisine'))
pairs = rltk.get_record_pairs(ds1, ds2, block=blocks)'''

pairs = rltk.get_record_pairs(ds1, ds2)

f = open('similarities.txt', 'w+')

for r1, r2 in pairs:
    # hedged sketch of a loop body (assumption): score one field per pair
    # and write the result to the similarities file opened above
    sim = rltk.jaro_winkler_similarity(r1.cuisine, r2.cuisine)
    f.write('{}\t{}\t{}\n'.format(r1.id, r2.id, sim))

f.close()
Example n. 5
def process():

    df_entity = pd.DataFrame()

    logger.info('loading entity dataframes')
    for infile in glob.glob(
            os.path.join(config['temp_dir'], config['run_name'],
                         '*/*.entity.h5')):
        source = os.path.basename(infile).split('.')[0]
        df_entity = df_entity.append(pd.read_hdf(infile))
    df_entity = df_entity.reset_index(drop=True)
    logger.info('Total number of entities: %d', len(df_entity))
    df_entity['type'] = df_entity['type'].apply(
        lambda x: x[0])  # only pick the first type (compatible with old pipeline)
    df_entity_ori = df_entity.copy()

    ### filtering
    logger.info('filtering out some entity types')
    all_types = set(df_entity['type'])
    # all_types = set([t for tu in df_entity['type'] for t in tu])  # multi-type support
    selected_types = filter(
        lambda x: x.startswith(
            ('ldcOnt:GPE', 'ldcOnt:LOC', 'ldcOnt:ORG', 'ldcOnt:PER')),
        all_types)
    df_entity = df_entity.loc[df_entity['type'].isin(selected_types)]
    # df_entity = df_entity.loc[[any([t in selected_types for t in tu]) for tu in df_entity['type']]] # multi-type support
    df_entity = df_entity[df_entity['name'].notnull()]
    df_entity = df_entity.where(pd.notnull(df_entity), None)
    df_entity_left = df_entity_ori[~df_entity_ori['e'].isin(df_entity['e'])]

    ### generate rltk components
    logger.info('generating rltk components')
    ds = rltk.Dataset(reader=rltk.DataFrameReader(df_entity),
                      record_class=GaiaRecord)
    bg_kb = rltk.TokenBlocker()
    blocks_kb = bg_kb.block(ds,
                            function_=lambda r: list(r.selected_target)
                            if r.selected_target else ['None'])
    bg_fb = rltk.TokenBlocker()
    blocks_fb = bg_fb.block(ds,
                            function_=lambda r: r.selected_fbid
                            if r.selected_fbid else ['None'])

    ### clustering
    logger.info('clustering entity')
    # build cluster based on type
    all_clusters = []
    for bid, data in blocks_kb.key_set_adapter:
        if bid == 'None':
            continue

        c = Cluster(ds)
        for _, r_id in data:
            r = ds.get_record(r_id)
            for id_ in r.selected_target:
                c.kb_id.add(id_)
            if r.fbid:
                for id_ in r.selected_fbid:
                    c.fb_id.add(id_)
            if r.wikidata:
                for id_ in r.selected_wikidata:
                    c.wd_id.add(id_)
            c.add(r)
        all_clusters.append(c)

    # fb only clusters
    fb_only_clusters = {}
    for bid, data in blocks_fb.key_set_adapter:
        if bid == 'None':
            continue

        fb_only_clusters[bid] = set()
        for _, r_id in data:
            r = ds.get_record(r_id)
            if r.selected_target:
                continue
            fb_only_clusters[bid].add(r_id)
        if len(fb_only_clusters[bid]) == 0:
            del fb_only_clusters[bid]

    for bid, cluster in fb_only_clusters.items():
        c = Cluster(ds)
        for r_id in cluster:
            c.add(r_id)
            r = ds.get_record(r_id)
            if r.fbid:
                for id_ in r.selected_fbid:
                    c.fb_id.add(id_)
            if r.wikidata:
                for id_ in r.selected_wikidata:
                    c.wd_id.add(id_)
        all_clusters.append(c)

    # validation
    for idx, c in enumerate(all_clusters):
        if len(c.kb_id) > 1:
            logger.error('multiple kb_ids in cluster: %s', c.kb_id)
            break

        kb_ids = set()
        for r_id in c.all_records:
            r = ds.get_record(r_id)
            if r.selected_target:
                for id_ in r.selected_target:
                    kb_ids.add(id_)
        if len(kb_ids) > 1:
            logger.error('multiple kb_ids in cluster: %s (cluster kb_id: %s)',
                         kb_ids, c.kb_id)
            break

    # split based on types
    all_clusters_splitted = []
    for c in all_clusters:
        types = {}
        for r_id in c.all_records:
            r = ds.get_record(r_id)
            type_ = normalize_type(r.type)
            if type_ not in types:
                cc = Cluster(ds)
                cc.type = type_
                types[type_] = cc

            cc = types[type_]
            cc.add(r_id)
            if r.selected_target:
                for id_ in r.selected_target:
                    cc.kb_id.add(id_)
            if r.selected_fbid:
                for id_ in r.selected_fbid:
                    cc.fb_id.add(id_)
            if r.selected_wikidata:
                for id_ in r.selected_wikidata:
                    cc.wd_id.add(id_)
        for cc in types.values():
            all_clusters_splitted.append(cc)

    # merge singleton
    final_clusters = deepcopy(all_clusters_splitted)
    MIN_SIM = 0.4
    clustered_entity_ids = set(
        [r for c in all_clusters for r in c.all_records])

    for _, e in df_entity['e'].items():
        if e not in clustered_entity_ids:
            r = ds.get_record(e)
            r_type = normalize_type(r.type)
            local_best = [None,
                          0]  # first item: best cluster object, second item: score
            for c in final_clusters:
                sim = c.similarity(r)
                if r_type != c.type:
                    continue
                if sim >= MIN_SIM:
                    if sim > local_best[1]:
                        local_best = [c, sim]

            c = local_best[0]
            if c is not None:
                c.add(r, contribute=False)
            else:
                # still singleton, construct singleton cluster
                c = Cluster(ds)
                c.type = r_type
                c.add(r)
                final_clusters.append(c)

    # filtered-out entities
    # create cluster with fake record
    for _, e in df_entity_left.iterrows():
        c = Cluster(None)
        c.type = normalize_type(e['type'])
        c.add(e['e'], contribute=False)
        final_clusters.append(c)
    logger.info('Total number of clusters: %d', len(final_clusters))

    # create entity to cluster mapping
    entity_to_cluster = defaultdict(list)
    for c in final_clusters:
        for r in c.all_records:
            entity_to_cluster[r].append(c)
    for e, c in entity_to_cluster.items():
        if len(c) > 1:
            logger.error('Entity in multiple clusters detected, entity id: %s',
                         e)

    ### generate cluster properties
    logger.info('generating cluster properties')
    for c in final_clusters:
        c.generate()

    ### export
    logger.info('exporting clusters')
    df_entity_cluster = df_entity_ori.copy()
    df_entity_cluster['cluster'] = None
    df_entity_cluster['synthetic'] = False

    logger.info('updating cluster info for each entity')
    for idx, e in df_entity_cluster['e'].items():
        clusters = tuple(set([c.full_id for c in entity_to_cluster[e]]))
        df_entity_cluster.at[idx, 'cluster'] = clusters

    logger.info('creating prototypes')
    proto_to_cluster_mapping = {}
    for c in final_clusters:
        proto_to_cluster_mapping[c.feature_entity_id] = c
    proto_dict = []
    for idx, row in df_entity_cluster.iterrows():
        eid = row['e']
        if eid not in proto_to_cluster_mapping:
            # not a prototype
            continue
        c = proto_to_cluster_mapping[eid]
        # p = df_entity_ori[df_entity_ori['e'] == c.feature_entity_id].iloc[0]
        row = row.to_dict()
        row['synthetic'] = True
        row['cluster'] = tuple([c.full_id])
        row['e'] = c.prototype
        proto_dict.append(row)
    df_prototypes = pd.DataFrame.from_dict(proto_dict)

    logger.info('appending dataframes')
    df_complete_entity_clusters = df_entity_cluster.append(df_prototypes)
    df_complete_entity_clusters = df_complete_entity_clusters.reset_index(
        drop=True)

    logger.info('writing to disk')
    output_file = os.path.join(config['temp_dir'], config['run_name'],
                               'entity_cluster.h5')
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        df_complete_entity_clusters.to_hdf(output_file,
                                           'entity',
                                           mode='w',
                                           format='fixed')
        df_complete_entity_clusters.to_csv(output_file + '.csv')
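
The block objects iterated in process() map a block key to a set of (dataset id, record id) tuples. Below is a toy sketch that mirrors the same TokenBlocker call and iteration pattern used above; the DataFrame, record class, and 'target' field are assumptions made for illustration.

import pandas as pd
import rltk

df_toy = pd.DataFrame({'id': ['0', '1', '2'],
                       'name': ['ann', 'bob', 'ann z.'],
                       'target': [['kb:1'], None, ['kb:1']]})


class ToyRecord(rltk.AutoGeneratedRecord):
    pass


ds_toy = rltk.Dataset(reader=rltk.DataFrameReader(df_toy), record_class=ToyRecord)
bg_toy = rltk.TokenBlocker()
blocks_toy = bg_toy.block(
    ds_toy, function_=lambda r: list(r.target) if r.target else ['None'])

# every emitted token becomes a block key; 'None' collects unlinked records
for bid, data in blocks_toy.key_set_adapter:
    for _, r_id in data:
        print(bid, ds_toy.get_record(r_id).name)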
Example n. 6
def process():

    df_entity = pd.DataFrame()
    df_event = pd.DataFrame()
    df_event_role = pd.DataFrame()
    df_relation = pd.DataFrame()
    df_relation_role = pd.DataFrame()

    logger.info('loading entity dataframes')
    for infile in glob.glob(
            os.path.join(config['temp_dir'], config['run_name'],
                         '*/*.entity.h5')):
        source = os.path.basename(infile).split('.')[0]
        # entity
        df_entity = df_entity.append(pd.read_hdf(infile))
        # event
        event_file = infile[:-len('entity.h5')] + 'event.h5'
        df_event = df_event.append(pd.read_hdf(event_file))
        event_role_file = infile[:-len('entity.h5')] + 'event_role.h5'
        df_event_role = df_event_role.append(pd.read_hdf(event_role_file))
        # relation
        relation_file = infile[:-len('entity.h5')] + 'relation.h5'
        df_relation = df_relation.append(pd.read_hdf(relation_file))
        relation_role_file = infile[:-len('entity.h5')] + 'relation_role.h5'
        df_relation_role = df_relation_role.append(
            pd.read_hdf(relation_role_file))
    logger.info('Read in {} entities, {} events, {} relations'.format(
        len(df_entity), len(df_event), len(df_relation)))
    df_entity = df_entity.drop_duplicates(
        subset=['e'],
        keep='last')  # cmu data has cross document entities, only keep one
    df_entity = df_entity.reset_index(drop=True)
    df_entity['type'] = df_entity['type'].apply(
        lambda x: x[0])  # only pick the first type (compatible with old pipeline)
    df_entity_ori = df_entity.copy()
    df_event = df_event.drop_duplicates(subset=['e'],
                                        keep='last').reset_index(drop=True)
    df_event_role = df_event_role.drop_duplicates().reset_index(drop=True)
    df_relation = df_relation.drop_duplicates().reset_index(drop=True)
    df_relation_role = df_relation_role.drop_duplicates().reset_index(
        drop=True)
    logger.info(
        'After deduplication: {} entities, {} events, {} relations'.format(
            len(df_entity), len(df_event), len(df_relation)))

    ### filtering
    logger.info('filtering out some entity types')
    all_types = set(df_entity['type'])
    # all_types = set([t for tu in df_entity['type'] for t in tu])  # multi-type support
    selected_types = filter(
        lambda x: x.startswith(
            ('ldcOnt:GPE', 'ldcOnt:LOC', 'ldcOnt:ORG', 'ldcOnt:PER')),
        all_types)
    df_entity = df_entity.loc[df_entity['type'].isin(selected_types)]
    # df_entity = df_entity.loc[[any([t in selected_types for t in tu]) for tu in df_entity['type']]] # multi-type support
    df_entity = df_entity[df_entity['name'].notnull()]
    df_entity = df_entity.where(pd.notnull(df_entity), None)
    df_entity_left = df_entity_ori[~df_entity_ori['e'].isin(df_entity['e'])]

    ### generate rltk components
    logger.info('generating rltk components')
    ds = rltk.Dataset(reader=rltk.DataFrameReader(df_entity),
                      record_class=GaiaRecord)
    # for r in ds:
    #     print(r.concatenated_labels)
    #     print(r.name, r.target, r.wikidata, r.selected_target_index, r.selected_wikidata_index)
    bg_kb = rltk.TokenBlocker()
    blocks_kb = bg_kb.block(ds,
                            function_=lambda r: [r.selected_target]
                            if r.selected_target else ['None'])
    bg_wd = rltk.TokenBlocker()
    blocks_wd = bg_wd.block(ds,
                            function_=lambda r: [r.selected_wikidata]
                            if r.selected_wikidata else ['None'])

    ### clustering
    logger.info('clustering entity')
    # build cluster based on type
    all_clusters = []
    for bid, data in blocks_kb.key_set_adapter:
        if bid == 'None':
            continue

        c = Cluster(ds)
        for _, r_id in data:
            r = ds.get_record(r_id)
            if r.target and not c.kb_id:
                c.kb_id = r.selected_target
                c.kb_labels = set(r.selected_target_labels)
            if r.wikidata:
                if r.selected_wikidata not in c.wd_candidate:
                    c.wd_candidate[r.selected_wikidata] = set(
                        r.selected_wikidata_labels)
            c.add(r)
        c.elect_wd_id()
        all_clusters.append(c)

    # find all wd only blocks
    wd_only_clusters = {}
    for bid, data in blocks_wd.key_set_adapter:
        if bid == 'None':
            continue

        wd_only_clusters[bid] = set()
        for _, r_id in data:
            r = ds.get_record(r_id)
            if r.selected_target:
                continue
            wd_only_clusters[bid].add(r_id)
        if len(wd_only_clusters[bid]) == 0:
            del wd_only_clusters[bid]

    # if wd block overlaps with kb clusters
    for c in all_clusters:
        if c.wd_id and c.wd_id in wd_only_clusters:
            for r in wd_only_clusters[c.wd_id]:
                c.add(r)
            del wd_only_clusters[c.wd_id]

    # construct clusters based on blocks
    for bid, cluster in wd_only_clusters.items():
        c = Cluster(ds)
        for r_id in cluster:
            c.add(r_id)
            r = ds.get_record(r_id)
            if not c.wd_id:
                c.wd_id = r.selected_wikidata
                c.wd_labels = set(r.selected_wikidata_labels)
        all_clusters.append(c)

    # validation
    # for idx, c in enumerate(all_clusters):
    #     if len(c.kb_id) > 1:
    #         logger.error('multiple kb_ids in cluster: %s', c.kb_id)
    #         break
    #
    #     kb_ids = set()
    #     for r_id in c.all_records:
    #         r = ds.get_record(r_id)
    #         if r.selected_target:
    #             for id_ in r.selected_target:
    #                 kb_ids.add(id_)
    #     if len(kb_ids) > 1:
    #         logger.error('multiple kb_ids in cluster: %s', kb_ids, c.kb_id)
    #         break

    # split based on types
    all_clusters_splitted = []
    for c in all_clusters:
        types = {}
        for r_id in c.all_records:
            r = ds.get_record(r_id)
            type_ = normalize_type(r.type)
            if type_ not in types:
                cc = Cluster(ds)
                cc.type = type_
                types[type_] = cc

            cc = types[type_]
            cc.add(r_id)
            cc.kb_id = c.kb_id
            cc.kb_labels = c.kb_labels
            cc.wd_id = c.wd_id
            cc.wd_labels = c.wd_labels

        for cc in types.values():
            all_clusters_splitted.append(cc)

    # merge singleton
    final_clusters = deepcopy(all_clusters_splitted)
    # MIN_SIM = 0.4
    clustered_entity_ids = set(
        [r for c in all_clusters for r in c.all_records])

    for _, e in df_entity['e'].items():
        if e not in clustered_entity_ids:
            r = ds.get_record(e)
            r_type = normalize_type(r.type)
            local_best = [None,
                          0]  # first item: best cluster object, second item: score
            for c in final_clusters:
                sim = c.similarity(r)
                if r_type != c.type:
                    continue
                if sim > local_best[1]:
                    local_best = [c, sim]

            c = local_best[0]
            if c is not None:
                c.add(r)
            else:
                # still singleton, construct singleton cluster
                c = Cluster(ds)
                c.type = r_type
                c.add(r)
                c.name_labels = set(r.name)
                final_clusters.append(c)

    # filtered-out entities
    # create cluster with fake record
    for _, e in df_entity_left.iterrows():
        c = Cluster(None)
        c.type = normalize_type(e['type'])
        c.add(e['e'])
        final_clusters.append(c)
    logger.info('Total number of clusters: %d', len(final_clusters))

    # create entity to cluster mapping
    entity_to_cluster = defaultdict(list)
    for c in final_clusters:
        for r in c.all_records:
            entity_to_cluster[r].append(c)
    for e, c in entity_to_cluster.items():
        if len(c) > 1:
            logger.error('Entity in multiple clusters detected, entity id: %s',
                         e)
    entity_to_cluster = {e: c[0] for e, c in entity_to_cluster.items()}

    ### generate cluster properties
    logger.info('generating cluster properties')
    for c in final_clusters:
        c.generate()

    ### event and relation cluster
    # these clusters URIs will be {event/relation uri}-cluster
    # prototype URIs hence will be just {event/relation uri}

    ### event role
    event_role_se_dict = {
        'prototype1': [],
        'prototype2': [],
        'role': [],
        'just': []
    }
    for idx, v in df_event_role.iterrows():
        event_role_se_dict['prototype1'].append(v['event'])
        event_role_se_dict['prototype2'].append(
            entity_to_cluster[v['entity']].prototype)
        event_role_se_dict['role'].append(v['role'])
        event_role_se_dict['just'].append(v['just'])
    df_event_role_se = pd.DataFrame.from_dict(event_role_se_dict)

    ### relation role
    relation_role_se_dict = {
        'prototype1': [],
        'prototype2': [],
        'role': [],
        'just': []
    }
    for idx, v in df_relation_role.iterrows():
        relation_role_se_dict['prototype1'].append(v['relation'])
        if v['type'] == 'entity':
            relation_role_se_dict['prototype2'].append(
                entity_to_cluster[v['e']].prototype)
        elif v['type'] == 'event':
            relation_role_se_dict['prototype2'].append(v['e'])
        relation_role_se_dict['role'].append(v['role'])
        relation_role_se_dict['just'].append(v['just'])
    df_relation_role_se = pd.DataFrame.from_dict(relation_role_se_dict)

    ### export
    logger.info('exporting clusters')
    df_entity_cluster = df_entity_ori.copy()
    df_entity_cluster['cluster'] = None
    df_entity_cluster['synthetic'] = False
    df_entity_cluster['cluster_member_confidence'] = None

    logger.info('updating cluster info for each entity')
    for idx, e in df_entity_cluster['e'].items():
        clusters = [entity_to_cluster[e]]
        cluster_ids = tuple([c.full_id for c in clusters])
        confidences = tuple([c.member_confidence[e] for c in clusters])
        df_entity_cluster.at[idx, 'cluster'] = cluster_ids
        df_entity_cluster.at[idx, 'cluster_member_confidence'] = confidences

    logger.info('creating prototypes')
    proto_to_cluster_mapping = {}
    for c in final_clusters:
        proto_to_cluster_mapping[c.feature_entity_id] = c
    proto_dict = []
    for idx, row in df_entity_cluster.iterrows():
        eid = row['e']
        if eid not in proto_to_cluster_mapping:
            # not a prototype
            continue
        c = proto_to_cluster_mapping[eid]
        # p = df_entity_ori[df_entity_ori['e'] == c.feature_entity_id].iloc[0]
        row = row.to_dict()
        row['synthetic'] = True
        row['cluster'] = tuple([c.full_id])
        row['e'] = c.prototype
        proto_dict.append(row)
    df_prototypes = pd.DataFrame.from_dict(proto_dict)

    logger.info('appending dataframes')
    df_complete_entity_clusters = df_entity_cluster.append(df_prototypes)
    df_complete_entity_clusters = df_complete_entity_clusters.reset_index(
        drop=True)

    logger.info('writing to disk')
    entity_cluster_output_file = os.path.join(config['temp_dir'],
                                              config['run_name'],
                                              'entity_cluster')
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        df_complete_entity_clusters.to_hdf(entity_cluster_output_file + '.h5',
                                           'entity',
                                           mode='w',
                                           format='fixed')
        df_complete_entity_clusters.to_csv(entity_cluster_output_file +
                                           '.h5.csv')
    with open(entity_cluster_output_file + '.cluster.jl', 'w') as f:
        for c in final_clusters:
            f.write(json.dumps(c.debug()) + '\n')

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        # event
        event_cluster_output_file = os.path.join(config['temp_dir'],
                                                 config['run_name'],
                                                 'event_cluster.h5')
        df_event.to_hdf(event_cluster_output_file, 'event')
        event_role_output_file = os.path.join(config['temp_dir'],
                                              config['run_name'],
                                              'event_role.h5')
        df_event_role_se.to_hdf(event_role_output_file, 'event_role')
        df_event_role_se.to_csv(event_role_output_file + '.csv')
        # relation
        relation_cluster_output_file = os.path.join(config['temp_dir'],
                                                    config['run_name'],
                                                    'relation_cluster.h5')
        df_relation.to_hdf(relation_cluster_output_file, 'relation')
        relation_role_output_file = os.path.join(config['temp_dir'],
                                                 config['run_name'],
                                                 'relation_role.h5')
        df_relation_role_se.to_hdf(relation_role_output_file,
                                   'relation_role',
                                   mode='w',
                                   format='fixed')
        df_relation_role_se.to_csv(relation_role_output_file + '.csv')
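
The row-by-row role remapping above (iterrows into a dict of lists) can also be expressed column-wise. Below is a hedged sketch for the event-role case only, assuming the same df_event_role columns and entity_to_cluster mapping as in the function above.

df_event_role_se = pd.DataFrame({
    'prototype1': df_event_role['event'],
    'prototype2': df_event_role['entity'].map(
        lambda e: entity_to_cluster[e].prototype),
    'role': df_event_role['role'],
    'just': df_event_role['just'],
})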
Example n. 7
from random import randrange
from datetime import datetime, timedelta

import pandas as pd
import rltk


def random_date(start, end):
    delta = end - start
    int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
    random_second = randrange(int_delta)
    return start + timedelta(seconds=random_second)


selected = dict()
df_gt = pd.read_csv('../datasets/DBLP-Scholar/DBLP-Scholar_perfectMapping.csv',
                    encoding='latin-1').sample(100)
df_gt = df_gt[df_gt['idDBLP'].map(len) < 50]
df_gt = df_gt[df_gt['idScholar'].map(len) < 50]
for d in rltk.DataFrameReader(df_gt):
    date = random_date(datetime(2018, 12, 20), datetime(2018, 12, 30)).date()
    selected[d['idDBLP']] = date
    selected[d['idScholar']] = date
df_gt.to_csv('dblp_scholar_gt.csv', index=False)

df_dblp = pd.read_csv('../datasets/DBLP-Scholar/DBLP1.csv', encoding='latin-1')
df_dblp_out = {'id': [], 'names': [], 'date': []}
for _, row in df_dblp.iterrows():
    # print(row['id'], row['authors'], row['year'])
    if row['id'] in selected:
        if not isinstance(row['authors'], str):
            continue
        df_dblp_out['id'].append(row['id'])
        df_dblp_out['names'].append(row['authors'])
        df_dblp_out['date'].append(selected[row['id']])
Example n. 8
def main():
    with open("dblp_final_JSON.json", "r") as f:
        dblp_dict = json.load(f)

    professors = set()
    for key in dblp_dict:
        professors.add(key['person'])

    #print(professors)
    #print(len(professors))

    coauthor_dict = defaultdict(list)
    for key in dblp_dict:
        author = key['person']
        for items in key['papers']:
            co_authors = items['co_authors']
            if author in co_authors:
                co_authors.remove(author)
            if co_authors:
                coauthor_dict[author].extend(co_authors)

    list_of_coauthors = []
    for key in coauthor_dict:
        list_of_coauthors.extend(coauthor_dict[key])
    #print(len(list_of_coauthors))

    ### String / Data Matching for Entity linking using RLTK

    ### Remove duplicates in the coauthor_dict using String Matching
    ### Compare with professors and do entity linking / remove duplicates

    df1 = pd.DataFrame(list(professors), columns=['name'])
    #print(df1)
    df2 = pd.DataFrame(list_of_coauthors, columns=['name'])
    #print(len(df2))
    df1['first_name'] = df1.apply(lambda x: x['name'].split()[0], axis=1)
    df1['last_name'] = df1.apply(lambda x: ' '.join(x['name'].split()[1:]),
                                 axis=1)
    df1['id'] = (df1.index + 1).astype(str)

    #print(df1)
    df2['first_name'] = df2.apply(lambda x: x['name'].split()[0], axis=1)
    df2['last_name'] = df2.apply(lambda x: ' '.join(x['name'].split()[1:]),
                                 axis=1)
    df2['id'] = (df2.index + 1).astype(str)

    ds1 = rltk.Dataset(reader=rltk.DataFrameReader(df1),
                       record_class=Record1,
                       adapter=rltk.MemoryKeyValueAdapter())
    ds2 = rltk.Dataset(reader=rltk.DataFrameReader(df2),
                       record_class=Record2,
                       adapter=rltk.MemoryKeyValueAdapter())
    bg = rltk.HashBlockGenerator()
    block = bg.generate(bg.block(ds1, property_='fname'),
                        bg.block(ds2, property_='fname'))
    pairs = rltk.get_record_pairs(ds1, ds2, block=block)
    num_pairs = 0
    sim_pairs = []
    sim_dict = {}
    for r1, r2 in pairs:
        num_pairs += 1
        sim = rltk.jaro_winkler_similarity(r1.lname, r2.lname)
        if 0.9 < sim < 1:
            sim_pairs.append(
                (r1.fname + ' ' + r1.lname, r2.fname + ' ' + r2.lname))
            sim_dict[r1.fname + ' ' + r1.lname] = r2.fname + ' ' + r2.lname
            #print(r1.lname,r2.lname,sim)
    #print(sim_pairs)
    #print("Blocking using Cuisine - Number of pairs:",num_pairs)
    for key in coauthor_dict:
        lis = coauthor_dict[key]
        for ind in range(len(lis)):
            if lis[ind] in sim_dict:
                lis[ind] = sim_dict[lis[ind]]

    with open("co_authors.json", "w") as jf:
        json.dump(coauthor_dict, jf, indent=2)