Example #1
import os

import numpy as np
import pandas as pd

from utils import tokenize  # project tokenizer; assumed import path


def create_idx_embeddings(keys, mat, datasets_directory):
    # Build one embedding per table row as the mean of its value embeddings
    for filename in os.listdir(datasets_directory):
        name = filename.split('.')[0]
        print(f"Processing table {name}")
        # Only process tables that actually contributed embeddings
        check_emb = [k for k in keys if k.startswith(name)]
        if check_emb:
            df = pd.read_csv(datasets_directory + filename, na_filter=False)
            columns = df.columns.tolist()
            for idx, row in df.iterrows():
                embeddings = []
                for col in columns:
                    value = tokenize(str(row[col]))
                    for val in value.split('_'):
                        key = name + '.' + col + '#' + str(val)
                        if key in keys:
                            # Found Embedding
                            index_key = keys.index(key)
                            embeddings.append(mat[index_key])
                        else:
                            # Try with None:
                            key_with_none = name + '.' + col + '#' + str(None)
                            if key_with_none in keys:
                                # Found Embedding
                                index_key = keys.index(key_with_none)
                                embeddings.append(mat[index_key])
                if len(embeddings) > 1:
                    new_embedding_idx = name + '.idx#' + str(idx)
                    new_embedding = np.mean(embeddings, axis=0, keepdims=True)
                    keys.append(new_embedding_idx)
                    mat = np.append(mat, new_embedding, axis=0)
    return keys, mat
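A minimal sketch of how this might be called, assuming keys holds one label of the form <table>.<column>#<token> per row of mat; all names and values below are illustrative, not from the original:

import numpy as np

# Hypothetical starting point: two value embeddings for a 'movies' table
keys = ['movies.title#inception', 'movies.year#2010']
mat = np.random.rand(2, 300).astype('float32')  # 300-dim vectors, purely illustrative

# Assumes 'datasets/' contains movies.csv, the table the labels were built from
keys, mat = create_idx_embeddings(keys, mat, 'datasets/')
# Each processed row appends a 'movies.idx#<row>' label to keys and the mean
# of the row's value embeddings to mat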
Example #2
import numpy as np
from scipy.sparse import csr_matrix, lil_matrix

import utils


def fill_adjacency_matrix_relational(size, column_names, index_lookup, group, conf):
    # Retrieve all relation elements from the database
    group_elements_df = retrieve_relation_elements(group, conf)
    records = group_elements_df.to_records(index=False)
    group_elements = list(records)

    # Construct adjacency matrix and in-/out-edge indicator vectors
    A = lil_matrix((size, size))
    c_out = np.zeros(size)
    c_in = np.zeros(size)

    for (text_value1, text_value2) in group_elements:
        text_value1 = utils.tokenize(text_value1)
        text_value2 = utils.tokenize(text_value2)
        # Modified the following rows to support multi-words per token
        for val1 in text_value1.split('_'):
            for val2 in text_value2.split('_'):
                i = index_lookup[utils.get_label(column_names[0], val1)]
                j = index_lookup[utils.get_label(column_names[1], val2)]
                A[i, j] = 1
                c_out[i] = 1
                c_in[j] = 1
    return csr_matrix(A), (c_in + c_out)
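The lil/csr split here is the usual SciPy pattern: lil_matrix is cheap to fill entry by entry, while csr_matrix is the format you want for the arithmetic done afterwards. A self-contained illustration of just that pattern:

from scipy.sparse import csr_matrix, lil_matrix

A = lil_matrix((3, 3))  # cheap incremental assignment while filling
A[0, 1] = 1
A[2, 0] = 1
A = csr_matrix(A)  # fast arithmetic and row slicing once filled
print(A.toarray())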
Example #3
from itertools import combinations

import pandas as pd

import utils


def get_row_groups(df_vectors, graph, conf):
    print("Row relation extraction started...")
    result = dict()
    for node in graph.nodes:
        columns = graph.nodes[node]['columns']
        # Row relations need at least two columns; skip single-column nodes
        # before paying for the CSV read
        if not isinstance(columns, list):
            continue
        df_node = pd.read_csv(conf['DATASETS_PATH'] + str(node) + '.csv',
                              na_filter=False)
        df_node = df_node.applymap(str)
        df_node = df_node.applymap(lambda x: utils.tokenize(x)
                                   if isinstance(x, str) else x)
        for col1, col2 in combinations(columns, 2):
            vec_dict = dict()
            rel_name = '%s.%s~%s.%s' % (node, col1, node, col2)
            print('Processing ', rel_name)
            merge1 = pd.merge(df_node,
                              df_vectors,
                              how='inner',
                              left_on=col1,
                              right_on=df_vectors.word)
            merge2 = pd.merge(merge1,
                              df_vectors,
                              how='inner',
                              left_on=col2,
                              right_on=df_vectors.word)
            merging = merge2[[
                col1, col2, 'vector_x', 'vector_y', 'id_vec_x', 'id_vec_y'
            ]]
            merging = merging.fillna('')
            records = merging.to_records(index=False)
            res = list(records)
            complete_query = {"SELECT": "%s,%s" % (col1, col2), "FROM": node}
            for (term1, term2, vec1, vec2, id_vec1, id_vec2) in res:
                key = '%s~%s' % (term1, term2)
                vec_dict[key] = dict()
                vec_dict[key]['ids'] = [int(id_vec1), int(id_vec2)]
            new_group = get_group(rel_name,
                                  'relational',
                                  vec_dict,
                                  query=complete_query)
            result.setdefault(rel_name, []).append(new_group)
    return result
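A hypothetical call, assuming graph is a networkx graph whose nodes carry a 'columns' attribute and df_vectors has the word/vector/id_vec columns the merges rely on (get_group and utils are project helpers not shown here); all sample data is made up:

import networkx as nx
import pandas as pd

graph = nx.Graph()
graph.add_node('movies', columns=['title', 'genre'])  # two columns -> one row relation

df_vectors = pd.DataFrame({'word': ['inception', 'thriller'],
                           'vector': ['0.1 0.2', '0.3 0.4'],
                           'id_vec': [0, 1]})
conf = {'DATASETS_PATH': 'datasets/'}  # assumes datasets/movies.csv exists

groups = get_row_groups(df_vectors, graph, conf)
# -> {'movies.title~movies.genre': [<group pairing matched title/genre terms>]}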
Example #4
import numpy as np
import pandas as pd

import utils


def get_adjacency_vector(size, group_name, index_lookup, conf):
    # Get group elements (elements of column)
    print('Get adjacency vector for group:', group_name)
    table_name, column_name = utils.get_column_data_from_label(group_name, 'column')
    df = pd.read_csv(conf['DATASETS_PATH'] + table_name + '.csv', na_filter=False)
    res = df[column_name]
    res = res.fillna('')
    group_elements = []
    for x in res:
        # Modified the following rows to support multi-words per token
        tokenize_x = utils.tokenize(x)
        for val in tokenize_x.split('_'):
            group_elements.append(group_name + '#' + val)
    # Construct vector
    vector = np.zeros(size)
    for element in group_elements:
        i = index_lookup[element]
        vector[i] = 1
    return vector
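A sketch of the label conventions this relies on, assuming utils.get_column_data_from_label splits a '<table>.<column>' group name and index_lookup covers every token that appears in the column (illustrative values only):

index_lookup = {'movies.title#inception': 0,
                'movies.title#tenet': 1,
                'movies.title#thriller': 2}
conf = {'DATASETS_PATH': 'datasets/'}  # assumes datasets/movies.csv exists

# Returns a 0/1 vector with a 1 at the position of every token that
# occurs in movies.title; tokens missing from index_lookup raise KeyError
vector = get_adjacency_vector(3, 'movies.title', index_lookup, conf)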
Example #5
import numpy as np
import pandas as pd

import utils


def get_column_groups(df_vectors, graph, terms, conf):
    print("Column relation extraction started:")
    result = dict()
    for node in graph.nodes:
        columns_attr = graph.nodes[node]['columns']
        column_names = (columns_attr if isinstance(columns_attr, list)
                        else [columns_attr])
        df_node = pd.read_csv(conf['DATASETS_PATH'] + str(node) + '.csv',
                              na_filter=False)
        df_node = df_node.applymap(str)
        df_node = df_node.applymap(lambda x: utils.tokenize(x)
                                   if isinstance(x, str) else x)
        for column_name in column_names:
            print('Process %s.%s ...' % (node, column_name))
            vec_dict_fit = dict()
            vec_dict_inferred = dict()
            merging = pd.merge(df_node,
                               df_vectors,
                               how='left',
                               left_on=column_name,
                               right_on=df_vectors.word)
            merging = merging[[column_name, 'vector', 'id_vec']]
            merging = merging.fillna('')
            records = merging.to_records(index=False)
            term_vecs = list(records)
            for (term, vec_bytes, vec_id) in term_vecs:
                # Modified the following rows to support multi-words per token
                for val in term.split('_'):
                    if vec_bytes != '':
                        vec_dict_fit[val] = dict()
                        vec_dict_fit[val]['vector'] = np.array(
                            vec_bytes.split(), dtype='float32')
                        vec_dict_fit[val]['id'] = int(vec_id)
                    else:
                        # No pre-trained vector for this token: try to infer
                        # one by greedily matching the longest sub-words found
                        # in the `terms` trie and averaging their vectors
                        if val == '':
                            continue
                        splits = [x.replace('_', '') for x in val.split('_')]
                        i = 1
                        j = 0
                        # Trie node layout: [children, vector string, frequency]
                        current = [terms, None, -1]
                        vector = None
                        last_match = (0, None, -1)
                        count = 0
                        while i <= len(splits) or last_match[1] is not None:
                            sub_word = '_'.join(splits[j:i])
                            if sub_word in current[0]:
                                current = current[0][sub_word]
                                if (current[1] != '') and (current[1]
                                                           is not None):
                                    last_match = (i,
                                                  np.array(current[1].split(),
                                                           dtype='float32'),
                                                  current[2])
                            else:
                                if last_match[1] is not None:
                                    if vector is not None:
                                        if conf['TOKENIZATION'] == 'log10':
                                            vector += last_match[1] * np.log10(
                                                last_match[2])
                                            count += np.log10(last_match[2])
                                        else:  # 'simple' or different
                                            vector += last_match[1]
                                            count += 1
                                    else:
                                        if conf['TOKENIZATION'] == 'log10':
                                            vector = last_match[1] * np.log10(
                                                last_match[2])
                                            count += np.log10(last_match[2])
                                        else:  # 'simple' or different
                                            vector = last_match[1]
                                            count += 1
                                    j = last_match[0]
                                    i = j
                                    last_match = (0, None, -1)
                                else:
                                    j += 1
                                    i = j
                                current = [terms, None, -1]
                            i += 1
                        if vector is not None:
                            vector /= count
                            vec_dict_inferred[val] = dict()
                            vec_dict_inferred[val]['vector'] = vector
            result['%s.%s' % (node, column_name)] = [
                get_group('%s.%s' % (node, column_name),
                          'categorial',
                          vec_dict_fit,
                          extended=vec_dict_inferred)
            ]
    return result
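The layout of terms is not shown anywhere in these examples; judging from how the lookup walks it, each node appears to be a [children, vector-as-string, frequency] triple keyed by progressively longer '_'-joined sub-words. A guessed illustration:

# Guessed trie layout: node = [children dict, vector string or None, frequency]
terms = {
    'new': [
        {'new_york': [{}, '0.1 0.2', 42]},  # sub-word path with a known vector
        None,                               # 'new' alone carries no vector
        -1,
    ],
}
# With conf['TOKENIZATION'] == 'log10', each matched sub-word vector is
# weighted by log10 of its frequency before averaging; any other setting
# weights all matches equally.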
Example #6
import pandas as pd

import utils


def get_relation_groups(df_vectors, graph, conf):
    # Assumption: two tables are directly related by at most one foreign-key relation
    print("Table relation extraction started:")
    result = dict()
    for (node1, node2, attrs) in graph.edges.data():
        table1, table2 = node1, node2
        df_table1 = pd.read_csv(conf['DATASETS_PATH'] + str(table1) + '.csv',
                                na_filter=False)
        df_table1 = df_table1.applymap(str)
        df_table1 = df_table1.applymap(lambda x: utils.tokenize(x)
                                       if isinstance(x, str) else x)
        df_table2 = pd.read_csv(conf['DATASETS_PATH'] + str(table2) + '.csv',
                                na_filter=False)
        df_table2 = df_table2.applymap(str)
        df_table2 = df_table2.applymap(lambda x: utils.tokenize(x)
                                       if isinstance(x, str) else x)
        key_col1, key_col2 = attrs['col1'], attrs['col2']
        columns_attr1 = graph.nodes[node1]['columns']
        column_names1 = (columns_attr1 if isinstance(columns_attr1, list)
                         else [columns_attr1])
        columns_attr2 = graph.nodes[node2]['columns']
        column_names2 = (columns_attr2 if isinstance(columns_attr2, list)
                         else [columns_attr2])
        # Columns shared by both tables get pandas' _x/_y suffixes after the
        # join, which the cases below have to account for
        intersection_as_list = list(set(column_names1) & set(column_names2))
        for col1 in column_names1:
            for col2 in column_names2:
                print('Process %s.%s~%s.%s ...' % (node1, col1, node2, col2))
                # Connect source with target
                vec_dict = dict()
                rel_name = '%s.%s~%s.%s' % (node1, col1, node2, col2)
                # '-' marks a direct foreign-key join; any other name refers
                # to an n:m link table (handled in the else branch)
                if attrs['name'] == '-':
                    merge1 = pd.merge(df_table1,
                                      df_table2,
                                      left_on=key_col1,
                                      right_on=key_col2)
                    if (col1 in intersection_as_list) and (
                            col2 in intersection_as_list):
                        merge2 = pd.merge(merge1,
                                          df_vectors,
                                          left_on=col1 + '_x',
                                          right_on=df_vectors.word)
                        merge3 = pd.merge(merge2,
                                          df_vectors,
                                          left_on=col2 + '_y',
                                          right_on=df_vectors.word)
                        merging = merge3[[
                            col1 + '_x', col2 + '_y', 'vector_x', 'vector_y',
                            'id_vec_x', 'id_vec_y'
                        ]]
                    elif (col1 not in intersection_as_list) and (
                            col2 not in intersection_as_list):
                        merge2 = pd.merge(merge1,
                                          df_vectors,
                                          left_on=col1,
                                          right_on=df_vectors.word)
                        merge3 = pd.merge(merge2,
                                          df_vectors,
                                          left_on=col2,
                                          right_on=df_vectors.word)
                        merging = merge3[[
                            col1, col2, 'vector_x', 'vector_y', 'id_vec_x',
                            'id_vec_y'
                        ]]
                    elif (col1 in intersection_as_list) and (
                            col2 not in intersection_as_list):
                        merge2 = pd.merge(merge1,
                                          df_vectors,
                                          left_on=col1 + '_x',
                                          right_on=df_vectors.word)
                        merge3 = pd.merge(merge2,
                                          df_vectors,
                                          left_on=col2,
                                          right_on=df_vectors.word)
                        merging = merge3[[
                            col1 + '_x', col2, 'vector_x', 'vector_y',
                            'id_vec_x', 'id_vec_y'
                        ]]
                    else:
                        merge2 = pd.merge(merge1,
                                          df_vectors,
                                          left_on=col1,
                                          right_on=df_vectors.word)
                        merge3 = pd.merge(merge2,
                                          df_vectors,
                                          left_on=col2 + '_y',
                                          right_on=df_vectors.word)
                        merging = merge3[[
                            col1, col2 + '_y', 'vector_x', 'vector_y',
                            'id_vec_x', 'id_vec_y'
                        ]]
                    # Construct complete query for reconstruction
                    complete_query = {
                        "SELECT": "%s,%s" % (col1, col2),
                        "FROM": table1,
                        "JOIN": table2,
                        "LEFT_ON": key_col1,
                        "RIGHT_ON": key_col2
                    }
                else:
                    pkey_col1 = graph.nodes[node1]['pkey']
                    pkey_col2 = graph.nodes[node2]['pkey']
                    rel_tab_name = attrs['name']
                    df_rel_tab = pd.read_csv(conf['DATASETS_PATH'] +
                                             str(rel_tab_name) + '.csv',
                                             na_filter=False)
                    df_rel_tab = df_rel_tab.applymap(str)
                    df_rel_tab = df_rel_tab.applymap(lambda x: utils.tokenize(
                        x) if isinstance(x, str) else x)
                    merge1 = pd.merge(df_table1,
                                      df_rel_tab,
                                      left_on=pkey_col1,
                                      right_on=key_col1)
                    merge2 = pd.merge(merge1,
                                      df_table2,
                                      left_on=key_col2,
                                      right_on=pkey_col2)
                    merge3 = pd.merge(merge2,
                                      df_vectors,
                                      left_on=col1,
                                      right_on=df_vectors.word)
                    merge4 = pd.merge(merge3,
                                      df_vectors,
                                      left_on=col2,
                                      right_on=df_vectors.word)
                    merging = merge4[[
                        col1, col2, 'vector_x', 'vector_y', 'id_vec_x',
                        'id_vec_y'
                    ]]
                    # Construct complete query for reconstruction
                    complete_query = {
                        "SELECT": "%s,%s" % (col1, col2),
                        "FROM": table1,
                        "JOIN": [rel_tab_name, table2],
                        "LEFT_ON": [pkey_col1, key_col2],
                        "RIGHT_ON": [key_col1, pkey_col2]
                    }
                merging = merging.fillna('')
                records = merging.to_records(index=False)
                res = list(records)
                for (term1, term2, vec1_bytes, vec2_bytes, vec1_id,
                     vec2_id) in res:
                    key = '%s~%s' % (term1, term2)
                    vec_dict[key] = dict()
                    vec_dict[key]['ids'] = [int(vec1_id), int(vec2_id)]
                new_group = get_group(attrs['name'],
                                      'relational',
                                      vec_dict,
                                      query=complete_query)
                result.setdefault(rel_name, []).append(new_group)
    return result
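A hypothetical schema graph covering both join cases the function distinguishes, written with networkx attributes exactly as the function reads them; all table and column names are invented:

import networkx as nx

graph = nx.Graph()
graph.add_node('movies', columns=['title'], pkey='mid')
graph.add_node('genres', columns=['name'], pkey='gid')

# Direct foreign-key join: name is '-' and col1/col2 are the join columns
graph.add_edge('movies', 'genres', name='-', col1='genre_fk', col2='gid')

# n:m join via a link table instead: name is the link table and col1/col2
# its foreign-key columns, each joined against the tables' primary keys
# graph.add_edge('movies', 'genres', name='movie_genre',
#                col1='movie_id', col2='genre_id')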