# --- Example 1 ---
# format embeddings: map each company name to its embedding vector
# NOTE(review): assumes `X` (embedding matrix), `names`, `VERSION`,
# `path_lib`, `cdist` and `np` are defined earlier in the file — confirm.
d_linkedin_name_2_embeddings = dict(zip(names, X))

# save results
path_lib.cache(
    path_lib.get_relative_file_path(
        'runtime', 'processed_input',
        f'd_linkedin_name_2_embeddings_{VERSION}.pkl'),
    d_linkedin_name_2_embeddings)

# load the data
json_path = path_lib.get_relative_file_path(
    'runtime', f'competitor_linkedin_dict_format_{VERSION}.json')
tmp = path_lib.load_json(json_path)
d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
d_min_linkedin_name_max_linkedin_name = tmp[
    'd_min_linkedin_name_max_linkedin_name']

# calculate the pairwise cosine distance between all embeddings
distances = cdist(X, X, 'cosine')

# Exclude self-matches: set the diagonal to 2 (beyond the max cosine
# distance) so a company never ranks as its own nearest neighbour.
np.fill_diagonal(distances, 2)

# Map distances to similarities (monotonically decreasing, so argsort
# order is simply reversed relative to distances).
similarities = 1 - np.tanh(distances)

# Take the indices of the 600 most similar companies for each row.
# BUG FIX: reverse along axis 1 ([:, ::-1]) so each row runs from most
# to least similar — the original `[::-1]` reversed the ROW order
# (i.e. the order of companies), leaving each row ascending.
top_k_idx = similarities.argsort()[:, -600:]
top_k_idx = top_k_idx[:, ::-1]

# format data
# --- Example 2 ---
    def __load(self, use_cache):
        """ Load the data as embeddings """

        cache_path = path_lib.get_relative_file_path(
            'runtime', 'input_cache',
            f'neg_rate_{self.__negative_rate}_v2.pkl')
        # Serve the cached pickle when allowed and present.
        if use_cache and os.path.isfile(cache_path):
            return path_lib.read_cache(cache_path)

        print(f'\nloading data from {self.__competitor_path} ...')
        raw = path_lib.load_json(self.__competitor_path)
        d_linkedin_name_2_linkedin_val = raw['d_linkedin_name_2_linkedin_val']
        d_min_linkedin_name_max_linkedin_name = raw[
            'd_min_linkedin_name_max_linkedin_name']

        # Candidate pool for negative sampling.
        names = list(d_linkedin_name_2_linkedin_val.keys())

        print(
            'generating the positive and negative competitor relationships ... '
        )

        data = []

        # Sentence-BERT is imported lazily so the heavy model is only
        # loaded when the cache misses.
        print('loading sentence bert to generate embeddings ...')
        from sentence_transformers import SentenceTransformer
        self.__sentence_bert = SentenceTransformer(
            'bert-large-nli-stsb-mean-tokens')

        for pair_key in d_min_linkedin_name_max_linkedin_name.keys():
            name_1, name_2 = pair_key.split('____')

            # features of both companies in the known competitor pair
            feature_1 = self.__choose_features(
                d_linkedin_name_2_linkedin_val[name_1])
            feature_2 = self.__choose_features(
                d_linkedin_name_2_linkedin_val[name_2])

            # positive sample: the real competitor relationship
            data.append([feature_1, feature_2, 1, name_1, name_2])

            # negative samples: pair each side with randomly chosen
            # non-competitors, `negative_rate` times
            for _ in range(self.__negative_rate):
                name_2_neg = self.__random_choose(
                    names, name_1, d_min_linkedin_name_max_linkedin_name)
                feature_2_neg = self.__choose_features(
                    d_linkedin_name_2_linkedin_val[name_2_neg])

                name_1_neg = self.__random_choose(
                    names, name_2, d_min_linkedin_name_max_linkedin_name)
                feature_1_neg = self.__choose_features(
                    d_linkedin_name_2_linkedin_val[name_1_neg])

                data.append([feature_1, feature_2_neg, 0, name_1, name_2_neg])
                data.append([feature_1_neg, feature_2, 0, name_1_neg, name_2])

        print('shuffling the data ...')
        random.shuffle(data)

        print('writing cache ...')
        path_lib.cache(cache_path, data)

        print('finish loading ')
        return data
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from lib import path_lib
from lib import format_name

# load the json data
json_path = path_lib.get_relative_file_path(
    'runtime', 'competitor_linkedin_dict_format_v4.json')
tmp = path_lib.load_json(json_path)
d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
d_min_linkedin_name_max_linkedin_name = tmp[
    'd_min_linkedin_name_max_linkedin_name']

# Each key encodes a competitor pair as "<min_name>____<max_name>";
# split every key and collect the unique set of company names.
names = [key.split('____')
         for key in d_min_linkedin_name_max_linkedin_name.keys()]
names_1, names_2 = list(zip(*names))
names = list(set(names_1 + names_2))
path_lib.write_json(
    path_lib.get_relative_file_path('runtime', 'competitor_names.json'), names)

# Load the public quality-control results for comparison.
public_csv = pd.read_csv(
    path_lib.get_relative_file_path('QualityControl_2016_result.csv'))
public_data = np.array(public_csv)
columns = list(public_csv.columns)

# Preview the first two rows, column by column.
print('\nexamples of the QualityControl_2016_result')
for i, v in enumerate(columns):
    print(f'{i} : {v} : {public_data[0][i]} : {public_data[1][i]}')
    def __load(self, use_cache):
        """ Load the data as embeddings

        Builds a list of [feature_1, feature_2, label, name_1, name_2]
        samples: one positive per known competitor pair (label 1) plus
        2 * negative_rate randomly drawn negatives (label 0), restricted
        to the [start_ratio, end_ratio) slice of the sorted pair keys.
        Results are shuffled and cached as a pickle.

        :param use_cache: when truthy and the cache pickle exists,
            return the cached data instead of regenerating it
        :return: list of [feature_1, feature_2, label, name_1, name_2]
        """

        cache_path = path_lib.get_relative_file_path(
            'runtime', 'input_cache',
            f'similar_version_neg_rate_{self.__negative_rate}_start_{self.__start_ratio}_end_{self.__end_ratio}.pkl'
        )
        if use_cache and os.path.isfile(cache_path):
            return path_lib.read_cache(cache_path)

        print(f'\nloading data from {self.__competitor_path} ...')
        tmp = path_lib.load_json(self.__competitor_path)
        d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
        d_min_linkedin_name_max_linkedin_name = tmp[
            'd_min_linkedin_name_max_linkedin_name']

        # precomputed embeddings and per-company similar-name lists used
        # by __choose_features / __random_choose
        self.__d_linkedin_name_2_embedding = path_lib.load_pkl(
            self.__embedding_path)
        self.__d_linkedin_name_2_similar_names = path_lib.load_json(
            self.__similar_company_path)

        print('splitting dataset ...')
        # sort so the start/end ratio slice is deterministic across runs
        name_pairs = list(d_min_linkedin_name_max_linkedin_name.keys())
        name_pairs.sort()

        total_pairs = len(name_pairs)
        start_index = int(total_pairs * self.__start_ratio)
        end_index = int(total_pairs * self.__end_ratio)
        name_pairs = name_pairs[start_index:end_index]

        print(
            'generating the positive and negative competitor relationships ... '
        )

        data = []

        # loop-invariant: number of negatives drawn per side of each pair
        negatives_per_side = int(self.__negative_rate)

        for min_name_max_name in name_pairs:
            name_1, name_2 = min_name_max_name.split('____')

            # get features
            feature_1 = self.__choose_features(
                name_1, d_linkedin_name_2_linkedin_val[name_1])
            feature_2 = self.__choose_features(
                name_2, d_linkedin_name_2_linkedin_val[name_2])

            # add positive competitor relationship
            data.append([feature_1, feature_2, 1, name_1, name_2])

            # add negative competitor relationships: pair each side with
            # randomly chosen non-competitors
            for _ in range(negatives_per_side):
                name_2_neg = self.__random_choose(name_1)
                feature_2_neg = self.__choose_features(
                    name_2_neg, d_linkedin_name_2_linkedin_val[name_2_neg])
                data.append([feature_1, feature_2_neg, 0, name_1, name_2_neg])

                name_1_neg = self.__random_choose(name_2)
                feature_1_neg = self.__choose_features(
                    name_1_neg, d_linkedin_name_2_linkedin_val[name_1_neg])
                data.append([feature_1_neg, feature_2, 0, name_1_neg, name_2])

        print('shuffling the data ...')
        random.shuffle(data)

        print('writing cache ...')
        path_lib.cache(cache_path, data)

        print('finish loading ')
        return data