def __load(self, use_cache):
    """Load the company records and turn them into embedding features.

    When *use_cache* is True and a cache file already exists, the cached
    [feature, linkedin_name] pairs are returned directly; otherwise the
    raw json is read, features are generated, and the result is cached.
    """
    cache_path = path_lib.get_relative_file_path(
        'runtime', 'input_cache', f'company_embeddings_{VERSION}.pkl')
    if use_cache and os.path.isfile(cache_path):
        return path_lib.read_cache(cache_path)

    print(f'\nloading data from {self.__competitor_path} ...')
    with open(self.__competitor_path, 'rb') as f:
        raw = json.load(f)
    d_linkedin_name_2_linkedin_val = raw['d_linkedin_name_2_linkedin_val']

    print('loading sentence bert to generate embeddings ...')
    from sentence_transformers import SentenceTransformer
    self.__sentence_bert = SentenceTransformer(
        'bert-large-nli-stsb-mean-tokens')

    # convert each raw record into a [feature, linkedin_name] row
    data = [
        [self.__choose_features(linkedin_val), linkedin_name]
        for linkedin_name, linkedin_val in d_linkedin_name_2_linkedin_val.items()
    ]

    print('writing cache ...')
    path_lib.cache(cache_path, data)

    print('finish loading ')
    return data
# Exemple #2
# 0
import numpy as np
from scipy.spatial.distance import cdist
from lib import path_lib
from config.path import VERSION

# load the cached [embedding, name] pairs
pkl_path = path_lib.get_relative_file_path(
    'runtime', 'input_cache', f'company_embeddings_{VERSION}.pkl')
company_embeddings = path_lib.read_cache(pkl_path)
X, names = list(zip(*company_embeddings))
X = np.array(X)
names = np.array(names)

# map every linkedin name to its embedding vector
d_linkedin_name_2_embeddings = {
    name: embedding for name, embedding in zip(names, X)
}

# persist the name -> embedding mapping for downstream processing
path_lib.cache(
    path_lib.get_relative_file_path(
        'runtime', 'processed_input',
        f'd_linkedin_name_2_embeddings_{VERSION}.pkl'),
    d_linkedin_name_2_embeddings)

# load the raw competitor data
json_path = path_lib.get_relative_file_path(
    'runtime', f'competitor_linkedin_dict_format_{VERSION}.json')
tmp = path_lib.load_json(json_path)
d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
# BUGFIX: this statement was truncated mid-subscript (SyntaxError); the key
# matches the identical lookup pattern used elsewhere in this file.
d_min_linkedin_name_max_linkedin_name = tmp[
    'd_min_linkedin_name_max_linkedin_name']
    def __load(self, use_cache):
        """Build positive/negative competitor pairs for this dataset slice.

        Loads the raw competitor json, keeps the deterministic slice of
        sorted name pairs between ``self.__start_ratio`` and
        ``self.__end_ratio``, and for each positive pair samples negative
        pairs at roughly ``self.__negative_rate`` per side. The result is
        cached on disk and reused when *use_cache* is True.
        """
        cache_path = path_lib.get_relative_file_path(
            'runtime', 'input_cache',
            f'neg_rate_{self.__negative_rate}_start_{self.__start_ratio}_end_{self.__end_ratio}.pkl'
        )
        if use_cache and os.path.isfile(cache_path):
            return path_lib.read_cache(cache_path)

        print(f'\nloading data from {self.__competitor_path} ...')
        with open(self.__competitor_path, 'rb') as f:
            tmp = json.load(f)

        d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
        d_min_linkedin_name_max_linkedin_name = tmp[
            'd_min_linkedin_name_max_linkedin_name']

        self.__d_linkedin_name_2_embedding = path_lib.load_pkl(
            self.__embedding_path)

        print('splitting dataset ...')
        # sort so the start/end slice is deterministic across runs
        name_pairs = list(d_min_linkedin_name_max_linkedin_name.keys())
        name_pairs.sort()

        total_pairs = len(name_pairs)
        start_index = int(total_pairs * self.__start_ratio)
        end_index = int(total_pairs * self.__end_ratio)
        name_pairs = name_pairs[start_index:end_index]

        names = list(d_linkedin_name_2_linkedin_val.keys())

        # NOTE: this progress message was printed twice in the original;
        # the duplicate has been removed.
        print(
            'generating the positive and negative competitor relationships ... '
        )

        data = []

        for min_name_max_name in name_pairs:
            name_1, name_2 = min_name_max_name.split('____')

            # get features
            feature_1 = self.__choose_features(
                name_1, d_linkedin_name_2_linkedin_val[name_1])
            feature_2 = self.__choose_features(
                name_2, d_linkedin_name_2_linkedin_val[name_2])

            # add positive competitor relationship
            data.append([feature_1, feature_2, 1, name_1, name_2])

            # add negative competitor relationship; each draw replaces
            # one side of the positive pair with a random non-competitor
            for i in range(int(self.__negative_rate * 2)):
                if random.randint(0, 1) == 0:
                    # randomly choose negative competitor relationship
                    name_2_neg = self.__random_choose(
                        names, name_1, d_min_linkedin_name_max_linkedin_name)
                    feature_2_neg = self.__choose_features(
                        name_2_neg, d_linkedin_name_2_linkedin_val[name_2_neg])
                    data.append(
                        [feature_1, feature_2_neg, 0, name_1, name_2_neg])

                else:
                    # randomly choose negative competitor relationship
                    name_1_neg = self.__random_choose(
                        names, name_2, d_min_linkedin_name_max_linkedin_name)
                    feature_1_neg = self.__choose_features(
                        name_1_neg, d_linkedin_name_2_linkedin_val[name_1_neg])
                    data.append(
                        [feature_1_neg, feature_2, 0, name_1_neg, name_2])

        print('shuffling the data ...')
        random.shuffle(data)

        print('writing cache ...')
        path_lib.cache(cache_path, data)

        print('finish loading ')
        return data
# Exemple #4
# 0
    def __load(self, use_cache):
        """Load the data as embeddings.

        Builds positive competitor pairs from every name pair in the raw
        json, and for each of them draws ``self.__negative_rate`` negative
        samples on both sides. The shuffled result is cached and reused
        when *use_cache* is True.
        """
        cache_path = path_lib.get_relative_file_path(
            'runtime', 'input_cache',
            f'neg_rate_{self.__negative_rate}_v2.pkl')
        if use_cache and os.path.isfile(cache_path):
            return path_lib.read_cache(cache_path)

        print(f'\nloading data from {self.__competitor_path} ...')
        raw = path_lib.load_json(self.__competitor_path)
        d_linkedin_name_2_linkedin_val = raw['d_linkedin_name_2_linkedin_val']
        d_min_linkedin_name_max_linkedin_name = raw[
            'd_min_linkedin_name_max_linkedin_name']

        names = list(d_linkedin_name_2_linkedin_val.keys())

        print(
            'generating the positive and negative competitor relationships ... '
        )

        data = []

        print('loading sentence bert to generate embeddings ...')
        from sentence_transformers import SentenceTransformer
        self.__sentence_bert = SentenceTransformer(
            'bert-large-nli-stsb-mean-tokens')

        for pair_key in d_min_linkedin_name_max_linkedin_name.keys():
            name_1, name_2 = pair_key.split('____')

            # features for the positive pair
            feature_1 = self.__choose_features(
                d_linkedin_name_2_linkedin_val[name_1])
            feature_2 = self.__choose_features(
                d_linkedin_name_2_linkedin_val[name_2])

            # add positive competitor relationship
            data.append([feature_1, feature_2, 1, name_1, name_2])

            # add negative competitor relationship
            for _ in range(self.__negative_rate):
                # randomly choose negative competitor relationship
                name_2_neg = self.__random_choose(
                    names, name_1, d_min_linkedin_name_max_linkedin_name)
                feature_2_neg = self.__choose_features(
                    d_linkedin_name_2_linkedin_val[name_2_neg])

                # randomly choose negative competitor relationship
                name_1_neg = self.__random_choose(
                    names, name_2, d_min_linkedin_name_max_linkedin_name)
                feature_1_neg = self.__choose_features(
                    d_linkedin_name_2_linkedin_val[name_1_neg])

                data.extend([
                    [feature_1, feature_2_neg, 0, name_1, name_2_neg],
                    [feature_1_neg, feature_2, 0, name_1_neg, name_2],
                ])

        print('shuffling the data ...')
        random.shuffle(data)

        print('writing cache ...')
        path_lib.cache(cache_path, data)

        print('finish loading ')
        return data