    def __init__(self, negative_rate=1, start_ratio=0.0, end_ratio=0.81, use_cache=True):
        self.__competitor_path = path_lib.get_relative_file_path('runtime', f'competitor_linkedin_dict_format_{VERSION}.json')
        self.__embedding_path = path_lib.get_relative_file_path(
            'runtime', 'processed_input', f'd_linkedin_name_2_embeddings_{VERSION}.pkl')
        self.__negative_rate = negative_rate
        self.__start_ratio = start_ratio
        self.__end_ratio = end_ratio

        self.__data = self.__load(use_cache)
Example #2
def new_line(output=False):
    with open(path_lib.get_relative_file_path('log', MODEL, f'{VARIANT}.log'),
              'ab') as f:
        f.write('\n'.encode('utf-8'))

    if output:
        print('')
Example #3
    def __load(self, use_cache):
        """ Load the data as embeddings """

        cache_path = path_lib.get_relative_file_path(
            'runtime', 'input_cache', f'company_embeddings_{VERSION}.pkl')
        if use_cache and os.path.isfile(cache_path):
            return path_lib.read_cache(cache_path)

        print(f'\nloading data from {self.__competitor_path} ...')
        with open(self.__competitor_path, 'rb') as f:
            tmp = json.load(f)
        d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']

        data = []

        print('loading sentence bert to generate embeddings ...')
        from sentence_transformers import SentenceTransformer
        self.__sentence_bert = SentenceTransformer(
            'bert-large-nli-stsb-mean-tokens')

        # converting the raw data to features that we need
        for linkedin_name, linkedin_val in d_linkedin_name_2_linkedin_val.items():
            # get features
            feature = self.__choose_features(linkedin_val)
            data.append([feature, linkedin_name])

        print('writing cache ...')
        path_lib.cache(cache_path, data)

        print('finished loading')
        return data
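__choose_features is not shown in this example. A minimal sketch of what such a feature extractor might look like, assuming the LinkedIn record is a dict with textual fields such as 'name' and 'description' (both field names are assumptions) and the feature is the sentence-BERT embedding of the concatenated text:

    def __choose_features(self, linkedin_val):
        # hypothetical field names: 'name' and 'description' are assumptions,
        # not confirmed by the example above
        text = ' '.join(str(linkedin_val.get(key, ''))
                        for key in ('name', 'description')).strip()
        # SentenceTransformer.encode returns a numpy array for a single string
        return self.__sentence_bert.encode(text)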
Example #4
def new_paragraph(output=False):
    string = '\n\n----------------------------------------------\n'
    with open(path_lib.get_relative_file_path('log', MODEL, f'{VARIANT}.log'),
              'ab') as f:
        f.write(string.encode('utf-8'))

    if output:
        print(string)
Example #5
def statistic():
    print('\ncomputing statistics on shared competitors among competitors ... ')

    # record statistics
    shared_competitor_counts = []

    # avoid counting each pair twice
    d_min_name_max_name_2_has_statistic = {}

    length = len(d_name_2_competitors)
    _i = 0
    for _name_1, competitors in d_name_2_competitors.items():
        if _i % 2 == 0:
            progress = float(_i + 1) / length * 100.
            print('\rprogress: %.2f%% ' % progress, end='')
        _i += 1

        for _name_2 in competitors:
            # skip pairs already counted
            key = f'{min(_name_1, _name_2)}____{max(_name_1, _name_2)}'
            if key in d_min_name_max_name_2_has_statistic:
                continue
            d_min_name_max_name_2_has_statistic[key] = True

            if _name_2 not in d_name_2_competitors:
                shared_competitor_counts.append(0)
                continue

            shared_num = len(competitors.intersection(d_name_2_competitors[_name_2]))
            shared_competitor_counts.append(shared_num)

    logs.new_line()
    logs.add('statistics', 'total count of competitor companies', f'{len(d_name_2_competitors)}', output=True)
    logs.add('statistics', 'mean of shared competitors', f'among competitors: {np.mean(shared_competitor_counts)}',
             output=True)
    logs.add('statistics', 'std of shared competitors', f'among competitors: {np.std(shared_competitor_counts)}',
             output=True)
    logs.add('statistics', 'max of shared competitors', f'among competitors: {np.max(shared_competitor_counts)}',
             output=True)
    logs.add('statistics', 'min of shared competitors', f'among competitors: {np.min(shared_competitor_counts)}',
             output=True)

    bins = list(range(0, 53, 1))
    plt.figure(figsize=(18, 8))
    plt.hist(shared_competitor_counts, bins=bins, edgecolor='white')
    plt.title(
        'histogram for count of shared competitors among competitors',
        fontsize=22)
    plt.xlabel('count of shared competitors for each similar company pair', fontsize=16)
    plt.ylabel('count of company pairs', fontsize=16)
    plt.xticks(bins)
    plt.savefig(path_lib.get_relative_file_path(
        'runtime', 'analysis', 'figures', 'hist_for_shared_competitor_among_competitors.png'),
        dpi=300)
    plt.show()
    plt.close()
Example #6
def add(_id, function, message, _level=LEVEL_MSG, output=False):
    # construct log message
    _time = str(time.strftime('%Y-%m-%d %H:%M:%S'))
    string = f'{_id} : {_level} : {_time} : {function} : {message}\n'

    # write log
    with open(path_lib.get_relative_file_path('log', MODEL, f'{VARIANT}.log'),
              'ab') as f:
        f.write(string.encode('utf-8'))

    # show to console
    if output:
        print(string.strip())
Example #7
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from lib import path_lib

# load the similarity data
similarity_csv_path = path_lib.get_relative_file_path('runtime', 'result_csv', 'linkedin_similarity.csv')
similarity_csv = pd.read_csv(similarity_csv_path)

similarities = list(similarity_csv['similarity'])
competitor_similarities = list(similarity_csv[similarity_csv['is_competitor'] == 'competitor']['similarity'])

print(f'\n\ncount of all company pairs: {len(similarities)}')
print(f'mean similarity for all company pairs: {np.mean(similarities)}')
print(f'std similarity for all company pairs: {np.std(similarities)}')
print(f'max similarity for all company pairs: {np.max(similarities)}')
print(f'min similarity for all company pairs: {np.min(similarities)}')

print(f'\ncount of competitor relationship: {len(competitor_similarities)}')
print(f'mean similarity for competitors: {np.mean(competitor_similarities)}')
print(f'std similarity for competitors: {np.std(competitor_similarities)}')
print(f'max similarity for competitors: {np.max(competitor_similarities)}')
print(f'min similarity for competitors: {np.min(competitor_similarities)}')

plt.figure(figsize=(14, 8))
plt.subplot(211)
plt.hist(similarities, bins=30, edgecolor='white')
plt.title('histogram for similarity of all company pairs', fontsize=22)
plt.xlabel('similarity', fontsize=16)
plt.ylabel('count of company pairs', fontsize=16)
plt.xticks(list(np.linspace(0.1, 1., 10)), fontsize=14)
Example #8
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from lib import path_lib

# load the json data
json_path = path_lib.get_relative_file_path(
    'runtime', 'competitor_linkedin_dict_format_v4.json')
tmp = path_lib.load_json(json_path)
d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
d_min_linkedin_name_max_linkedin_name = tmp[
    'd_min_linkedin_name_max_linkedin_name']

d_name = set()
for min_name_max_name in d_min_linkedin_name_max_linkedin_name.keys():
    name_1, name_2 = min_name_max_name.split('____')
    d_name.add(name_1)
    d_name.add(name_2)

# load the similarity data
similarity_csv_path = path_lib.get_relative_file_path(
    'runtime', 'result_csv', 'linkedin_similarity.csv')
similarity_csv = pd.read_csv(similarity_csv_path)
similarity_data = np.array(similarity_csv)

similarity_data = list(
    filter(lambda x: x[0] in d_name or x[2] in d_name, similarity_data))
similarities = list(map(lambda x: x[-2], similarity_data))

competitor_similarities = list(similarity_csv[similarity_csv['is_competitor']
                                              == 'competitor']['similarity'])
Example #9
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from lib import path_lib
from config.path import VERSION

print('\nloading the embeddings ... ')

# load embeddings
pkl_path = path_lib.get_relative_file_path('runtime', 'input_cache', f'company_embeddings_{VERSION}.pkl')
company_embeddings = path_lib.read_cache(pkl_path)
X, names = list(zip(*company_embeddings))
X = np.array(X)
names = np.array(names)

print('\ncalculating the cosine distance ... ')

# calculate the cosine distance
distances = cdist(X, X, 'cosine')

# convert cosine distance to a similarity score in (0, 1]
similarities = 1 - np.tanh(distances)

print('\nloading the competitor data ... ')

# load linkedin data
json_path = path_lib.get_relative_file_path('runtime', f'competitor_linkedin_dict_format_{VERSION}.json')
tmp = path_lib.load_json(json_path)
d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
d_min_linkedin_name_max_linkedin_name = tmp['d_min_linkedin_name_max_linkedin_name']
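The example stops before the top-k lookup its earlier comment mentions. A minimal sketch of that step, assuming we want the k most similar companies per company; after an ascending argsort the last column of each row is the company itself (self-similarity is maximal), so it is dropped:

k = 10  # assumption: how many neighbours to keep

# argsort ascending puts the largest similarities at the end of each row;
# the very last column is the company itself, so drop it
top_k_idx = np.argsort(similarities, axis=1)[:, -(k + 1):-1]

# preview the neighbours of the first few companies, most similar first
for i in range(3):
    print(names[i], '->', list(names[top_k_idx[i][::-1]]))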
Example #10
import numpy as np
from scipy.spatial.distance import cdist
from matplotlib import pyplot as plt
from lib import path_lib
from lib import logs

print('\nloading the competitor data ... ')

# load the json data
json_path = path_lib.get_relative_file_path('runtime', 'competitor_linkedin_dict_format_v4.json')
tmp = path_lib.load_json(json_path)
d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
d_min_linkedin_name_max_linkedin_name = tmp['d_min_linkedin_name_max_linkedin_name']

print('\nformatting the competitor data structure ... ')

d_name_2_competitors = {}

name_pairs = list(d_min_linkedin_name_max_linkedin_name.keys())
for min_name_max_name in name_pairs:
    name_1, name_2 = min_name_max_name.split('____')

    if name_1 not in d_name_2_competitors:
        d_name_2_competitors[name_1] = set()
    d_name_2_competitors[name_1].add(name_2)

    if name_2 not in d_name_2_competitors:
        d_name_2_competitors[name_2] = set()
    d_name_2_competitors[name_2].add(name_1)

print('\nloading the embeddings ... ')
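The membership checks in the loop above can be folded away with collections.defaultdict; an equivalent sketch:

from collections import defaultdict

d_name_2_competitors = defaultdict(set)
for min_name_max_name in d_min_linkedin_name_max_linkedin_name.keys():
    name_1, name_2 = min_name_max_name.split('____')
    # a missing key is created as an empty set on first access
    d_name_2_competitors[name_1].add(name_2)
    d_name_2_competitors[name_2].add(name_1)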
Example #11

    def __init__(self, use_cache=True):
        self.__competitor_path = path_lib.get_relative_file_path(
            'runtime', f'competitor_linkedin_dict_format_{VERSION}.json')
        self.__data = self.__load(use_cache)
Example #12
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from lib import path_lib
from lib import format_name

# load the json data
json_path = path_lib.get_relative_file_path(
    'runtime', 'competitor_linkedin_dict_format_v4.json')
tmp = path_lib.load_json(json_path)
d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
d_min_linkedin_name_max_linkedin_name = tmp[
    'd_min_linkedin_name_max_linkedin_name']

# get all the competitor names
names = list(
    map(lambda x: x.split('____'),
        d_min_linkedin_name_max_linkedin_name.keys()))
names_1, names_2 = list(zip(*names))
names = list(set(names_1 + names_2))
path_lib.write_json(
    path_lib.get_relative_file_path('runtime', 'competitor_names.json'), names)

public_csv = pd.read_csv(
    path_lib.get_relative_file_path('QualityControl_2016_result.csv'))
public_data = np.array(public_csv)
columns = list(public_csv.columns)

print('\nexamples of the QualityControl_2016_result')
for i, v in enumerate(columns):
    print(f'{i} : {v} : {public_data[0][i]} : {public_data[1][i]}')
Example #13
    def __load(self, use_cache):
        """ Load the data as embeddings """

        cache_path = path_lib.get_relative_file_path(
            'runtime', 'input_cache',
            f'neg_rate_{self.__negative_rate}_v2.pkl')
        if use_cache and os.path.isfile(cache_path):
            return path_lib.read_cache(cache_path)

        print(f'\nloading data from {self.__competitor_path} ...')
        tmp = path_lib.load_json(self.__competitor_path)
        d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
        d_min_linkedin_name_max_linkedin_name = tmp[
            'd_min_linkedin_name_max_linkedin_name']

        names = list(d_linkedin_name_2_linkedin_val.keys())

        print('generating the positive and negative competitor relationships ... ')

        data = []

        print('loading sentence bert to generate embeddings ...')
        from sentence_transformers import SentenceTransformer
        self.__sentence_bert = SentenceTransformer(
            'bert-large-nli-stsb-mean-tokens')

        for min_name_max_name in d_min_linkedin_name_max_linkedin_name.keys():
            name_1, name_2 = min_name_max_name.split('____')

            # get features
            feature_1 = self.__choose_features(
                d_linkedin_name_2_linkedin_val[name_1])
            feature_2 = self.__choose_features(
                d_linkedin_name_2_linkedin_val[name_2])

            # add positive competitor relationship
            data.append([feature_1, feature_2, 1, name_1, name_2])

            # add negative competitor relationships
            for _ in range(self.__negative_rate):
                # sample a random non-competitor to pair with name_1
                name_2_neg = self.__random_choose(
                    names, name_1, d_min_linkedin_name_max_linkedin_name)
                feature_2_neg = self.__choose_features(
                    d_linkedin_name_2_linkedin_val[name_2_neg])

                # sample a random non-competitor to pair with name_2
                name_1_neg = self.__random_choose(
                    names, name_2, d_min_linkedin_name_max_linkedin_name)
                feature_1_neg = self.__choose_features(
                    d_linkedin_name_2_linkedin_val[name_1_neg])

                data.append([feature_1, feature_2_neg, 0, name_1, name_2_neg])
                data.append([feature_1_neg, feature_2, 0, name_1_neg, name_2])

        print('shuffling the data ...')
        random.shuffle(data)

        print('writing cache ...')
        path_lib.cache(cache_path, data)

        print('finished loading')
        return data
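__random_choose is not shown in this example. A minimal sketch of such a negative-sampling helper, assuming it rejection-samples until the candidate is neither the anchor company itself nor one of its known competitors (random is already imported by the surrounding module):

    def __random_choose(self, names, anchor_name, d_min_name_max_name):
        # hypothetical implementation: draw candidates until one does not
        # form a known competitor pair with anchor_name
        while True:
            candidate = random.choice(names)
            if candidate == anchor_name:
                continue
            key = f'{min(anchor_name, candidate)}____{max(anchor_name, candidate)}'
            if key not in d_min_name_max_name:
                return candidate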
Example #15
def statistic(_top_k_similar):
    # keep the _top_k_similar most similar columns; reverse the column order
    # so the most similar company comes first
    _top_k_idx = top_k_idx[:, -_top_k_similar:]
    _top_k_idx = _top_k_idx[:, ::-1]

    print(f'\ncomputing statistics on shared competitors among the top {_top_k_similar} similar companies of all LinkedIn companies ... ')

    # record statistics
    shared_competitor_counts = []

    # avoid counting each pair twice
    d_min_name_max_name_2_has_statistic = {}

    length = len(names)
    for _i, _name_1 in enumerate(names):

        if _i % 2 == 0:
            progress = float(_i + 1) / length * 100.
            print('\rprogress: %.2f%% ' % progress, end='')

        similar_names = names[_top_k_idx[_i]]

        for _name_2 in similar_names:

            # skip pairs already counted
            key = f'{min(_name_1, _name_2)}____{max(_name_1, _name_2)}'
            if key in d_min_name_max_name_2_has_statistic:
                continue
            d_min_name_max_name_2_has_statistic[key] = True

            if _name_1 not in d_name_2_competitors or _name_2 not in d_name_2_competitors:
                shared_competitor_counts.append(0)
                continue

            competitor_set_1 = d_name_2_competitors[_name_1]
            competitor_set_2 = d_name_2_competitors[_name_2]

            shared_num = len(competitor_set_1.intersection(competitor_set_2))
            shared_competitor_counts.append(shared_num)

    logs.new_line()
    logs.add('statistics', 'total count of companies', f'{len(names)}', output=True)
    logs.add('statistics', 'mean of shared competitors', f'among top {_top_k_similar} similar companies: {np.mean(shared_competitor_counts)}', output=True)
    logs.add('statistics', 'std of shared competitors', f'among top {_top_k_similar} similar companies: {np.std(shared_competitor_counts)}', output=True)
    logs.add('statistics', 'max of shared competitors', f'among top {_top_k_similar} similar companies: {np.max(shared_competitor_counts)}', output=True)
    logs.add('statistics', 'min of shared competitors', f'among top {_top_k_similar} similar companies: {np.min(shared_competitor_counts)}', output=True)

    num_0 = len(list(filter(lambda x: x == 0, shared_competitor_counts)))
    shared_competitor_counts = list(filter(lambda x: x > 0, shared_competitor_counts))

    plt.figure(figsize=(14, 8))
    plt.hist(shared_competitor_counts, bins=[0.1, 1, 2, 3, 4, 5, 10, 20, 40], edgecolor='white')
    plt.title(
        f'histogram for count of shared competitors among top {_top_k_similar} similar companies of all LinkedIn companies\n({num_0} pairs with zero shared competitors removed)',
        fontsize=22)
    plt.xlabel('count of shared competitors for each similar company pair', fontsize=16)
    plt.ylabel('count of company pairs', fontsize=16)
    plt.xticks([0, 1, 2, 3, 4, 5, 10, 20, 40])
    plt.savefig(
        path_lib.get_relative_file_path('runtime', 'analysis', 'figures',
                                        f'hist_for_shared_competitor_among_top_{_top_k_similar}_similar_companies.png'),
        dpi=300)
    plt.show()
    plt.close()
Example #16
    def __load(self, use_cache):
        """ Load the data as embeddings """

        cache_path = path_lib.get_relative_file_path(
            'runtime', 'input_cache',
            f'neg_rate_{self.__negative_rate}_start_{self.__start_ratio}_end_{self.__end_ratio}.pkl'
        )
        if use_cache and os.path.isfile(cache_path):
            return path_lib.read_cache(cache_path)

        print(f'\nloading data from {self.__competitor_path} ...')
        with open(self.__competitor_path, 'rb') as f:
            tmp = json.load(f)

        d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
        d_min_linkedin_name_max_linkedin_name = tmp[
            'd_min_linkedin_name_max_linkedin_name']

        self.__d_linkedin_name_2_embedding = path_lib.load_pkl(
            self.__embedding_path)

        print('splitting dataset ...')
        name_pairs = list(d_min_linkedin_name_max_linkedin_name.keys())
        name_pairs.sort()

        total_pairs = len(name_pairs)
        start_index = int(total_pairs * self.__start_ratio)
        end_index = int(total_pairs * self.__end_ratio)
        name_pairs = name_pairs[start_index:end_index]

        names = list(d_linkedin_name_2_linkedin_val.keys())

        print('generating the positive and negative competitor relationships ... ')

        data = []

        for min_name_max_name in name_pairs:
            name_1, name_2 = min_name_max_name.split('____')

            # get features
            feature_1 = self.__choose_features(
                name_1, d_linkedin_name_2_linkedin_val[name_1])
            feature_2 = self.__choose_features(
                name_2, d_linkedin_name_2_linkedin_val[name_2])

            # add positive competitor relationship
            data.append([feature_1, feature_2, 1, name_1, name_2])

            # add negative competitor relationship
            for _ in range(int(self.__negative_rate * 2)):
                if random.randint(0, 1) == 0:
                    # randomly choose negative competitor relationship
                    name_2_neg = self.__random_choose(
                        names, name_1, d_min_linkedin_name_max_linkedin_name)
                    feature_2_neg = self.__choose_features(
                        name_2_neg, d_linkedin_name_2_linkedin_val[name_2_neg])
                    data.append(
                        [feature_1, feature_2_neg, 0, name_1, name_2_neg])

                else:
                    # randomly choose negative competitor relationship
                    name_1_neg = self.__random_choose(
                        names, name_2, d_min_linkedin_name_max_linkedin_name)
                    feature_1_neg = self.__choose_features(
                        name_1_neg, d_linkedin_name_2_linkedin_val[name_1_neg])
                    data.append(
                        [feature_1_neg, feature_2, 0, name_1_neg, name_2])

        print('shuffling the data ...')
        random.shuffle(data)

        print('writing cache ...')
        path_lib.cache(cache_path, data)

        print('finished loading')
        return data
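The start_ratio/end_ratio arguments slice a sorted pair list proportionally, which keeps train/test splits deterministic across runs. A self-contained sketch of that arithmetic on toy data (slice_by_ratio is a name introduced here for illustration):

# toy pair list, sorted exactly like name_pairs in __load above
name_pairs = sorted(['a____b', 'a____c', 'b____c', 'b____d', 'c____d'])

def slice_by_ratio(pairs, start_ratio, end_ratio):
    # same arithmetic as __load: proportional indices into the sorted list
    start_index = int(len(pairs) * start_ratio)
    end_index = int(len(pairs) * end_ratio)
    return pairs[start_index:end_index]

train_pairs = slice_by_ratio(name_pairs, 0.0, 0.81)  # first 81% of pairs
test_pairs = slice_by_ratio(name_pairs, 0.81, 1.0)   # remaining 19%
print(train_pairs)
print(test_pairs)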
Example #17
import numpy as np
from scipy.spatial.distance import cdist
from lib import path_lib
from config.path import VERSION

# load embeddings
pkl_path = path_lib.get_relative_file_path(
    'runtime', 'input_cache', f'company_embeddings_{VERSION}.pkl')
company_embeddings = path_lib.read_cache(pkl_path)
X, names = list(zip(*company_embeddings))
X = np.array(X)
names = np.array(names)

# format embeddings
d_linkedin_name_2_embeddings = {}
for i, embedding in enumerate(X):
    d_linkedin_name_2_embeddings[names[i]] = embedding

# save results
path_lib.cache(
    path_lib.get_relative_file_path(
        'runtime', 'processed_input',
        f'd_linkedin_name_2_embeddings_{VERSION}.pkl'),
    d_linkedin_name_2_embeddings)

# load the data
json_path = path_lib.get_relative_file_path(
    'runtime', f'competitor_linkedin_dict_format_{VERSION}.json')
tmp = path_lib.load_json(json_path)
d_linkedin_name_2_linkedin_val = tmp['d_linkedin_name_2_linkedin_val']
d_min_linkedin_name_max_linkedin_name = tmp[
    'd_min_linkedin_name_max_linkedin_name']
Example #18
        find_string = 'test_evaluation : recall: '
        if find_string in line:
            _recall.append(line[line.index(find_string) + len(find_string):])

        find_string = 'test_evaluation : f1: '
        if find_string in line:
            _f1.append(line[line.index(find_string) + len(find_string):])

    return _top_k, _acc, _precision, _recall, _f1


data = []
log_dir = path_lib.create_dir_in_root('log', 'fixed_mean_sent_emb_similarity')

for file_name in os.listdir(log_dir):
    file_path = os.path.join(log_dir, file_name)

    with open(file_path, 'rb') as f:
        content = f.readlines()

    content = list(map(lambda x: x.decode('utf-8').strip(), content))
    data += list(zip(*parse_log(content)))

data = list(map(list, data))
data.sort()

df = pd.DataFrame(data, columns=['top_k', 'acc', 'precision', 'recall', 'f1'])
df.to_csv(path_lib.get_relative_file_path('runtime', 'result_csv', 'test_similarity.csv'), index=False)

print('\ndone')
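The top of parse_log is cut off in this example; judging from the returned tuple and the recall/f1 pattern shown above, the missing part presumably extracts each metric the same way. A hedged reconstruction of the full function (the 'top_k', 'acc', and 'precision' marker strings are assumptions):

def parse_log(lines):
    # reconstructed sketch mirroring the recall/f1 extraction shown above;
    # the first three marker strings are assumptions
    _top_k, _acc, _precision, _recall, _f1 = [], [], [], [], []

    for line in lines:
        find_string = 'test_evaluation : top_k: '
        if find_string in line:
            _top_k.append(line[line.index(find_string) + len(find_string):])

        find_string = 'test_evaluation : acc: '
        if find_string in line:
            _acc.append(line[line.index(find_string) + len(find_string):])

        find_string = 'test_evaluation : precision: '
        if find_string in line:
            _precision.append(line[line.index(find_string) + len(find_string):])

        find_string = 'test_evaluation : recall: '
        if find_string in line:
            _recall.append(line[line.index(find_string) + len(find_string):])

        find_string = 'test_evaluation : f1: '
        if find_string in line:
            _f1.append(line[line.index(find_string) + len(find_string):])

    return _top_k, _acc, _precision, _recall, _f1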