Example #1
def make_word_embedding(input_file: Path, word_embedding: str):
    # Load the pretrained word embedding model
    wv = KeyedVectors.load_word2vec_format(word_embedding,
                                           binary=False,
                                           encoding='utf-8',
                                           unicode_errors='ignore')

    word_set = set()
    # Split the text into words
    with input_file.open('r') as f:
        for line in tqdm(f):
            json_line = json.loads(line)
            word_set = word_set.union(set(jieba.lcut(json_line['text'])))

    stoi = defaultdict(int)
    itos = defaultdict(str)
    vectors = []
    add_pad_unk(stoi, itos, vectors, wv)
    for word in word_set:
        if word in wv.vocab:
            stoi[word] = len(stoi)
            itos[len(itos)] = word
            vectors.append(wv.get_vector(word))
    word_embedding = WordEmbedding(stoi=stoi, itos=itos, vectors=vectors)

    # Split the text into characters
    char_set = set()
    with input_file.open('r') as f:
        for line in tqdm(f):
            json_line = json.loads(line)
            char_set = char_set.union(set(list(json_line['text'])))

    stoi = defaultdict(int)
    itos = defaultdict(str)
    vectors = []
    add_pad_unk(stoi, itos, vectors, wv)
    for char in char_set:
        if char in wv.vocab:
            stoi[char] = len(stoi)
            itos[len(itos)] = char
            vectors.append(wv.get_vector(char))

    char_embedding = WordEmbedding(stoi=stoi, itos=itos, vectors=vectors)

    word_embedding_cache = Path(
        '../word_embedding/.cache/medical_word_embedding.pkl')
    char_embedding_cache = Path(
        '../word_embedding/.cache/medical_char_embedding.pkl')
    with word_embedding_cache.open('wb') as f:
        pickle.dump(word_embedding, f)
    with char_embedding_cache.open('wb') as f:
        pickle.dump(char_embedding, f)
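This example assumes the gensim 3.x KeyedVectors interface (wv.vocab, wv.get_vector) and two project-specific pieces that are not shown: the WordEmbedding container and the add_pad_unk helper. A minimal sketch of what such a helper could look like, under the assumption that index 0 is a padding token and index 1 an unknown token, each mapped to a zero vector:

import numpy as np

def add_pad_unk(stoi, itos, vectors, wv):
    # Hypothetical helper: reserve index 0 for '<pad>' and index 1 for '<unk>',
    # each backed by a zero vector of the model's dimensionality.
    for token in ('<pad>', '<unk>'):
        stoi[token] = len(stoi)
        itos[len(itos)] = token
        vectors.append(np.zeros(wv.vector_size, dtype='float32'))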
Example #2
    def ocr_feature_generation_h5(self):
        self.node_feature_h5.create_dataset(
            "ocr_token_embeddings", (self.n_images, self.max_n_ocr, 300),
            dtype='float32')
        self.node_feature_h5.create_dataset("ocr_bounding_boxes",
                                            (self.n_images, self.max_n_ocr, 8),
                                            dtype='float32')

        word_embed = WordEmbedding('fasttext', self.word_emb_config)
        for image_index in tqdm(range(self.n_images),
                                unit='image',
                                desc='Ocr feature generation'):
            image_id = self.image_ix_to_id[str(image_index)]
            image_ocr = self.ocr[image_id]
            image_ocr_token_embeddings = np.zeros((self.max_n_ocr, 300),
                                                  dtype='float32')
            image_ocr_bounding_boxes = np.zeros((self.max_n_ocr, 8),
                                                dtype='float32')
            for ocr_index, (ocr_token, bbox) in enumerate(image_ocr.items()):
                image_ocr_token_embeddings[ocr_index] = word_embed(ocr_token)
                image_ocr_bounding_boxes[ocr_index] = np.array(bbox).flatten()
            self.node_feature_h5["ocr_token_embeddings"][image_index] = \
                image_ocr_token_embeddings
            self.node_feature_h5["ocr_bounding_boxes"][image_index] = \
                image_ocr_bounding_boxes
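Because the method only writes the two HDF5 datasets, a quick sanity check is to read them back with h5py; 'node_features.h5' below is a placeholder for whatever path self.node_feature_h5 was opened with:

import h5py

with h5py.File('node_features.h5', 'r') as f:        # placeholder file name
    ocr_embeddings = f['ocr_token_embeddings'][0]     # (max_n_ocr, 300) float32
    ocr_boxes = f['ocr_bounding_boxes'][0]            # (max_n_ocr, 8) float32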
Example #3
    def __init__(self, args):
        super().__init__()
        self.args = args
        self.bn = nn.BatchNorm1d(args.hidden_size)
        self.word_embed = WordEmbedding(args.vocab_size, 300, 0.0)
        self.sent_embed = nn.LSTM(args.embed_size,
                                  args.hidden_size,
                                  2,
                                  dropout=args.dropout,
                                  batch_first=True)
        self.sent_embed = DynamicRNN(self.sent_embed)
        self.hist_embed = nn.LSTM(args.embed_size,
                                  args.hidden_size,
                                  2,
                                  dropout=args.dropout,
                                  batch_first=True)
        self.hist_embed = DynamicRNN(self.hist_embed)
        self.bup_att = FIND(2048, 1024, 1024)

        self.q_net = FCNet([1024, 1024])
        self.v_net = FCNet([2048, 1024])
        self.linear = nn.Linear(args.hidden_size * 2, args.hidden_size)

        self.layer_stack = nn.ModuleList([
            REFER(d_model=512,
                  d_inner=1024,
                  n_head=4,
                  d_k=256,
                  d_v=256,
                  dropout=0.2) for _ in range(2)
        ])
Example #4
 def __init__(self, opt):
     self.opt = opt
     print('> training arguments:')
     for arg in vars(opt):
         print('>>> {0}: {1}'.format(arg, getattr(opt, arg)))
     w2v_path = {
         'word': '/data/word2vec/sgns.financial.word',
         'word_bigram': '/data/word2vec/sgns.financial.bigram',
         'word_char': '/data/word2vec/sgns.financial.char',
         'word_char_bigram': '/data/word2vec/sgns.financial.bigram-char',
         'char': '/data/word2vec/sgns.char'
     }
     embed = WordEmbedding(
         os.path.dirname(__file__) + w2v_path[opt.vector_level])
     c2v_path = '/data/word2vec/sgns.char'
     charEmbed = WordEmbedding(os.path.dirname(__file__) + c2v_path)
     train_set = TextDataSet(os.path.dirname(__file__) +
                             '/data/test_data.csv',
                             embed,
                             charEmbed,
                             max_seq_len=opt.max_seq_len,
                             vector_level=opt.vector_level,
                             train=True)
     test_set = TextDataSet(os.path.dirname(__file__) +
                            '/data/test_data.csv',
                            embed,
                            charEmbed,
                            max_seq_len=opt.max_seq_len,
                            vector_level=opt.vector_level,
                            test=True)
     self.train_data_loader = DataLoader(dataset=train_set,
                                         batch_size=opt.batch_size,
                                         shuffle=True)
     self.test_data_loader = DataLoader(dataset=test_set,
                                        batch_size=opt.batch_size,
                                        shuffle=True)
     self.writer = SummaryWriter(log_dir=opt.logdir)
     if opt.model_name in ['tmp', 'lstm_cnn']:
         self.model = opt.model_class(embed.m, charEmbed.m,
                                      opt).to(opt.device)
     else:
         self.model = opt.model_class(embed.m, opt).to(opt.device)
     self.reset_parameters()
Example #5
 def object_name_embedding_generation(self):
     word_embed = WordEmbedding('glove', self.word_emb_config)
     for image_index in tqdm(range(self.n_images),
                             unit='image',
                             desc='Object name embedding generation'):
         image_id = self.image_ix_to_id[str(image_index)]
         image_nodes = self.nodes[image_id]
         n_objects = self.image_n_objects[image_index]
         image_object_name_embeddings = np.zeros((n_objects, 300),
                                                 dtype='float32')
         for object_index in range(n_objects):
             image_object_name_embeddings[object_index] = word_embed(
                 image_nodes[object_index])
         with open(
                 os.path.join(self.dir['object_name_embeddings'],
                              '{}.p'.format(image_index)), 'wb') as f:
             pickle.dump(image_object_name_embeddings, f)
Example #6
 def object_name_embedding_generation_h5(self):
     self.node_feature_h5.create_dataset("object_name_embeddings",
                                         (self.n_images, 36, 300),
                                         dtype='float32')
     word_embed = WordEmbedding('glove', self.word_emb_config)
     for image_index in tqdm(range(self.n_images),
                             unit='image',
                             desc='Object name embedding generation'):
         image_id = self.image_ix_to_id[str(image_index)]
         image_nodes = self.nodes[image_id]
         n_objects = self.image_n_objects[image_index]
         image_object_name_embeddings = np.zeros((36, 300), dtype='float32')
         for object_index in range(n_objects):
             image_object_name_embeddings[object_index] = word_embed(
                 image_nodes[object_index])
         self.node_feature_h5['object_name_embeddings'][image_index] = \
             image_object_name_embeddings
Example #7
    def ocr_feature_generation(self):
        word_embed = WordEmbedding('fasttext', self.word_emb_config)

        for image_index in tqdm(range(self.n_images),
                                unit='image',
                                desc='Ocr feature generation'):
            image_id = self.image_ix_to_id[str(image_index)]
            image_ocr = self.ocr[image_id]
            n_ocr = len(image_ocr)
            image_ocr_token_embeddings = np.zeros((n_ocr, 300),
                                                  dtype='float32')
            image_ocr_bounding_boxes = np.zeros((n_ocr, 8), dtype='float32')
            for ocr_index, (ocr_token, bbox) in enumerate(image_ocr.items()):
                image_ocr_token_embeddings[ocr_index] = word_embed(ocr_token)
                image_ocr_bounding_boxes[ocr_index] = np.array(bbox).flatten()
            with open(
                    os.path.join(self.dir['ocr_token_embeddings'],
                                 '{}.p'.format(image_index)), 'wb') as f:
                pickle.dump(image_ocr_token_embeddings, f)
            with open(
                    os.path.join(self.dir['ocr_bounding_boxes'],
                                 '{}.p'.format(image_index)), 'wb') as f:
                pickle.dump(image_ocr_bounding_boxes, f)
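Unlike the HDF5 variant above, this version pickles one array per image. A minimal read-back sketch, where 'features/ocr_token_embeddings' stands in for the directory stored in self.dir:

import os
import pickle

image_index = 0  # any index in range(n_images)
path = os.path.join('features/ocr_token_embeddings', '{}.p'.format(image_index))
with open(path, 'rb') as f:
    image_ocr_token_embeddings = pickle.load(f)  # ndarray, shape (n_ocr, 300)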
Example #8
    def __init__(self, opt):
        self.opt = opt

        w2v_path = {
            'word': '/data/word2vec/sgns.financial.word',
            'word_bigram': '/data/word2vec/sgns.financial.bigram',
            'char': '/data/word2vec/sgns.financial.char',
            'char_bigram': '/data/word2vec/sgns.financial.bigram-char'
        }
        embed = WordEmbedding(os.path.dirname(__file__) +
                              w2v_path[opt.vector_level],
                              initializer='avg')
        data_set = TextDataSet(os.path.dirname(__file__) +
                               '/data/single_2500_temp.csv',
                               embed,
                               max_seq_len=opt.max_seq_len,
                               vector_level=opt.vector_level)
        self.data_loader = DataLoader(dataset=data_set, batch_size=512)

        self.model = opt.model_class(embed.m, opt).to(opt.device)
        self.model.load_state_dict(
            torch.load(
                os.path.dirname(__file__) + '/best/' + opt.model_name +
                '_best.pkl')['state_dict'])
Example #9
 def __init__(self, config):
     self.config = config
     self.word_embedding = WordEmbedding.WordEmbedding(config)
Example #10
    def __init__(self,
                 path_to_node_info='data/node_information.csv',
                 path_to_training_set='data/training_set.txt',
                 path_to_test_set='data/testing_set.txt',
                 path_to_wv_model=None,
                 load_graph_dict=True):
        """The ComputeFeatures class enables the user to compute multiple features.
        Consider using the static `import_from_file` static function so as not to have to compute the dataframe every single time.
        :param path_to_node_info: the `node_information.csv` file from the kaggle competition
        :param path_to_training_set: the `training_set`
        :param path_to_wv_model: the path to the wv model, if you do not want to train it from scratch."""

        self.handled_variables = [
            "publication_2", "adam_coeff", "overlapping_words_in_title",
            "number_of_common_authors", "difference_of_years",
            "affinity_between_authors", "identical_journal", "l2_distance",
            "cosine_distance_tfid", "l2_distance_between_titles",
            "common_neighbors", "clustering_coeff", "betweenness", "closeness",
            "degree", "eigenvector", "jaccard_coeff", "shortest_path",
            "pagerank", "community", "lp_within_inter_cluster",
            "lp_ra_index_soundarajan", "lp_cn_soundarajan",
            "lp_preferential_attachment", "lp_resource_allocation_index"
        ]

        print("Loading node information...")
        self.node_information = pd.read_csv(
            path_to_node_info,
            names=['id', 'year', 'title', 'author', 'journal', 'abstract'])
        self.node_information = self.node_information.set_index('id')

        print("Loading train array...")
        self.train_array = np.loadtxt(path_to_training_set, dtype=int)
        self.nb_training_samples = self.train_array.shape[0]

        print("Loading test array...")
        self.test_array = np.loadtxt(path_to_test_set, dtype=int)
        self.nb_testing_samples = self.test_array.shape[0]

        # for tokenization
        print("Loading stemmer and stop words...")
        nltk.download('punkt')
        self.stemmer = nltk.stem.PorterStemmer()
        nltk.download('stopwords')
        self.stpwds = set(nltk.corpus.stopwords.words("english"))

        print("TfidVectorizer...")
        training_words = list(self.node_information['abstract'])
        vectorizer = TfidfVectorizer(analyzer='word',
                                     ngram_range=(1, 3),
                                     min_df=0,
                                     stop_words="english")
        features_tfid = vectorizer.fit_transform(training_words)
        self.node_information["wv_tfid"] = pd.Series(
            [x for x in features_tfid])

        print("Creating word embeddings...")
        self.wv = WordEmbedding(self.stemmer, self.stpwds)
        if path_to_wv_model is None:
            print("Training wv model with standard params...")
            self.wv.train_model(self.node_information)
        else:
            print("Loading wv model from %s" % path_to_wv_model)
            self.wv.load_model(path_to_wv_model)

        # Create publication column
        print("Creating publication column...")
        self.node_information['publication'] = self.node_information.apply(
            lambda row: [], axis=1)
        for t in self.train_array:
            if t[2] == 1:
                self.node_information.loc[t[0], 'publication'].append(t[1])
                self.node_information.loc[t[1], 'publication'].append(t[0])

        # Create publication_II column
        print("Creating publication II column...")
        self.node_information['publication_2'] = self.node_information.apply(
            lambda row: list_of_publication_2(row, self.node_information),
            axis=1)

        # Authors dict
        print("Creating authors dictionary...")
        authors_list = []
        self.node_information['author'].apply(
            lambda row: stack_lists(row, authors_list))
        authors_list = [
            auth for auth in authors_list
            if auth not in ['', '&', "(", ")"] and len(auth) > 2
        ]
        self.authors_dict = dict(
            (auth, []) for auth in np.unique(authors_list))
        del authors_list
        self.node_information['author'].apply(
            lambda row: put_authors_in_dict(row, self.authors_dict))
        for k in self.authors_dict.keys():
            while k in self.authors_dict[k]:
                self.authors_dict[k].remove(k)
            while '' in self.authors_dict[k]:
                self.authors_dict[k].remove('')

        # Feature vector for the abstract
        print("Making feature vectors for the abstract...")
        self.node_information['wv'] = self.node_information.apply(
            lambda row: make_feature_vector(row.loc['abstract'], self.wv.model),
            axis=1)

        print("Making feature vectors for the title...")
        self.node_information['title_wv'] = self.node_information.apply(
            lambda row: make_feature_vector(row.loc['title'], self.wv.model),
            axis=1)

        # Graph
        print("Making graph structure...")
        self.graph_structure = GraphStructure(self, load_graph_dict)

        self.abstract_feature_model = None
Example #11
class ComputeFeatures:
    def __init__(self,
                 path_to_node_info='data/node_information.csv',
                 path_to_training_set='data/training_set.txt',
                 path_to_test_set='data/testing_set.txt',
                 path_to_wv_model=None,
                 load_graph_dict=True):
        """The ComputeFeatures class enables the user to compute multiple features.
        Consider using the static `import_from_file` static function so as not to have to compute the dataframe every single time.
        :param path_to_node_info: the `node_information.csv` file from the kaggle competition
        :param path_to_training_set: the `training_set`
        :param path_to_wv_model: the path to the wv model, if you do not want to train it from scratch."""

        self.handled_variables = [
            "publication_2", "adam_coeff", "overlapping_words_in_title",
            "number_of_common_authors", "difference_of_years",
            "affinity_between_authors", "identical_journal", "l2_distance",
            "cosine_distance_tfid", "l2_distance_between_titles",
            "common_neighbors", "clustering_coeff", "betweenness", "closeness",
            "degree", "eigenvector", "jaccard_coeff", "shortest_path",
            "pagerank", "community", "lp_within_inter_cluster",
            "lp_ra_index_soundarajan", "lp_cn_soundarajan",
            "lp_preferential_attachment", "lp_resource_allocation_index"
        ]

        print("Loading node information...")
        self.node_information = pd.read_csv(
            path_to_node_info,
            names=['id', 'year', 'title', 'author', 'journal', 'abstract'])
        self.node_information = self.node_information.set_index('id')

        print("Loading train array...")
        self.train_array = np.loadtxt(path_to_training_set, dtype=int)
        self.nb_training_samples = self.train_array.shape[0]

        print("Loading test array...")
        self.test_array = np.loadtxt(path_to_test_set, dtype=int)
        self.nb_testing_samples = self.test_array.shape[0]

        # for tokenization
        print("Loading stemmer and stop words...")
        nltk.download('punkt')
        self.stemmer = nltk.stem.PorterStemmer()
        nltk.download('stopwords')
        self.stpwds = set(nltk.corpus.stopwords.words("english"))

        print("TfidVectorizer...")
        training_words = list(self.node_information['abstract'])
        vectorizer = TfidfVectorizer(analyzer='word',
                                     ngram_range=(1, 3),
                                     min_df=0,
                                     stop_words="english")
        features_tfid = vectorizer.fit_transform(training_words)
        self.node_information["wv_tfid"] = pd.Series(
            [x for x in features_tfid])

        print("Creating word embeddings...")
        self.wv = WordEmbedding(self.stemmer, self.stpwds)
        if path_to_wv_model is None:
            print("Training wv model with standard params...")
            self.wv.train_model(self.node_information)
        else:
            print("Loading wv model from %s" % path_to_wv_model)
            self.wv.load_model(path_to_wv_model)

        # Create publication column
        print("Creating publication column...")
        self.node_information['publication'] = self.node_information.apply(
            lambda row: [], axis=1)
        for t in self.train_array:
            if t[2] == 1:
                self.node_information.loc[t[0], 'publication'].append(t[1])
                self.node_information.loc[t[1], 'publication'].append(t[0])

        # Create publication_II column
        print("Creating publication II column...")
        self.node_information['publication_2'] = self.node_information.apply(
            lambda row: list_of_publication_2(row, self.node_information),
            axis=1)

        # Authors dict
        print("Creating authors dictionary...")
        authors_list = []
        self.node_information['author'].apply(
            lambda row: stack_lists(row, authors_list))
        authors_list = [
            auth for auth in authors_list
            if auth not in ['', '&', "(", ")"] and len(auth) > 2
        ]
        self.authors_dict = dict(
            (auth, []) for auth in np.unique(authors_list))
        del authors_list
        self.node_information['author'].apply(
            lambda row: put_authors_in_dict(row, self.authors_dict))
        for k in self.authors_dict.keys():
            while k in self.authors_dict[k]:
                self.authors_dict[k].remove(k)
            while '' in self.authors_dict[k]:
                self.authors_dict[k].remove('')

        # Feature vector for the abstract
        print("Making feature vectors for the abstract...")
        self.node_information['wv'] = self.node_information.apply(
            lambda row: make_feature_vector(row.loc['abstract'], self.wv.model),
            axis=1)

        print("Making feature vectors for the title...")
        self.node_information['title_wv'] = self.node_information.apply(
            lambda row: make_feature_vector(row.loc['title'], self.wv.model),
            axis=1)

        # Graph
        print("Making graph structure...")
        self.graph_structure = GraphStructure(self, load_graph_dict)

        self.abstract_feature_model = None

    @staticmethod
    def import_from_file(path_to_pickle):
        try:
            with open(path_to_pickle, "rb") as f:
                print("Loading ComputeFeatures object from file %s" %
                      path_to_pickle)
                return pickle.load(f)
        except FileNotFoundError:
            print("File not found ! Check '.obj' extension.")
            return -1

    def save_in_file(self, path_to_file):
        with open(path_to_file, "wb") as f:
            pickle.dump(self, f)

    def compute_multiple_variables(self,
                                   iter_of_variables,
                                   train: bool,
                                   scale: bool,
                                   load=True,
                                   save=True):
        if iter_of_variables == "all":
            iter_of_variables = self.handled_variables
        else:
            for var in iter_of_variables:
                assert var in self.handled_variables, "Variable %s is not handled. Handled variables are: %s" % (
                    var, str(self.handled_variables))
        if train:
            result = np.zeros(shape=(self.nb_training_samples,
                                     len(iter_of_variables)))
        else:
            result = np.zeros(shape=(self.nb_testing_samples,
                                     len(iter_of_variables)))
        for i in range(len(iter_of_variables)):
            result[:, i] = np.transpose(
                self.compute_variable(iter_of_variables[i],
                                      train=train,
                                      load=load,
                                      save=save))
        if scale:
            result = preprocessing.scale(result)
        for i in range(len(iter_of_variables)):
            if np.all(result[:, i] == 0):
                print("WARNING: %i th column is void ! (%s)" %
                      (i, iter_of_variables[i]))
        return result

    def compute_variable(self,
                         variable_name,
                         train: bool,
                         load=True,
                         path_to_file=None,
                         save=True):

        assert variable_name in self.handled_variables, "Variable %s is not handled. Handled variables are: %s" % (
            variable_name, str(self.handled_variables))

        if load and train:
            if path_to_file is None and os.path.isfile(
                    "variables/%s.npy" % variable_name):
                print("Loading STANDARD %s file!" % variable_name)
                result = np.load("variables/%s.npy" % variable_name)
                return result[:self.nb_training_samples]
            elif path_to_file is not None and os.path.isfile(path_to_file):
                print("Loading CUSTOM %s file!" % variable_name)
                result = np.load(path_to_file)
                return result[:self.nb_training_samples]
            print("Did not find saved %s in `variables` folder." %
                  variable_name)

        if load and not train:
            if path_to_file is None and os.path.isfile(
                    "variables/TEST_%s.npy" % variable_name):
                print("Loading STANDARD TEST_%s file!" % variable_name)
                result = np.load("variables/TEST_%s.npy" % variable_name)
                return result[:self.nb_testing_samples]
            elif path_to_file is not None and os.path.isfile(path_to_file):
                print("Loading CUSTOM %s file!" % variable_name)
                result = np.load(path_to_file)
                return result[:self.nb_testing_samples]
            print("Did not find saved TEST_%s in `variables` folder." %
                  variable_name)

        print("Starting computation of %s..." % variable_name)
        t1 = time()
        gd = self.graph_structure.graph_dicts  # graph dictionaries
        if train:
            nb_of_samples = self.nb_training_samples
        else:
            nb_of_samples = self.nb_testing_samples
        result = np.zeros(shape=nb_of_samples)
        for i in range(nb_of_samples):
            if train:
                t = self.train_array[i]
            else:
                t = self.test_array[i]
            if variable_name == "publication_2":
                result[i] = np.log(
                    len(
                        set(self.node_information.loc[t[0], "publication_2"])
                        & set(self.node_information.loc[t[1],
                                                        "publication_2"])) + 1)
            elif variable_name == "adam_coeff":
                if train:
                    if t[2] == 1:
                        self.graph_structure.g.remove_edge(t[0], t[1])
                        result[i] = \
                            next(nx.algorithms.link_prediction.adamic_adar_index(self.graph_structure.g,
                                                                                 [(t[0], t[1])]))[2]
                        self.graph_structure.g.add_edge(t[0], t[1])
                    else:
                        result[i] = \
                            next(nx.algorithms.link_prediction.adamic_adar_index(self.graph_structure.g,
                                                                                 [(t[0], t[1])]))[2]
                else:
                    result[i] = \
                        next(nx.algorithms.link_prediction.adamic_adar_index(self.graph_structure.g, [(t[0], t[1])]))[2]
            elif variable_name == "overlapping_words_in_title":
                result[i] = compute_intersection(
                    self.node_information.loc[t[0], "title"],
                    self.node_information.loc[t[1], "title"], self.stemmer,
                    self.stpwds)
            elif variable_name == "number_of_common_authors":
                result[i] = nbr_common_authors(
                    self.node_information.loc[t[0], "author"],
                    self.node_information.loc[t[1], "author"])

            elif variable_name == "difference_of_years":
                result[i] = abs(self.node_information.loc[t[0], 'year'] -
                                self.node_information.loc[t[1], 'year'])

            elif variable_name == "affinity_between_authors":
                result[i] = compute_affinity_between_authors(
                    self.node_information.loc[t[0], 'author'],
                    self.node_information.loc[t[1],
                                              'author'], self.authors_dict)
            elif variable_name == "identical_journal":
                result[i] = int(self.node_information.loc[t[0], 'journal'] ==
                                self.node_information.loc[t[1], 'journal'])

            elif variable_name == "l2_distance":
                result[i] = np.linalg.norm(
                    self.node_information.loc[t[0], 'wv'] -
                    self.node_information.loc[t[1], 'wv'])

            elif variable_name == "cosine_distance_tfid":
                v1 = self.node_information.loc[t[0], "wv_tfid"]
                v2 = self.node_information.loc[t[1], "wv_tfid"]
                try:
                    b1 = np.isnan(v1)
                except TypeError:
                    b1 = False
                try:
                    b2 = np.isnan(v2)
                except TypeError:
                    b2 = False
                if not b1 and not b2:
                    result[i] = cosine_similarity(v1, v2)
                else:
                    result[i] = 0

            elif variable_name == "l2_distance_between_titles":
                dst = np.linalg.norm(
                    self.node_information.loc[t[0], 'title_wv'] -
                    self.node_information.loc[t[1], 'title_wv'])
                if np.isnan(dst):
                    result[i] = 0
                else:
                    result[i] = dst

            # elif variable_name == "cosine_distance_between_titles":
            #     result[i] = cosine_distances(
            #         np.nan_to_num(self.node_information.loc[t[0], 'title_wv']).reshape(-1, 1) - (self.node_information.loc[t[1], 'title_wv']).reshape(-1, 1)
            #     )[0][0]

            elif variable_name == "common_neighbors":
                result[i] = len(
                    sorted(
                        nx.common_neighbors(self.graph_structure.g, t[0],
                                            t[1])))

            elif variable_name == "clustering_coeff":
                result[i] = gd["clustering_coeff"][
                    t[0]] * gd["clustering_coeff"][t[1]]

            elif variable_name == "betweenness":
                result[i] = gd["betweenness"][t[0]] * gd["betweenness"][t[1]]

            elif variable_name == "closeness":
                result[i] = gd["closeness"][t[0]] * gd["closeness"][t[1]]

            elif variable_name == "degree":
                result[i] = gd["degree"][t[0]] * gd["degree"][t[1]]

            elif variable_name == "eigenvector":
                result[i] = gd["eigenvector"][t[0]] * gd["eigenvector"][t[1]]

            elif variable_name == "jaccard_coeff":
                if train:
                    if t[2] == 1:
                        self.graph_structure.g.remove_edge(t[0], t[1])
                        result[i] = next(
                            nx.jaccard_coefficient(self.graph_structure.g,
                                                   [(t[0], t[1])]))[2]
                        self.graph_structure.g.add_edge(t[0], t[1])
                    else:
                        result[i] = next(
                            nx.jaccard_coefficient(self.graph_structure.g,
                                                   [(t[0], t[1])]))[2]
                else:
                    result[i] = next(
                        nx.jaccard_coefficient(self.graph_structure.g,
                                               [(t[0], t[1])]))[2]
            elif variable_name == "shortest_path":
                if train:
                    if t[2] == 1:
                        assert self.graph_structure.g.has_edge(
                            t[0], t[1]
                        ), "There's a problem with the structure of the graph for id %i and %i" % (
                            t[0], t[1])
                        self.graph_structure.g.remove_edge(t[0], t[1])
                        try:
                            result[i] = 1 / nx.shortest_path_length(
                                self.graph_structure.g, t[0], t[1])
                        except nx.NetworkXNoPath:
                            result[i] = 0
                        self.graph_structure.g.add_edge(t[0], t[1])
                    else:
                        try:
                            result[i] = 1 / nx.shortest_path_length(
                                self.graph_structure.g, t[0], t[1])
                        except nx.NetworkXNoPath:
                            result[i] = 0
                else:
                    try:
                        result[i] = 1 / nx.shortest_path_length(
                            self.graph_structure.g, t[0], t[1])
                    except nx.NetworkXNoPath:
                        result[i] = 0

            elif variable_name == "pagerank":
                result[i] = gd["pagerank"][t[0]] * gd["pagerank"][t[1]]

            elif variable_name == "community":
                if self.graph_structure.partition[
                        t[0]] == self.graph_structure.partition[t[1]]:
                    result[i] = 1
                else:
                    result[i] = 0

            elif variable_name == "lp_resource_allocation_index":
                if train:
                    if t[2] == 1:
                        self.graph_structure.g.remove_edge(t[0], t[1])
                        result[i] = sorted(
                            nx.resource_allocation_index(
                                self.graph_structure.g, [(t[0], t[1])]))[0][2]
                        self.graph_structure.g.add_edge(t[0], t[1])
                    else:
                        result[i] = sorted(
                            nx.resource_allocation_index(
                                self.graph_structure.g, [(t[0], t[1])]))[0][2]
                else:
                    result[i] = sorted(
                        nx.resource_allocation_index(self.graph_structure.g,
                                                     [(t[0], t[1])]))[0][2]

            elif variable_name == "lp_preferential_attachment":
                if train:
                    if t[2] == 1:
                        self.graph_structure.g.remove_edge(t[0], t[1])
                        result[i] = sorted(
                            nx.preferential_attachment(self.graph_structure.g,
                                                       [(t[0], t[1])]))[0][2]
                        self.graph_structure.g.add_edge(t[0], t[1])
                    else:
                        result[i] = sorted(
                            nx.preferential_attachment(self.graph_structure.g,
                                                       [(t[0], t[1])]))[0][2]
                else:
                    result[i] = sorted(
                        nx.preferential_attachment(self.graph_structure.g,
                                                   [(t[0], t[1])]))[0][2]
            elif variable_name == "lp_cn_soundarajan":
                if train:
                    if t[2] == 1:
                        self.graph_structure.g.remove_edge(t[0], t[1])
                        result[i] = sorted(
                            nx.cn_soundarajan_hopcroft(self.graph_structure.g,
                                                       [(t[0], t[1])]))[0][2]
                        self.graph_structure.g.add_edge(t[0], t[1])
                    else:
                        result[i] = sorted(
                            nx.cn_soundarajan_hopcroft(self.graph_structure.g,
                                                       [(t[0], t[1])]))[0][2]
                else:
                    result[i] = sorted(
                        nx.cn_soundarajan_hopcroft(self.graph_structure.g,
                                                   [(t[0], t[1])]))[0][2]
            elif variable_name == "lp_ra_index_soundarajan":
                if train:
                    if t[2] == 1:
                        self.graph_structure.g.remove_edge(t[0], t[1])
                        result[i] = sorted(
                            nx.ra_index_soundarajan_hopcroft(
                                self.graph_structure.g, [(t[0], t[1])]))[0][2]
                        self.graph_structure.g.add_edge(t[0], t[1])
                    else:
                        result[i] = sorted(
                            nx.ra_index_soundarajan_hopcroft(
                                self.graph_structure.g, [(t[0], t[1])]))[0][2]
                else:
                    result[i] = sorted(
                        nx.ra_index_soundarajan_hopcroft(
                            self.graph_structure.g, [(t[0], t[1])]))[0][2]

            elif variable_name == "lp_within_inter_cluster":

                if train:
                    if t[2] == 1:
                        self.graph_structure.g.remove_edge(t[0], t[1])
                        result[i] = sorted(
                            nx.within_inter_cluster(self.graph_structure.g,
                                                    [(t[0], t[1])]))[0][2]
                        self.graph_structure.g.add_edge(t[0], t[1])
                    else:
                        result[i] = sorted(
                            nx.within_inter_cluster(self.graph_structure.g,
                                                    [(t[0], t[1])]))[0][2]
                else:
                    result[i] = sorted(
                        nx.within_inter_cluster(self.graph_structure.g,
                                                [(t[0], t[1])]))[0][2]

        print("Did %s column in %5.1fs" % (variable_name, time() - t1))
        if save and train:
            np.save("variables/" + variable_name, result)
            print("Saved variable %s in `variables` directory." %
                  variable_name)
        if save and not train:
            np.save("variables/TEST_" + variable_name, result)
            print("Saved variable TEST_%s in `variables` directory." %
                  variable_name)
        if np.isnan(result).any():
            print("Careful, you have NaN values!")
            result[np.isnan(result)] = 0
        return result
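A typical end-to-end session, assuming the default data paths exist and using 'features.obj' as an arbitrary cache name, builds the feature matrices once and saves the object so later runs can skip the expensive setup:

cf = ComputeFeatures()
X_train = cf.compute_multiple_variables("all", train=True, scale=True)
X_test = cf.compute_multiple_variables("all", train=False, scale=True)
cf.save_in_file('features.obj')
# Later runs: cf = ComputeFeatures.import_from_file('features.obj')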