def execute(self):
        """Create word-embedding features for each configured target.

        Walks ``self._targeted_author_word_embeddings``; every entry is a dict
        with the keys ``"table_name"``, ``"targeted_field_name"`` and
        ``"word_embedding_type"``. For each entry the per-author embedding
        dict is loaded through ``self.load_author_guid_word_embedding_dict``
        and handed to
        ``Vector_Operations.create_features_from_word_embedding_dict``
        together with the window bounds, the DB handle, the batch limit and
        a ``"<ClassName>_"`` prefix.

        Progress is reported through ``logging.info``. Returns ``None``.
        """
        logging.info("started extracting word_embeddings feature generator:")
        targets = self._targeted_author_word_embeddings
        for position, target in enumerate(targets, 1):
            table_name = target["table_name"]
            field_name = target["targeted_field_name"]
            embedding_type = target["word_embedding_type"]

            combination = "_".join((table_name, field_name, embedding_type))
            logging.info(
                "currently extracting features of {0}: {1} out of {2}".format(
                    combination, position, len(targets)))

            # The loader takes the field name first, then the table name.
            embedding_by_author = self.load_author_guid_word_embedding_dict(
                field_name, table_name, embedding_type)
            feature_prefix = self.__class__.__name__ + '_'
            Vector_Operations.create_features_from_word_embedding_dict(
                embedding_by_author, table_name, field_name, embedding_type,
                self._word_embedding_table_name, self._window_start,
                self._window_end, self._db, self._max_objects_without_saving,
                feature_prefix)
    def execute(self):
        """Create word-embedding features for each configured target.

        Every entry of ``self._targeted_author_word_embeddings`` is a dict
        with the keys ``"table_name"``, ``"targeted_field_name"`` and
        ``"word_embedding_type"``. For each entry the per-author embedding
        dict is fetched with
        ``self._db.get_author_guid_word_embedding_vector_dict`` and passed to
        ``Vector_Operations.create_features_from_word_embedding_dict`` along
        with the window bounds, the DB handle and the batch limit.

        Progress is reported through ``logging.info``. Returns ``None``.
        """
        logging.info("started extracting word_embbeddings feature generator:")
        targets = self._targeted_author_word_embeddings
        # enumerate(start=1) replaces the hand-maintained counter; the unused
        # ``authors_features`` list of the earlier version was dropped.
        for counter, target_author_word_embeddings_dict in enumerate(
                targets, start=1):
            targeted_table = target_author_word_embeddings_dict["table_name"]
            targeted_field_name = target_author_word_embeddings_dict[
                "targeted_field_name"]
            targeted_word_embedding_type = target_author_word_embeddings_dict[
                "word_embedding_type"]

            targeted_word_embeddings_combination = (
                targeted_table + "_" + targeted_field_name + "_" +
                targeted_word_embedding_type)
            logging.info("currently extracting features of " +
                         targeted_word_embeddings_combination + ": " +
                         str(counter) + " out of " + str(len(targets)))

            author_guid_word_embeding_dict = self._db.get_author_guid_word_embedding_vector_dict(
                targeted_table, targeted_field_name,
                targeted_word_embedding_type)
            Vector_Operations.create_features_from_word_embedding_dict(
                author_guid_word_embeding_dict, targeted_table,
                targeted_field_name, targeted_word_embedding_type,
                self._window_start, self._window_end, self._db,
                self._max_objects_without_saving)
Esempio n. 3
0
    def execute(self):
        """Create similarity features for every configured pair of embeddings.

        Each item of ``self._connection_types`` is indexable with ``[0]`` and
        ``[1]``; both sides are dicts with the keys ``"table_name"``,
        ``"targeted_field_name"`` and ``"word_embedding_type"``. For every
        pair the two per-author embedding dicts are loaded from ``self._db``
        and each name in ``self._similarity_functions`` is applied to them.
        The name ``"subtruct_and_split"`` is routed to
        ``Vector_Operations.create_subtruction_dimension_features_from_authors_dict``;
        every other name goes to
        ``Vector_Operations.create_authors_feature_from_two_vectors``.
        The resulting features are stored with
        ``self.insert_author_features_to_db`` once per function.

        A single-line progress indicator is printed to stdout.
        Returns ``None``.
        """
        total = len(self._connection_types)
        # enumerate(start=1) replaces the hand-maintained 1-based counter.
        for i, connection in enumerate(self._connection_types, start=1):
            first_field = connection[0]
            first_table_name = first_field["table_name"]
            first_targeted_field_name = first_field["targeted_field_name"]
            first_word_embedding_type = first_field["word_embedding_type"]

            second_field = connection[1]
            second_table_name = second_field["table_name"]
            second_targeted_field_name = second_field["targeted_field_name"]
            second_word_embedding_type = second_field["word_embedding_type"]

            # '\r' with end='' keeps the progress indicator on one line.
            print(
                '\r {0}/{1} Current connection:{2}_{3}_{4}-{5}_{6}_{7}'.format(
                    i, total, first_table_name,
                    first_targeted_field_name, first_word_embedding_type,
                    second_table_name, second_targeted_field_name,
                    second_word_embedding_type),
                end='')

            first_author_guid_word_embedding_vector_dict = self._db.get_author_guid_word_embedding_vector_dict(
                first_table_name, first_targeted_field_name,
                first_word_embedding_type)
            second_author_guid_word_embedding_vector_dict = self._db.get_author_guid_word_embedding_vector_dict(
                second_table_name, second_targeted_field_name,
                second_word_embedding_type)

            for function in self._similarity_functions:
                if function == "subtruct_and_split":
                    authors_features = Vector_Operations.create_subtruction_dimension_features_from_authors_dict(
                        first_author_guid_word_embedding_vector_dict,
                        second_author_guid_word_embedding_vector_dict,
                        first_table_name, first_targeted_field_name,
                        first_word_embedding_type, second_table_name,
                        second_targeted_field_name, second_word_embedding_type,
                        self._window_start, self._window_end)
                else:
                    authors_features = Vector_Operations.create_authors_feature_from_two_vectors(
                        function, first_author_guid_word_embedding_vector_dict,
                        second_author_guid_word_embedding_vector_dict,
                        first_table_name, first_targeted_field_name,
                        first_word_embedding_type, second_table_name,
                        second_targeted_field_name, second_word_embedding_type,
                        self._window_start, self._window_end)

                self.insert_author_features_to_db(
                    authors_features)  # create in batches
    def execute(self, window_start=None):
        """Create differential word-embedding features for each target pair.

        Loads the word-vector dictionary for ``self._table_name`` into
        ``self._word_vector_dict``. Then, for every ``(target1, target2)``
        pair in ``self._pairs_targets``, it iterates over the keys of the
        records returned for ``target1``. For each key it asks
        ``self.get_word_differences`` for two difference sets and, for each
        aggregation function, builds:

        * one feature per dimension of the subtraction vector, and
        * the distance features from ``self.create_distance_features``.

        All features of a pair are stored with a single
        ``self._db.add_author_features`` call.

        :param window_start: not used by this method; kept so the signature
            stays compatible with existing callers.
        :return: ``None``
        """
        self._word_vector_dict = self._db.get_word_vector_dictionary(self._table_name)
        feature_prefix = self.__class__.__name__ + '_'

        for target1, target2 in self._pairs_targets:
            features = []
            target1_tuples = self._get_records_by_target_dict(target1)
            target2_tuples = self._get_records_by_target_dict(target2)
            # ``author_id`` instead of ``id``: avoids shadowing the builtin.
            for author_id in target1_tuples:
                dif_set1, dif_set2 = self.get_word_differences(author_id, target1_tuples, target2_tuples)

                for aggregation_function in self._aggregation_functions:
                    try:
                        dif1_word_embedding, dif2_word_embedding, subtraction_vec = self._get_differential_vectors(
                            aggregation_function, dif_set1, dif_set2)

                        feature_name = self._get_feature_names(target1, target2, aggregation_function)

                        # extend() grows the list in place; the previous
                        # ``features = features + ...`` copied it every time.
                        features.extend(Vector_Operations.create_author_feature_for_each_dimention(
                            subtraction_vec, feature_name, author_id, self._window_start, self._window_end,
                            feature_prefix))
                        features.extend(self.create_distance_features(author_id, aggregation_function,
                                                                      dif1_word_embedding,
                                                                      dif2_word_embedding, target1, target2,
                                                                      feature_prefix))
                    except Exception as e1:
                        # Best-effort: a failing combination is logged and
                        # skipped so the remaining ones are still processed.
                        # NOTE(review): INFO level makes failures easy to
                        # miss; confirm whether this should be a warning.
                        logging.info(e1)
            self._db.add_author_features(features)
    def create_distance_features(self, author_id, aggregation_function, word_embedding_vector1, dif2_word_embedding,
                                 target1, target2, prefix=u''):
        """Build one author feature per configured distance function.

        For every name in ``self._distance_functions`` the two given vectors
        are combined through ``Vector_Operations.oparate_on_two_vectors`` and
        the result is wrapped with
        ``BaseFeatureGenerator.create_author_feature`` using this object's
        window bounds.

        The feature name has the form
        ``<prefix>differential_distance_function_<fn>_<t1 table>_<t1 field>_<agg>_TO_<t2 table>_<t2 field>_<agg>``,
        where the table and field parts come from the ``'table_name'`` and
        ``'targeted_field_name'`` keys of ``target1`` and ``target2``.

        :param author_id: identifier the features are attached to.
        :param aggregation_function: rendered with ``str()`` in the name.
        :param word_embedding_vector1: first operand of the distance function.
        :param dif2_word_embedding: second operand of the distance function.
        :param target1: dict describing the first target.
        :param target2: dict describing the second target.
        :param prefix: text prepended to every feature name.
        :return: list of created features, in ``self._distance_functions`` order.
        """

        def build_feature(distance_function):
            # The name is assembled per distance function, so nothing is read
            # from the targets when there are no distance functions at all.
            aggregation_label = str(aggregation_function)
            name_parts = [
                prefix, u'differential_', u"distance_function_",
                distance_function, '_',
                target1['table_name'], "_", target1['targeted_field_name'],
                "_", aggregation_label,
                "_TO_",
                target2['table_name'], "_", target2['targeted_field_name'],
                "_", aggregation_label,
            ]
            feature_name = "".join(name_parts)

            attribute_value = Vector_Operations.oparate_on_two_vectors(
                commons, distance_function, word_embedding_vector1, dif2_word_embedding)
            return BaseFeatureGenerator.create_author_feature(
                feature_name, author_id, attribute_value,
                self._window_start, self._window_end)

        return [build_feature(distance_function)
                for distance_function in self._distance_functions]
Esempio n. 6
0
    def execute(self, window_start=None):
        """Train a doc2vec model per target and store one feature per dimension.

        For every entry of ``self._targeted_fields_for_embedding`` this
        method:

        1. collects the source-id -> target-elements mapping and turns it
           into documents,
        2. trains a model with ``self._train_doc2vec_model``,
        3. looks up ``model[source_id]`` for every source id and converts the
           vector into per-dimension features through
           ``Vector_Operations.create_author_feature_for_each_dimention``,
        4. stores the features with ``self._db.add_author_features`` and
           commits the session, every ``self._max_objects_without_saving``
           source ids and once more at the end.

        Each entry must provide ``['source']['table_name']``,
        ``['destination']['table_name']`` and
        ``['destination']['target_field']``. Progress is printed to stdout
        on a single line.

        :param window_start: not used by this method; kept so the signature
            stays compatible with existing callers.
        :return: ``None``
        """
        for targeted_fields_dict in self._targeted_fields_for_embedding:
            source_id_target_elements_dict = self._get_source_id_target_elements(
                targeted_fields_dict)
            source_id_document_tuples = self._create_documents(
                source_id_target_elements_dict)

            print("\rStarting training doc2vec", end='')
            model = self._train_doc2vec_model(source_id_document_tuples)
            print("\rFinishing training doc2vec", end='')

            targeted_table = targeted_fields_dict['source']['table_name']
            targeted_field_name = targeted_fields_dict['destination']['table_name'] + "-" \
                                  + targeted_fields_dict['destination']['target_field']

            model_type = "{0}_dimensions_{1}_window_size".format(
                self._num_of_dimensions, self._window_size_doc2vec)
            # Identical for every source id, so it is built once per target.
            feature_name = model_type + '_' + targeted_table + "_" + targeted_field_name

            source_ids = source_id_target_elements_dict.keys()
            total = len(source_ids)
            authors_features = []
            # 1-based counter, so the progress display ends at total/total.
            for counter, source_id in enumerate(source_ids, start=1):
                msg = "\rExtracting doc2vec features: {0}/{1}:{2}".format(
                    counter, total, source_id)
                print(msg, end="")

                source_id_vector = model[source_id]
                authors_features.extend(
                    Vector_Operations.create_author_feature_for_each_dimention(
                        source_id_vector, feature_name, source_id,
                        self._window_start, self._window_end, self._prefix))

                if counter % self._max_objects_without_saving == 0:
                    self._db.add_author_features(authors_features)
                    self._db.session.commit()
                    # Start a fresh batch. Without this reset every feature
                    # already saved was passed to add_author_features again
                    # on each later flush.
                    authors_features = []

            # Save whatever is left of the last (possibly empty) batch.
            self._db.add_author_features(authors_features)
            self._db.session.commit()