def create_distance_features(self, author_id, aggregation_function, word_embedding_vector1, dif2_word_embedding,
                             target1, target2):
    """Create one author feature per entry of ``self._distance_functions``.

    Each feature value is whatever ``Vector_Operations.oparate_on_two_vectors``
    returns for the two supplied vectors under that distance function.

    :param author_id: identifier the created features are attached to.
    :param aggregation_function: aggregation label; its ``str()`` form is
        embedded in the feature name for both targets.
    :param word_embedding_vector1: first vector handed to the operation.
    :param dif2_word_embedding: second vector handed to the operation.
    :param target1: mapping providing ``'table_name'`` and
        ``'targeted_field_name'``.
    :param target2: mapping providing the same two keys.
    :return: list of created features, in the order of
        ``self._distance_functions``.
    """

    def _describe(target):
        # "<table>_<field>_<aggregation>" fragment for one side of the name.
        return target['table_name'] + "_" + target['targeted_field_name'] + "_" + str(aggregation_function)

    created = []
    for distance_function in self._distance_functions:
        name = (u'word_embeddings_differential_' + u"distance_function_" + distance_function + '_'
                + _describe(target1) + "_TO_" + _describe(target2))
        value = Vector_Operations.oparate_on_two_vectors(
            commons, distance_function, word_embedding_vector1, dif2_word_embedding)
        created.append(
            BaseFeatureGenerator.create_author_feature(
                name, author_id, value, self._window_start, self._window_end))
    return created
 def create_authors_feature_from_two_vectors(func, first_author_vector_dict, second_author_vector_dict,
                                             first_table_name,
                                             first_targeted_field_name, first_word_embedding_type, second_table_name,
                                             second_targeted_field_name, second_word_embedding_type, window_start,
                                             window_end, prefix=u''):
     """Create one feature per author by applying ``func`` to two vectors.

     Iterates over the authors of ``first_author_vector_dict``; the matching
     vector is looked up in ``second_author_vector_dict`` under the same key,
     so an author missing there raises ``KeyError``.

     :param func: name of the operation; passed to
         ``Vector_Operations.oparate_on_two_vectors`` and appended to the
         feature name.
     :param first_author_vector_dict: author id -> first vector.
     :param second_author_vector_dict: author id -> second vector.
     :param first_table_name: name part describing the first vector.
     :param first_targeted_field_name: name part describing the first vector.
     :param first_word_embedding_type: name part describing the first vector.
     :param second_table_name: name part describing the second vector.
     :param second_targeted_field_name: name part describing the second vector.
     :param second_word_embedding_type: name part describing the second vector.
     :param window_start: forwarded to ``create_author_feature``.
     :param window_end: forwarded to ``create_author_feature``.
     :param prefix: optional text placed in front of the feature name.
     :return: list of the created features.
     """
     collected = []
     for author_id in first_author_vector_dict:
         left_part = (prefix + u'subtraction_' + first_table_name + "_" + first_targeted_field_name
                      + "_" + first_word_embedding_type)
         right_part = second_table_name + "_" + second_targeted_field_name + "_" + second_word_embedding_type
         feature_name = left_part + "_TO_" + right_part + "_DISTANCE-FUNCTION_" + func

         vector_a = first_author_vector_dict[author_id]
         vector_b = second_author_vector_dict[author_id]
         value = Vector_Operations.oparate_on_two_vectors(commons.commons, func, vector_a, vector_b)

         collected.append(
             BaseFeatureGenerator.create_author_feature(
                 feature_name, author_id, value, window_start, window_end))
     return collected
Example #3
0
    def calc_avg_known_words(self, source_id, **kwargs):
        """Create one "average known words" feature per configured word list.

        For every name in ``self.word_lists_names`` the word list is loaded,
        ``self._count_avg_known_words`` is evaluated for the requested target
        and the result is wrapped into an author feature.  A failure for one
        word list is logged and does not stop the remaining lists.

        :param source_id: identifier the features are attached to.
        :param kwargs: must contain ``'target'``, which is forwarded to
            ``self._count_avg_known_words``.
        :return: list of the features that could be created.
        :raises KeyError: if ``'target'`` is missing from ``kwargs``.
        """
        destination_target_fields = kwargs['target']
        features = []
        logging.info("processing author %s", source_id)
        for word_list_name in self.word_lists_names:
            self._load_known_words_to_dict(word_list_name)
            try:
                result = self._count_avg_known_words(destination_target_fields)
                attribute_name = str(self.__class__.__name__ +
                                     '_count_avg_known_word_from_' +
                                     word_list_name)
                author_feature = BaseFeatureGenerator.create_author_feature(
                    attribute_name, source_id, result, self._window_start,
                    self._window_end)
                features.append(author_feature)
            except Exception as e1:
                # Exceptions have no ``.message`` attribute on Python 3, so
                # reading it here would raise inside the handler.  Let logging
                # format the exception itself instead.
                logging.error("%s%s", e1, word_list_name)
        return features
    def execute(self, window_start=None):
        """Compute cooperation-topic features for every claim and store them.

        For each claim returned by ``self._db.get_claims()`` the claim's posts
        are converted to two per-author counters (``calculate_topics_similarity``
        and ``calculate_topics_exact_match``).  Every method named in
        ``self._features_list`` is applied to both counters and stored as a
        claim-level feature; every name in ``self._author_count_features_list``
        produces per-author features holding the raw counter values.

        All collected features are handed to ``self._db.add_author_features``
        even if processing stopped early because of an error.

        :param window_start: unused; the window is taken from
            ``self._window_start`` / ``self._window_end``.
        """
        start_time = time.time()
        info_msg = "execute started for Cooperation topic feature generator started at " + str(
            start_time)
        logging.info(info_msg)
        claims = self._db.get_claims()
        logging.info("Cooperation execute window_start %s" %
                     self._window_start)

        claim_features = []
        try:
            # Claims => for each claim, get its posts => build an id-to-words
            # dictionary (stemming and stop-word handling happen in the
            # helpers) => the calculate_topics_* methods build the counters.
            posts_dict = self._db.get_claim_id_posts_dict()

            for cnt, claim in enumerate(claims):
                claim_id = claim.claim_id
                logging.info('Started ' + str(cnt + 1) + ' claim from ' +
                             str(len(claims)) + ' claims')

                # A claim without an entry is treated like a claim with an
                # empty post list; indexing would raise KeyError and abort
                # all remaining claims through the outer handler.
                posts_list = posts_dict.get(claim_id, [])
                if not posts_list:
                    logging.info('The resulted list is empty for claim: ' +
                                 str(claim_id))
                    continue

                post_id_to_words = self._create_post_id_to_content_words(
                    posts_list)
                post_id_to_strings_no_urls = self._create_post_id_to_strings_no_urls(
                    posts_list)
                authors_counter_dic1 = self.calculate_topics_similarity(
                    post_id_to_words)
                authors_counter_dic2 = self.calculate_topics_exact_match(
                    post_id_to_strings_no_urls)

                for ftr, feature_name in enumerate(self._features_list):
                    logging.info('Started ' + str(ftr + 1) + ' feature from ' +
                                 str(len(self._features_list)) + ' features')

                    try:
                        attribute_value1 = float(
                            getattr(self, feature_name)(authors_counter_dic1))
                        attribute_value2 = float(
                            getattr(self, feature_name)(authors_counter_dic2))
                    except Exception:
                        # -1.0 marks a feature that could not be computed.
                        # Catching Exception (not a bare except) lets
                        # KeyboardInterrupt / SystemExit propagate.
                        attribute_value1 = -1.0
                        attribute_value2 = -1.0
                        print('Fail in extraction: ' + feature_name)

                    # Both values are floats at this point, so no None check
                    # is needed before storing them.
                    attribute_name1 = "{0}_{1}".format(self._prefix,
                                                       feature_name)
                    attribute_name2 = "{0}_{1}".format(
                        self._prefix, "exact_match_" + feature_name)

                    # wrap the values into feature objects
                    claim_features.append(
                        BaseFeatureGenerator.create_author_feature(
                            attribute_name1, claim_id, attribute_value1,
                            self._window_start, self._window_end))
                    claim_features.append(
                        BaseFeatureGenerator.create_author_feature(
                            attribute_name2, claim_id, attribute_value2,
                            self._window_start, self._window_end))
                    print('Appended: ' + attribute_name1)
                    print('Appended: ' + attribute_name2)

                for ftr, feature_name in enumerate(
                        self._author_count_features_list):
                    logging.info('Started ' + str(ftr + 1) + ' feature from ' +
                                 str(len(self._author_count_features_list)) +
                                 ' features')
                    attribute_name1 = "{0}_{1}".format(self._prefix,
                                                       feature_name)
                    attribute_name2 = "{0}_{1}".format(
                        self._prefix, "exact_match_" + feature_name)
                    for author_id in authors_counter_dic1:
                        attribute_value1 = authors_counter_dic1[author_id]
                        # Plain indexing is kept on purpose: if this is a
                        # Counter/defaultdict a missing author yields its
                        # default value, which ``.get`` would not.
                        attribute_value2 = authors_counter_dic2[author_id]
                        if attribute_value1 is None or attribute_value2 is None:
                            continue
                        claim_features.append(
                            BaseFeatureGenerator.create_author_feature(
                                attribute_name1, author_id, attribute_value1,
                                self._window_start, self._window_end))
                        claim_features.append(
                            BaseFeatureGenerator.create_author_feature(
                                attribute_name2, author_id, attribute_value2,
                                self._window_start, self._window_end))

        except Exception as ex:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            print(message)
            logging.error('Failed in extraction process!')
        stop_time = time.time()
        info_msg = "execute ended at " + str(stop_time)

        logging.info(info_msg)
        # Whatever was collected before a failure is still persisted.
        self._db.add_author_features(claim_features)
Example #5
0
 def __init__(self, db, **kwargs):
     """Run the base initialiser, then read this generator's settings.

     ``connection_types`` and ``similarity_functions`` are evaluated from the
     config section named after the concrete class.
     """
     BaseFeatureGenerator.__init__(self, db, **kwargs)
     section = self.__class__.__name__
     self._connection_types = self._config_parser.eval(section, "connection_types")
     self._similarity_functions = self._config_parser.eval(section, "similarity_functions")
Example #6
0
 def __init__(self, db, **kwargs):
     """Run the base initialiser and keep the result of
     ``self._create_post_id_text_image()`` on the instance."""
     BaseFeatureGenerator.__init__(self, db, **kwargs)
     post_id_text_image = self._create_post_id_text_image()
     self._post_id_text_image_dict = post_id_text_image
Example #7
0
 def __init__(self, db, **kwargs):
     """Run the base initialiser and read ``feature_list`` from the config.

     The caller's ``kwargs`` are accepted but not forwarded: the base
     initialiser always receives an empty ``authors`` list and an empty
     ``posts`` dict.
     """
     BaseFeatureGenerator.__init__(self, db, authors=[], posts={})
     section = self.__class__.__name__
     self._features = self._config_parser.eval(section, "feature_list")
Example #8
0
    def execute(self, window_start=None):
        """Create features from the topic statistics files under ``self._input_path``.

        The input directory is walked two levels deep
        (``<input>/<dataset>/<aggregation type>/``).  In each leaf directory
        the post-id -> topic-id JSON file and the topic statistics CSV are
        loaded, then:

        * every method named in ``self._features_extracted_by_tuple`` is
          called with each CSV row (as a tuple) and must return
          ``(post_id, attribute_name, attribute_value)``;
        * every method named in ``self._aggregated_features`` is called with
          each topic id of the inverted mapping.

        The features of one leaf directory are stored with
        ``self._db.add_author_features`` before the next one is processed.

        :param window_start: unused; the window is taken from
            ``self._window_start`` / ``self._window_end``.
        """
        self._claim_id_claim_type_dict = self._create_claim_id_claim_type_dictionary()

        for dataset_name in os.listdir(self._input_path):
            self._dataset_name = dataset_name
            # os.path.join works whether or not ``_input_path`` ends with a
            # separator; plain concatenation only worked when it did.
            dataset_path = os.path.join(self._input_path, dataset_name)

            # with or without retweets
            for aggregation_retweets_type in os.listdir(dataset_path):
                self._aggregation_retweets_type = aggregation_retweets_type
                target_path = os.path.join(dataset_path,
                                           aggregation_retweets_type)

                # read post_id_topic_id_dict_file
                dict_path = os.path.join(target_path,
                                         self._post_id_topic_id_dict_file)
                with open(dict_path) as dict_file:
                    self._post_id_topic_id_dict = json.load(dict_file)

                # Inverted mapping: topic id -> post id.  ``items()`` exists
                # on both Python 2 and 3, unlike ``iteritems()``.
                self._topic_id_post_id_dict = {
                    value: key
                    for key, value in self._post_id_topic_id_dict.items()
                }

                self._topic_statistics_df = pd.read_csv(
                    os.path.join(target_path, self._topic_statistics_file))

                author_features = []
                tuple_feature_count = len(self._features_extracted_by_tuple)
                for index, row in self._topic_statistics_df.iterrows():
                    df_tuple = tuple(row)
                    for i, feature_name in enumerate(
                            self._features_extracted_by_tuple):
                        msg = "\rCalculating features: [{0}/{1}: {2} {3} {4}]]".format(
                            i, tuple_feature_count, feature_name,
                            self._dataset_name,
                            self._aggregation_retweets_type)
                        print(msg, end="")

                        post_id, attribute_name, attribute_value = getattr(
                            self, feature_name)(df_tuple)
                        author_features.append(
                            BaseFeatureGenerator.create_author_feature(
                                attribute_name, post_id, attribute_value,
                                self._window_start, self._window_end))

                aggregated_count = len(self._aggregated_features)
                for j, aggregated_feature in enumerate(
                        self._aggregated_features):
                    msg = "\rCalculating aggregated features: [{0}/{1} {2} {3} {4}]]".format(
                        j, aggregated_count, aggregated_feature,
                        self._dataset_name, self._aggregation_retweets_type)
                    print(msg, end="")

                    # The name does not depend on the topic, so build it once.
                    attribute_name = "{0}_{1}_{2}_{3}_{4}".format(
                        self._prefix, aggregated_feature, self._dataset_name,
                        self._aggregation_retweets_type,
                        self._classifier_name)
                    for topic_id, post_id in self._topic_id_post_id_dict.items():
                        attribute_value = getattr(self,
                                                  aggregated_feature)(topic_id)
                        author_features.append(
                            BaseFeatureGenerator.create_author_feature(
                                attribute_name, post_id, attribute_value,
                                self._window_start, self._window_end))

                self._db.add_author_features(author_features)
Example #9
0
 def execute(self, window_start=None):
     """Extract temporal features per claim and store them via ``self._db``.

     For every claim and every source in ``self._source_list`` (``'authors'``
     or ``'posts'``) the age in whole minutes of each item's ``created_at``
     is computed, shifted so that the smallest age becomes 0, and sorted.
     The sorted ages are then cut into consecutive, non-overlapping windows:
     for each threshold in ``self._delta_time`` the window holds the not yet
     consumed ages that are ``<=`` the threshold.  Every method named in
     ``self._features_list`` is applied to each window.

     All collected features are handed to ``self._db.add_author_features``
     even if processing stopped early because of an error.

     :param window_start: unused; the window is taken from
         ``self._window_start`` / ``self._window_end``.
     """
     function_name = 'extract_temporal_features'
     start_time = time.time()
     info_msg = "execute started for " + function_name + " started at " + str(
         start_time)
     logging.info(info_msg)
     claims = self._db.get_claims()
     claim_features = []
     try:
         today_datetime = datetime.datetime.now()
         posts_dict = self._db.get_claim_id_posts_dict()
         for cnt, claim in enumerate(claims):
             claim_id = claim.claim_id
             logging.info('Started ' + str(cnt + 1) + ' claim from ' +
                          str(len(claims)) + ' claims')
             for source in self._source_list:
                 # define authors,posts per claim
                 if source == 'authors':
                     s_list = self._db.get_claim_authors(claim_id)
                 elif source == 'posts':
                     s_list = posts_dict.get(claim_id, [])
                 else:
                     # Without this branch ``s_list`` would be unbound or
                     # silently reuse the previous source's items.
                     logging.info('Unsupported source: %s', source)
                     continue
                 if not s_list:
                     logging.info('The resulted list is empty for claim:' +
                                  str(claim_id))
                     continue

                 ages = []
                 for s in s_list:
                     try:
                         if source == 'authors':
                             # NOTE(review): index 43 is presumably the
                             # created_at column of the author row - confirm.
                             created_at = s[43]
                         else:
                             created_at = s.created_at
                         if created_at is None:
                             logging.info(
                                 'Can not be parsed created_at, probably None value'
                             )
                             continue
                         creation_date = parser.parse(created_at)
                         age_seconds = (today_datetime -
                                        creation_date).total_seconds()
                         ages.append(int(divmod(age_seconds, 60)[0]))
                     except Exception:
                         logging.info(
                             'Can not be parsed created_at, probably None value'
                         )

                 if not ages:
                     # min() of an empty list would raise and abort every
                     # remaining claim through the outer handler.
                     logging.info('No usable created_at value for claim:' +
                                  str(claim_id))
                     continue

                 # normalization: smallest age becomes 0, ascending order
                 youngest = min(ages)
                 lls = sorted(age - youngest for age in ages)

                 st_ind = 0
                 for delta in self._delta_time:
                     # NOTE(review): eval() runs text from the configuration;
                     # safe only while that configuration is trusted.
                     limit = eval(delta)

                     # Absolute index one past the last age <= limit.  The
                     # index must be absolute (not relative to the slice) and
                     # exclusive, otherwise the last matching age is dropped
                     # and later windows start at the wrong position.
                     stop_ind = st_ind
                     for idx, val in enumerate(lls[st_ind:], st_ind):
                         if val > limit:
                             break  # sorted: nothing further can match
                         stop_ind = idx + 1
                     llsn = lls[st_ind:stop_ind]
                     st_ind = stop_ind

                     for ftr, feature_name in enumerate(
                             self._features_list):
                         logging.info('Started ' + str(ftr + 1) +
                                      ' feature from ' +
                                      str(len(self._features_list)) +
                                      ' features')
                         try:
                             attribute_value = getattr(self,
                                                       feature_name)(llsn)
                         except Exception:
                             # 0 marks a feature that could not be computed.
                             attribute_value = 0
                             print('Fail in extraction: ' + feature_name)
                         if attribute_value is not None:
                             attribute_name = "{0}_{1}_{2}_{3}".format(
                                 self._prefix, source, str(limit),
                                 feature_name)
                             # wrap the value into a feature object
                             claim_feature = BaseFeatureGenerator.create_author_feature(
                                 attribute_name, claim_id, attribute_value,
                                 self._window_start, self._window_end)
                             claim_features.append(claim_feature)
                             print('Appended: ' + attribute_name)
     except Exception:
         # logging.exception keeps the traceback that a plain error message
         # would lose.
         logging.exception('Failed in extraction process!')
     stop_time = time.time()
     info_msg = "execute ended at " + str(stop_time)
     logging.info(info_msg)
     # Whatever was collected before a failure is still persisted.
     self._db.add_author_features(claim_features)
Example #10
0
    def execute(self, window_start=None):
        """Compute graph features per group of author connections and store them.

        Author connections are read either from ``self._csv_file`` or, when
        that attribute is the single-space placeholder ``' '``, from the
        database.  The connections are grouped by ``self._group_by``; for each
        group a networkx graph is built from the ``self._source[0]`` /
        ``self._target[0]`` columns and ``extract_features_from_graph`` is
        called once per name in ``self._features_list``.

        A result dict with a single entry is stored as one feature; otherwise
        the ``min_``, ``max_``, ``median_`` and ``std_`` entries are stored.

        All collected features are handed to ``self._db.add_author_features``
        even if processing stopped early because of an error.

        :param window_start: unused; the window is taken from
            ``self._window_start`` / ``self._window_end``.
        """
        function_name = "extract_features_from_graph"
        start_time = time.time()
        info_msg = "execute started for " + function_name + " started at " + str(
            start_time)
        logging.info(info_msg)
        claim_features = []
        try:
            if self._csv_file != ' ':
                logging.info(
                    'Getting existing author connections from csv file')
                df = pd.read_csv(self._csv_file,
                                 names=[
                                     'source_author_guid',
                                     'destination_author_guid',
                                     'connection_type', 'weight', 'claim_id',
                                     'insertion_date'
                                 ],
                                 low_memory=False)
            else:
                logging.info(
                    'Getting existing author connections with claim_id...')
                author_connections_with_claim_id = self._db.get_author_connections_with_claim_id()
                logging.info('Checking author connections for claim id...')
                author_connections_with_claim_id.extend(
                    self._db.make_connections_with_claim_id()[0])
                # Only log a sample when there is one; [0] on an empty list
                # would raise and abort the whole run.
                if author_connections_with_claim_id:
                    logging.info(author_connections_with_claim_id[0])
                df = pd.DataFrame([{
                    'source_author_guid': author_con.source_author_guid,
                    'destination_author_guid':
                    author_con.destination_author_guid,
                    'connection_type': author_con.connection_type,
                    'weight': author_con.weight,
                    'claim_id': author_con.claim_id,
                    'insertion_date': author_con.insertion_date
                } for author_con in author_connections_with_claim_id])

            grps = df.groupby(self._group_by)
            group_count = len(grps)

            # Compare the numeric major version: the first character of the
            # version string is a str (never comparable to the int 1) and
            # would also misread majors of 10 or more.
            nx_major = int(nx.__version__.split('.')[0])

            # NOTE(review): presumably 'communicability_centrality' is not
            # available in networkx 2+, hence it is dropped there - confirm.
            if nx_major > 1 and 'communicability_centrality' in self._features_list:
                self._features_list.remove('communicability_centrality')

            for cnt, (claim_ext_id, group_df) in enumerate(grps):
                logging.info('Started ' + str(cnt) + ' group from ' +
                             str(group_count) + ' groups')
                if nx_major == 1:
                    G = nx.from_pandas_dataframe(group_df, self._source[0],
                                                 self._target[0])
                else:
                    G = nx.from_pandas_edgelist(group_df, self._source[0],
                                                self._target[0])
                # The group key is used directly as the claim id.
                claim_id = claim_ext_id

                for ftr, feature_name in enumerate(self._features_list):
                    logging.info('Started ' + str(ftr + 1) + ' feature from ' +
                                 str(len(self._features_list)) + ' features')
                    attributes_dict = getattr(self,
                                              function_name)(G=G,
                                                             ff=feature_name)
                    if len(attributes_dict) == 1 and attributes_dict[
                            feature_name] is not None:
                        attribute_name = "{0}_{1}".format(
                            self._prefix, feature_name)
                        # wrap the value into a feature object
                        claim_features.append(
                            BaseFeatureGenerator.create_author_feature(
                                attribute_name, claim_id,
                                attributes_dict[feature_name],
                                self._window_start, self._window_end))
                        continue

                    for sub_feature_name in ('min_', 'max_', 'median_',
                                             'std_'):
                        attribute_value = attributes_dict[sub_feature_name +
                                                          feature_name]
                        if attribute_value is not None:
                            attribute_name = "{0}_{1}".format(
                                self._prefix, sub_feature_name + feature_name)
                            # wrap the value into a feature object
                            claim_features.append(
                                BaseFeatureGenerator.create_author_feature(
                                    attribute_name, claim_id, attribute_value,
                                    self._window_start, self._window_end))
        except Exception:
            # Logged at error level with the traceback; catching Exception
            # (not a bare except) lets KeyboardInterrupt / SystemExit through.
            logging.exception('Fail')
        stop_time = time.time()
        info_msg = "execute ended at " + str(stop_time)
        logging.info(info_msg)
        # used author_feature table; whatever was collected is persisted
        self._db.add_author_features(claim_features)
 def __init__(self, db, **kwargs):
     """Run the base initialiser, then read this generator's settings.

     ``targeted_author_word_embeddings`` and ``max_objects_without_saving``
     are evaluated from the config section named after the concrete class.
     """
     BaseFeatureGenerator.__init__(self, db, **kwargs)
     section = self.__class__.__name__
     self._targeted_author_word_embeddings = self._config_parser.eval(
         section, "targeted_author_word_embeddings")
     self._max_objects_without_saving = self._config_parser.eval(
         section, "max_objects_without_saving")