def prepare_timeslots(self, start):
    """Build consecutive, linearly-growing timeslots covering the analysis
    window that begins at *start*.

    Each slot's length is index * (seconds_in_slot_unit * boost_jump), so
    slots grow as the window progresses; slot ends are one second before
    the next slot's start so slots never overlap.

    :param start: window start date string (possibly unicode with accents)
    :return: list of Timeslot objects covering the window plus one day
    """
    config_inst = getConfig()
    # NOTE(review): the original assigned these two values from each other's
    # config keys (names swapped). The product below is commutative, so the
    # computed delta is unchanged; only the names now match the keys read.
    seconds_in_slot_unit = int(config_inst.get("BoostAuthorsModel", "seconds_in_slot_unit"))
    boost_jump = int(config_inst.get("BoostAuthorsModel", "boost_jump"))
    window_size = datetime.timedelta(seconds=int(config_inst.get("DEFAULT", "window_analyze_size_in_sec")))
    # Normalize accented characters away so str_to_date receives plain ASCII.
    start = unicodedata.normalize('NFKD', start).encode('ascii', 'ignore')
    last_date = str_to_date(start) + window_size
    timeslots = list()
    current_slot_start = str_to_date(start)
    index = 1
    # Produce slots until the slot start passes one day beyond the window end.
    while current_slot_start.date() < last_date.date() + datetime.timedelta(days=1):
        delta = datetime.timedelta(seconds=(seconds_in_slot_unit * boost_jump))
        delta = delta * index
        # End one second early so consecutive slots do not overlap.
        timeslot = Timeslot(current_slot_start, current_slot_start + delta - datetime.timedelta(seconds=1))
        timeslots.append(timeslot)
        current_slot_start = current_slot_start + delta
        index = index + 1
    return timeslots
def connect(dbapi_connection, connection_rec):
    """SQLAlchemy pool 'connect' listener: load the configured SQLite
    extension (.dll on Windows, .so on Linux) into each new connection.

    Extension loading is enabled only for the duration of the load and
    disabled again afterwards for safety.

    :param dbapi_connection: raw sqlite3 DB-API connection
    :param connection_rec: SQLAlchemy connection record (unused)
    """
    # The original mixed getConfig() with a module-level `configInst` name
    # that is not bound in this function; use one local handle consistently.
    config_inst = getConfig()
    dbapi_connection.enable_load_extension(True)
    if (config_inst.eval("OperatingSystem", "windows")):
        dbapi_connection.execute(
            'SELECT load_extension("%s%s")' % (config_inst.get("DB", "DB_path_to_extension"), '.dll'))
    if (config_inst.eval("OperatingSystem", "linux")):
        dbapi_connection.execute(
            'SELECT load_extension("%s%s")' % (config_inst.get("DB", "DB_path_to_extension"), '.so'))
    dbapi_connection.enable_load_extension(False)
def test_XML_importer_not_overwriting_bad_actor_collector(self):
    """Running the XML importer must not erase the insertion date already
    stamped on the author by the bad-actor collector."""
    self._bad_actor_collector.crawl_bad_actors_followers()
    self._bad_actor_collector.crawl_bad_actors_retweeters()
    start_date = getConfig().eval("DEFAULT", "start_date")
    self.xml_importer.setUp()
    self.xml_importer.execute(start_date)
    self.create_author_table.setUp()
    self.create_author_table.execute(getConfig().eval("DEFAULT", "start_date"))
    author = self.db.get_author_by_author_guid(u'5371821e-67b5-3582-bffb-b293b2554dda')
    # Both pipeline stages must have left their insertion timestamps intact.
    both_stamped = (author.xml_importer_insertion_date is not None
                    and author.bad_actors_collector_insertion_date is not None)
    self.assertTrue(both_stamped)
    self.db.session.close()
def test_XML_importer_not_overwriting_bad_actor_collector(self):
    """Importer and collector insertion dates must coexist on the author
    resolved by its computed GUID."""
    self._bad_actor_collector.crawl_bad_actors_followers()
    self._bad_actor_collector.crawl_bad_actors_retweeters()
    self.xml_importer.setUp()
    self.xml_importer.execute(getConfig().eval("DEFAULT", "start_date"))
    self.create_author_table.setUp()
    self.create_author_table.execute(getConfig().eval("DEFAULT", "start_date"))
    guid = compute_author_guid_by_author_name(u'adoula01')
    author = self.db.get_author_by_author_guid(guid)
    # Neither stage may have clobbered the other's timestamp.
    self.assertTrue(author.xml_importer_insertion_date is not None
                    and author.bad_actors_collector_insertion_date is not None)
    self.db.session.close()
def setUp(self):
    """Assemble the feature-generation pipeline from the configured modules.

    Maps config section names to generator classes, then instantiates every
    section that names a known module, passing authors/posts/graphs params.
    """
    # Section-name -> generator-class registry.
    # LinkPredictionFeatureExtractor must be the latest. Due to the deletion
    # of features of the anchor authors.
    module_dict = {
        "SyntaxFeatureGenerator": SyntaxFeatureGenerator,
        "BehaviorFeatureGenerator": BehaviorFeatureGenerator,
        "GraphFeatureGenerator_1": GraphFeatureGenerator,
        "AccountPropertiesFeatureGenerator": AccountPropertiesFeatureGenerator,
        "GraphFeatureGenerator_2": GraphFeatureGenerator,
        "DistancesFromTargetedClassFeatureGenerator": DistancesFromTargetedClassFeatureGenerator,
        "Word_Embeddings_Comparison_Feature_Generator": Word_Embeddings_Comparison_Feature_Generator,
        "Word_Embeddings_Feature_Generator": Word_Embeddings_Feature_Generator,
        "Word_Embedding_Differential_Feature_Generator": Word_Embedding_Differential_Feature_Generator,
        "ClaimFeatureGenerator": ClaimFeatureGenerator,
        "LinkPredictionFeatureExtractor": LinkPredictionFeatureExtractor,
    }
    # SETUP
    logging.config.fileConfig(getConfig().get("DEFAULT", "Logger_conf_file"))
    logging.info("Start Execution ... ")
    logging.info("SETUP global variables")
    window_start = getConfig().eval("DEFAULT", "start_date")
    logging.info("CREATE pipeline")
    authors = self._db.get_authors_by_domain(self._domain)
    posts = self._db.get_posts_by_domain(self._domain)
    graphs = {}
    parameters = {"authors": authors, "posts": posts, "graphs": graphs}
    for section in self._config_parser.sections():
        generator_class = module_dict.get(section)
        if not generator_class:
            continue
        # Graph-based generators need extra graph parameters injected.
        if (section.startswith("GraphFeatureGenerator")
                or section.startswith("DistancesFromTargetedClassFeatureGenerator")):
            self._add_graph_features_to_params(section, parameters)
        self._pipeline.append(generator_class(self._db, **parameters))
def test_bad_actor_collector_not_overwriting_XML_importer(self):
    """The bad-actor collector must not erase the insertion date already
    stamped by the XML importer."""
    self.xml_importer.setUp()
    self.xml_importer.execute(getConfig().eval("DEFAULT", "start_date"))
    self.create_author_table.setUp()
    self.create_author_table.execute(getConfig().eval("DEFAULT", "start_date"))
    self._bad_actor_collector.execute()
    rows = self.db.get_author_by_author_guid_and_domain(
        u'5371821e67b53582bffbb293b2554dda', self._domain)
    author = rows[0]
    # Both stages must have left their timestamps in place.
    self.assertTrue(author.xml_importer_insertion_date is not None
                    and author.bad_actors_collector_insertion_date is not None)
    self.db.session.close()
def __init__(self, db, **kwargs):
    """Initialize the key-author score generator.

    Reads the analysis window from config, requires 'authors' and 'posts'
    in kwargs, and disables the module if the export_key_authors view is
    missing from the database.

    :param db: project DB wrapper
    :raises Exception: when 'authors'/'posts' are not supplied
    """
    self._db = db
    self.config_parser = getConfig()
    # start_date is stored as "date('...')" in the config; strip the wrapper.
    raw_start = self.config_parser.get("DEFAULT", "start_date").strip("date('')")
    self._window_start = datetime.datetime.strptime(raw_start, '%Y-%m-%d %H:%M:%S')
    window_seconds = int(self.config_parser.get("DEFAULT", "window_analyze_size_in_sec"))
    self._window_size = datetime.timedelta(seconds=window_seconds)
    self._window_end = self._window_start + self._window_size
    if 'authors' in kwargs and 'posts' in kwargs:
        self.authors = kwargs['authors']
        self.author_guid_posts_dict = kwargs['posts']
    else:
        raise Exception('Author object was not passed as parameter')
    # the required view,export_key_Authors, doesn't exist.
    if not self._db.is_export_key_authors_view_exist():
        logging.error(
            "Cannot initiate KeyAuthorScoreFeatureGenerator as the export_key_authors view does not appear in the db"
        )
        self.module_enabled = False
    else:
        self.module_enabled = True
        self.sum_tfidf_dict = db.get_sum_tfidf_scores()
        self.max_tfidf_dict = db.get_max_tfidf_scores()
def get_pointer_score(self, pointer, total_ref_count, timeslots_accumulated_counts, timeslots):
    """Score a pointer by summing, over all timeslots at or after its own,
    the extra accumulated references divided by the squared (slot-unit)
    time distance from the pointer's slot.

    :param pointer: object with a `.date` attribute locating it in time
    :param total_ref_count: unused here (kept for interface compatibility)
    :param timeslots_accumulated_counts: Timeslot -> accumulated ref count
    :param timeslots: ordered timeslot list used for slot matching
    :return: float score
    """
    seconds_in_slot_unit = int(getConfig().get("BoostAuthorsModel", "seconds_in_slot_unit"))
    ordered = sorted(timeslots_accumulated_counts.items())
    pointer_slot = self.timeslotUtils.find_matching_timeslot(timeslots, str_to_date(pointer.date))
    pointer_accum = timeslots_accumulated_counts[pointer_slot]
    score = 0
    # Walk from the latest slot backwards until we reach the pointer's slot.
    for slot, accum in reversed(ordered):
        addition = accum - pointer_accum
        gap = abs(slot.end_time - pointer_slot.end_time).total_seconds() / seconds_in_slot_unit
        gap = math.pow(gap, 2.0)
        # The pointer's own slot gives addition == 0 and gap == 0: contribute
        # nothing instead of dividing zero by zero.
        if not (addition == 0 and gap == 0):
            score += float(addition) / gap
        if pointer_slot.start_time == slot.start_time and pointer_slot.end_time == slot.end_time:
            break
    return score
def testDoubleExecute(self):
    """Executing the DB pipeline twice must be idempotent: every table
    row count is identical after the first and second run."""
    import sys
    sys.argv = [sys.argv[0], 'config.ini']
    db = DB()
    db.setUp()
    db.execute(getConfig().get("DEFAULT", "start_date"))
    table_queries = ["select * from posts", "select * from authors", "select * from topics",
                     "select * from author_citations", "select * from authors_boost_stats",
                     "select * from post_citations", "select * from posts_representativeness",
                     "select * from posts_to_pointers_scores", "select * from posts_to_topic",
                     "select * from visualization_windows"]

    def _row_counts():
        # Snapshot the scalar count of every table of interest.
        return [db.session.execute(q).scalar() for q in table_queries]

    first_run_counts = _row_counts()
    db.setUp()
    db.execute(getConfig().get("DEFAULT", "start_date"))
    second_run_counts = _row_counts()
    self.assertListEqual(first_run_counts, second_run_counts,
                         "the two executions had different results")
def setUp(self):
    """Import the Tumblr fixture, run the account-properties feature
    generator, and cache the resulting features for one known author."""
    TestBase.setUp(self)
    self.config = getConfig()
    self._start_date = self.config.eval("DEFAULT", "start_date")
    #self._end_date = self.config.get("DEFAULT", "end_date")
    self._tsv_files_path = self.config.get("TumblrImporter",
                                           "tsv_test_files_account_properties_feature_generator")
    self._db = DB()
    self._db.setUp()
    self._tumblr_parser = TumblrImporter(self._db)
    self._tumblr_parser.setUp(self._tsv_files_path)
    self._tumblr_parser.execute()
    self._author_guid = "f0f4bb42-3fed-322a-b71a-681179d47ea1"
    # Feed the generator the full microblog domain content.
    parameters = {
        "authors": self._db.get_authors_by_domain(Domains.MICROBLOG),
        "posts": self._db.get_posts_by_domain(Domains.MICROBLOG),
    }
    generator = AccountPropertiesFeatureGenerator(self._db, **parameters)
    generator.execute()
    self._author_features = self._db.get_author_features_by_author_guid(author_guid=self._author_guid)
    self._author_features_dict = self._create_author_features_dictionary(self._author_features)
def setUp(self):
    """Run the full import -> autotopic -> key-author pipeline on the
    Tumblr fixture, then generate and cache key-author score features."""
    TestBase.setUp(self)
    self.config = getConfig()
    self._tsv_files_path = self.config.get("TumblrImporter",
                                           "tsv_test_files_key_author_score_feature_generator")
    self._db = DB()
    self._db.setUp()
    self._tumblr_parser = TumblrImporter(self._db)
    self._author_guid = "150ff707-a6eb-3051-8f3c-f623293c714b"
    self._tumblr_parser.setUp(self._tsv_files_path)
    self._tumblr_parser.execute()
    # Upstream stages the score generator depends on.
    for stage in (AutotopicExecutor(self._db), KeyAuthorsModel(self._db)):
        stage.setUp()
        stage.execute()
    parameters = {
        "authors": self._db.get_authors_by_domain(Domains.MICROBLOG),
        "posts": self._db.get_posts_by_domain(Domains.MICROBLOG),
    }
    self._key_author_score_feature_generator = KeyAuthorScoreFeatureGenerator(self._db, **parameters)
    self._key_author_score_feature_generator.execute()
    self._author_features = self._db.get_author_features_by_author_guid(author_guid=self._author_guid)
    self._author_features_dict = self._create_author_features_dictionary(self._author_features)
def setUp(self):
    """Import the Tumblr fixture and run the graph feature generator with
    every graph parameter pulled from the GraphFeatureGenerator_1 section."""
    TestBase.setUp(self)
    self.config = getConfig()
    self._start_date = self.config.eval("DEFAULT", "start_date")
    #self._end_date = self.config.get("DEFAULT", "end_date")
    self._tsv_files_path = self.config.get("TumblrImporter",
                                           "tsv_test_files_graph_feature_generator")
    self._db = DB()
    self._db.setUp()
    self._tumblr_parser = TumblrImporter(self._db)
    self._tumblr_parser.setUp(self._tsv_files_path)
    self._tumblr_parser.execute()
    self._author_guid = u"f0f4bb42-3fed-322a-b71a-681179d47ea1"
    parameters = {
        "authors": self._db.get_authors_by_domain(Domains.MICROBLOG),
        "posts": self._db.get_posts_by_domain(Domains.MICROBLOG),
    }
    # Every config option name doubles as the generator's parameter key.
    section = "GraphFeatureGenerator_1"
    for option in ("graph_types", "algorithms", "aggregation_functions",
                   "neighborhood_sizes", "distances_from_labeled_authors",
                   "graph_directed", "graph_weights"):
        parameters[option] = self.config.eval(section, option)
    graph_feature_generator = GraphFeatureGenerator(self._db, **parameters)
    graph_feature_generator.execute()
    self._author_features = self._db.get_author_features_by_author_guid(author_guid=self._author_guid)
    self._author_features_dict = self._create_author_features_dictionary(self._author_features)
def __init__(self, db, **kwargs):
    """Base constructor for feature generators.

    Loads limits, the analysis window, and the generator's domain from
    config; requires 'authors' and 'posts' in kwargs; stores optional
    per-generator parameters when supplied.

    :param db: project DB wrapper
    :raises Exception: when 'authors'/'posts' are missing from kwargs
    """
    super(BaseFeatureGenerator, self).__init__(db)
    self._db = db
    # Bind the config parser BEFORE its first use. The original body called
    # self._config_parser.eval(...) first and only assigned getConfig() later,
    # relying on the base class having set the attribute.
    self._config_parser = getConfig()
    self._load_limit = self._config_parser.eval("FeatureExtractor", "load_limit")
    self._max_objects_save = self._config_parser.eval("FeatureExtractor", "max_objects_save")
    # self._features_names_count = 0
    self._targeted_social_network = self._config_parser.get("DEFAULT", "social_network_name")
    # start_date is stored as "date('YYYY-mm-dd HH:MM:SS')"; strip the wrapper.
    start_date = self._config_parser.get("DEFAULT", "start_date").strip("date('')")
    self._window_start = datetime.datetime.strptime(start_date, '%Y-%m-%d %H:%M:%S')
    self._window_size = datetime.timedelta(
        seconds=int(self._config_parser.get("DEFAULT", "window_analyze_size_in_sec")))
    # self._window_end = self._window_start + self._window_size
    self._domain = str(self._config_parser.get(self.__class__.__name__, "domain"))
    if 'authors' in kwargs and 'posts' in kwargs:
        self.authors = kwargs['authors']
        self.author_guid_posts_dict = kwargs['posts']
    else:
        raise Exception('Author object was not passed as parameter')
    # Optional generator-specific parameters, stored as _<name> when present.
    for optional_key in ('measure', 'calculator_type', 'aggregation_function',
                         'graph_type', 'targeted_class_field_name'):
        if optional_key in kwargs:
            setattr(self, '_' + optional_key, kwargs[optional_key])
def __init__(self, db, **kwargs):
    """Initialize a graph-based feature generator.

    Reads the analysis window and targeted classes from config, requires
    'authors' in kwargs, and requires the complete set of graph parameters.

    :param db: project DB wrapper
    :raises Exception: when authors or any graph parameter is missing
    """
    self.config_parser = getConfig()
    # start_date is stored wrapped as "date('...')"; strip the wrapper.
    raw_start = self.config_parser.get("DEFAULT", "start_date").strip("date('')")
    self._window_start = datetime.datetime.strptime(raw_start, '%Y-%m-%d %H:%M:%S')
    self._window_size = datetime.timedelta(
        seconds=int(self.config_parser.get("DEFAULT", "window_analyze_size_in_sec")))
    self._window_end = self._window_start + self._window_size
    self._db = db
    self._targeted_classes = self.config_parser.eval("DEFAULT", "targeted_classes")
    if 'authors' not in kwargs:
        raise Exception('Author object was not passed as parameter')
    self._authors = kwargs['authors']
    self._author_dict = self._create_author_dictionary(self._authors)
    # All graph parameters must be present before graph setup can proceed.
    required_graph_params = {'graph_types', 'algorithms', 'aggregation_functions',
                             'neighborhood_sizes', 'graph_weights', 'graph_directed'}
    if required_graph_params.issubset(kwargs.viewkeys()):
        self.set_graph_vars(kwargs)
    else:
        raise Exception(
            'Graph parameters for feature generation are missing or incomplete'
        )
def execute(self):
    """Run every configured feature over each author's posts and persist
    the resulting author features.

    Reads the feature list from this generator's config section, computes
    one AuthorFeature per (author, feature) pair for authors that have
    posts, and submits everything to the DB in one batch.
    """
    start_time = time.time()
    logging.info("execute started for " + self.__class__.__name__)
    total_authors = len(self.authors)
    processed_authors = 0
    features = getConfig().eval(self.__class__.__name__, "feature_list")
    authors_features = []
    for author in self.authors:
        author_guid = author.author_guid
        # Direct membership test (the original used `in dict.keys()`, an O(n)
        # scan in py2) and a single consistent key: the original indexed with
        # str(author.author_guid), which can diverge from the tested key for
        # non-ASCII guids (UnicodeEncodeError under py2).
        if author_guid in self.author_guid_posts_dict:
            posts = self.author_guid_posts_dict[author_guid]
            self.cleanUp()  # direct call; original used getattr(self, 'cleanUp')()
            for feature in features:
                author_feature = self.run_and_create_author_feature(
                    author, feature, posts, author_guid, feature)
                authors_features.append(author_feature)
            processed_authors += 1
            print("\r processed authors " + str(processed_authors) + " from " +
                  str(total_authors), end="")
    if authors_features:
        self.submit_author_features_to_db(authors_features)
    diff_time = time.time() - start_time
    print('execute finished in ' + str(diff_time) + ' seconds')
def execute(self, window_start):
    """Export one CSV row per post in the analysis window, combining post
    data, key-post/author scores, topic data and reference counts.

    :param window_start: datetime marking the start of the window
    """
    win_analyze = datetime.timedelta(
        seconds=int(getConfig().get("DEFAULT", "window_analyze_size_in_sec")))
    window_end = window_start + win_analyze
    # Pull every per-window dataset once, up front.
    posts_data = self._db.get_posts_data(window_start, window_end)
    key_posts_score = self._db.get_key_posts_score(window_start, window_end)
    key_authors_score = self._db.get_key_authors_score(window_start, window_end)
    topics_data = self._db.get_topics_data(window_start, window_end)
    posts_references_count = self._db.get_reference_count(window_start, window_end)
    output = []
    for post_id in posts_data:
        row = []
        row.extend(self.get_post_data(posts_data[post_id]))
        row.extend(self.get_post_score(key_posts_score, post_id))
        row.extend(self.get_author_score(key_authors_score, post_id))
        row.extend(self.sample_key_posts_score())
        row.extend(self.get_topic_data(topics_data, post_id))
        row.extend(self.get_posts_references_count(posts_references_count, post_id))
        row.extend(self.get_post_references(post_id))
        output.append(row)
    self.write_posts_to_csv(output)
def __init__(self, db, **kwargs):
    """Initialize the feature generator with window/domain settings from
    config and required author/post inputs.

    :param db: project DB wrapper
    :raises Exception: when 'authors'/'posts' are missing from kwargs
    """
    #AbstractController.__init__(self, db)
    self._db = db
    self._config_parser = getConfig()
    self._targeted_social_network = self._config_parser.get("DEFAULT", "social_network_name")
    # start_date is stored wrapped as "date('...')"; strip the wrapper.
    start_date = self._config_parser.get("DEFAULT", "start_date").strip("date('')")
    self._window_start = datetime.datetime.strptime(start_date, '%Y-%m-%d %H:%M:%S')
    self._window_size = datetime.timedelta(seconds=int(
        self._config_parser.get("DEFAULT", "window_analyze_size_in_sec")))
    self._window_end = self._window_start + self._window_size
    self._domain = unicode(self._config_parser.get(self.__class__.__name__, "domain"))
    if 'authors' in kwargs and 'posts' in kwargs:
        self.authors = kwargs['authors']
        self.author_guid_posts_dict = kwargs['posts']
    else:
        raise Exception('Author object was not passed as parameter')
    # `in` instead of the deprecated dict.has_key(), matching the sibling
    # BaseFeatureGenerator constructor's style.
    if 'measure' in kwargs:
        self._measure = kwargs['measure']
    if 'calculator_type' in kwargs:
        self._calculator_type = kwargs['calculator_type']
    if 'aggregation_function' in kwargs:
        self._aggregation_function = kwargs['aggregation_function']
    if 'graph_type' in kwargs:
        self._graph_type = kwargs['graph_type']
    if 'targeted_class_field_name' in kwargs:
        self._targeted_class_field_name = kwargs['targeted_class_field_name']
def setUp(self):
    """Seed the DB with one 'acquired' bad actor and one plain user, each
    owning ten identical-content posts, for timeline-overlap tests."""
    self.config = getConfig()
    self._db = DB()
    self._db.setUp()
    self.timeline_overlap = TimelineOverlapVisualizationGenerator()

    def _add_author(name, osn_id, author_type=None, author_sub_type=None):
        # All identity fields intentionally share the same name string.
        a = Author()
        a.name = name
        a.domain = 'Microblog'
        a.author_guid = name
        a.author_screen_name = name
        a.author_full_name = name
        a.author_osn_id = osn_id
        a.created_at = datetime.datetime.now()
        a.missing_data_complementor_insertion_date = datetime.datetime.now()
        a.xml_importer_insertion_date = datetime.datetime.now()
        if author_type is not None:
            a.author_type = author_type
        if author_sub_type is not None:
            a.author_sub_type = author_sub_type
        self._db.add_author(a)

    def _add_posts(author_name, guid_prefix):
        # Ten posts with near-identical content so the overlap is detectable.
        for i in range(1, 11):
            p = Post()
            p.post_id = guid_prefix + str(i)
            p.author = author_name
            p.guid = guid_prefix + str(i)
            p.date = datetime.datetime.now()
            p.domain = 'Microblog'
            p.author_guid = author_name
            p.content = 'InternetTV love it' + str(i)
            p.xml_importer_insertion_date = datetime.datetime.now()
            self._db.addPost(p)

    _add_author('acquired_user', 1, author_type='bad_actor', author_sub_type='acquired')
    _add_posts('acquired_user', 'bad_post')
    _add_author('TestUser1', 2)
    _add_posts('TestUser1', 'TestPost')
    self._db.commit()
def __init__(self, db):
    """Initialize the importer with the XML source path from its config
    section; folder/file state is filled in during execution."""
    PostImporter.__init__(self, db)
    self.xmlPath = getConfig().get(self.__class__.__name__, "xml_path")
    # self.xmlPath = configInst.get(self.__class__.__name__,"XMDL_source_path")
    self.fileName = None
    self.CurrFolderPath = None
def __init__(self, db, query):
    """Hold the DB handle, a web-crawlers client, the configured domain,
    and the query this crawler will run."""
    # AbstractController.__init__(self, db)
    self._db = db
    #self._keys = keys
    self._webcrawlers_client = WebCrawlersClient()
    self._config_parser = getConfig()
    domain_setting = self._config_parser.get("DEFAULT", "domain")
    self._domain = unicode(domain_setting)
    self._query = query
def __init__(self, db, keys, query):
    """Hold the DB handle, API keys, a NewsApi client built from those
    keys, the configured domain, and the query to run."""
    # AbstractController.__init__(self, db)
    self._db = db
    self._keys = keys
    self._newsapi_client = NewsApiClient(self._keys)
    self._config_parser = getConfig()
    domain_setting = self._config_parser.get("DEFAULT", "domain")
    self._domain = unicode(domain_setting)
    self._query = query
def __init__(self, db):
    """Set up the Yelp crawler: DB handle, API client, and fixed
    commit/print/radius tuning constants."""
    self._db = db
    self._config_parser = getConfig()
    self._yelp_api = YelpAPI()
    # Tuning constants: flush every 500 records, search radius 100,
    # progress print every 30.
    self._commit_threshold = 500
    self._radius = 100
    self._print_threshold = 30
def __init__(self, db):
    """Set up the Twitter crawler: DB handle, REST API wrapper, configured
    domain, and empty pending user/post buffers."""
    # AbstractController.__init__(self, db)
    self._db = db
    self._twitter_rest_api = Twitter_Rest_Api(db)
    self._config_parser = getConfig()
    self._domain = unicode(self._config_parser.get("DEFAULT", "domain"))
    # Buffers flushed to the DB by the crawl logic.
    self._users_to_add = []
    self._post_to_add = []
def __init__(self, db):
    """Initialize the importer with the analysis date range and its own
    section's data folder and bad-actor threshold.

    :param db: project DB wrapper passed through to PostImporter
    """
    PostImporter.__init__(self, db)
    config_parser = getConfig()
    self.start_date = config_parser.eval("DEFAULT", "start_date")
    self.end_date = config_parser.eval("DEFAULT", "end_date")
    # The original mixed the local `config_parser` with `self._config_parser`
    # (only set by the base class, if at all); use the one local handle
    # consistently for every read.
    self._data_folder = config_parser.eval(self.__class__.__name__, "data_folder")
    self._bad_actor_threshold = config_parser.eval(self.__class__.__name__, "bad_actor_threshold")
def setUp(self):
    """Prepare a fresh DB and run the Tumblr importer end to end."""
    # TestBase.setUp(self)
    self.config = getConfig()
    self._db = DB()
    self._db.setUp()
    self._tumblr_importer = TumblrImporter(self._db)
    # NOTE(review): unlike sibling tests, execute() runs without a prior
    # _tumblr_importer.setUp(path) call — confirm the importer tolerates this.
    self._tumblr_importer.execute()
def setUp(self):
    """Read the configured domain, create a fresh DB, and reset the
    internal structures and counter."""
    self._config_parser = getConfig()
    domain_setting = self._config_parser.get("DEFAULT", "domain")
    self._domain = unicode(domain_setting)
    self._db = DB()
    self._db.setUp()
    self._clear_stractures()
    self._counter = 1
def setUp(self):
    """Create the DB fixture data, the ranked-authors exporter, and read
    its output path and threshold from config."""
    self.config = getConfig()
    self._db = DB()
    self._db.setUp()
    self._ranked_authors_exporter = RankedAuthorsExporter(self._db)
    self._author_guid_dict = {}
    self.make_authors_posts_and_connections()
    exporter_section = 'RankedAuthorsExporter'
    self.csv_location = self.config.eval(exporter_section, 'output_file_path')
    self._threshold = self.config.eval(exporter_section, 'threshold')
def test_umlaut_chars_in_authors_names(self):
    """Authors' names containing umlauts must be transliterated on import
    (o-umlaut -> o, u-umlaut -> u, etc.)."""
    config_parser = getConfig().get_config_parser()
    xml_path = config_parser.get("XMLImporter", "xml_source_path")
    parsed_records = self.xml_importer.parseXMLsToListdict(xml_path)
    author_name = parsed_records[0][u'author']
    # u'\xf6' is o-with-umlaut; it must not survive the encoding step.
    self.assertFalse(u'\xf6' in author_name,
                     "Author name contain umlaut - unlaut wasn't encoded properly")
def setUp(self):
    """Create a fresh DB, the differential word-embedding generator, and
    a single test author with an empty post list."""
    self._config_parser = getConfig()
    self._db = DB()
    self._db.setUp()
    self._model = Word_Embedding_Differential_Feature_Generator(self._db)
    self._posts = []
    self._author = None
    self._set_author(u'test_user')
def __init__(self):
    """Configure logging, open the DB, read the comparison thresholds and
    output locations from config, and ensure the output directory exists."""
    config_parser = getConfig()
    logging.config.fileConfig(getConfig().get("DEFAULT", "logger_conf_file"))
    self._db = DB()
    self._db.setUp()
    self._acquired_bad_authors = []
    self._suspected_authors = []
    section = self.__class__.__name__
    self.common_posts_threshold = config_parser.eval(section, "common_posts_threshold")
    self.output_path = config_parser.eval(section, "output_path")
    self.output_dir = config_parser.eval(section, "output_dir")
    # self.unlabeled_prediction_threshold = config_parser.eval(config_parser.eval? -- original kept disabled)
    # self.unlabeled_prediction_threshold = config_parser.eval(self.__class__.__name__, "unlabeled_prediction_threshold")
    full_output_dir = self.output_path + "/" + self.output_dir
    if not os.path.exists(full_output_dir):
        os.makedirs(full_output_dir)
    self._source_author_destination_author_num_of_mutual_posts_dict = {}