def create_umlaut_author(self):
    """Build and return an Author populated from this fixture's umlaut-encoded fields."""
    author = Author()
    author.name = self.umlaut_author_name
    author.domain = self._domain
    author.author_guid = self.author_guid
    author.author_full_name = self.umlaut_author_full_name
    author.description = self.umlaut_author_description
    author.language = self.umlaut_language
    author.location = self.umlaut_location
    return author
def _create_author(self, guid, author_type):
    """Create a test Author keyed by *guid* and register it in the DB.

    Fix: the original assigned ``author.domain`` twice (``u'Microblog'``
    then ``u'Restaurant'``); the first assignment was dead code and has
    been removed. The effective domain is ``u'Restaurant'``.
    """
    author = Author()
    author.name = unicode(guid)
    author.author_guid = unicode(guid)
    author.author_screen_name = u'TestUser1'
    author.author_type = author_type
    author.domain = u'Restaurant'
    author.author_osn_id = 1
    # start an empty feature list for this author's guid
    self._authors_to_author_features_dict[author.author_guid] = []
    self._db.add_author(author)
def create_dummy_author(self):
    """Return a fully-populated dummy Author for use in tests."""
    return Author(
        name=unicode(self.name),
        domain=unicode(self._domain),
        author_guid=unicode(self.author_guid),
        author_full_name=u"author_full_name",
        author_osn_id=unicode(generate_random_guid()),
        created_at=u"2016-08-24 10:00",
        statuses_count=11,
        followers_count=12,
        favourites_count=14,
        friends_count=15,
        listed_count=16,
        description=u"description",
        language=u"English",
        location=u"Beer Sheva",
        time_zone=u"Israel",
        url=u"http://google.com")
def _add_author(self, author_guid, type=u"good_actor"):
    """Create a Microblog-domain test Author named after *author_guid*,
    persist it, and remember it on ``self._authors``.

    Note: the parameter name ``type`` shadows the builtin but is kept
    unchanged for caller compatibility (keyword callers rely on it).
    """
    author = Author()
    # guid doubles as every name-like field on the test author
    for field in ('author_guid', 'author_full_name', 'author_screen_name', 'name'):
        setattr(author, field, author_guid)
    author.domain = u'Microblog'
    author.author_type = type
    self._db.add_author(author)
    self._authors.append(author)
def _convert_group_to_author(self):
    """
    Method takes given group_id from config.ini and creates Author
    object for it.
    """
    group_author = Author()
    group_author.name = self.get_group_name()
    group_author.author_osn_id = self._group_id
    # guid is derived deterministically from the OSN id
    group_author.author_guid = commons.compute_author_guid_by_osn_id(group_author.author_osn_id)
    group_author.domain = self._domain
    group_author.author_type = "Group"
    group_author.followers_count = self.get_group_number_of_members()
    group_author.author_sub_type = self.get_group_level_of_activity()
    return group_author
def _convert_source_to_author(self, source_id, targeted_fields_dict):
    """Resolve *source_id* into an Author via the source-table mapping in
    *targeted_fields_dict*; falls back to a stub Author when no existing
    author can be located."""
    source_table_name = targeted_fields_dict['source']['table_name']
    source_table_id = targeted_fields_dict['source']['id']
    elements = self._db.get_table_elements_by_ids(source_table_name, source_table_id, [source_id])
    temp_author = elements[0]
    if isinstance(temp_author, Author):
        # the fetched row already is an Author
        author = temp_author
    elif source_table_id == u"author_guid":
        # the id column itself holds author guids
        author = self._db.get_author_by_author_guid(source_id)
    elif hasattr(temp_author, u"author_guid"):
        # the row references an author through a guid attribute
        author = self._db.get_author_by_author_guid(getattr(temp_author, u"author_guid"))
    else:
        # no author found — synthesize a stub carrying the source id
        author = Author()
        author.author_guid = source_id
        # NOTE(review): this counts the keys of the config dict, not actual
        # statuses — confirm this is intended.
        author.statuses_count = len(targeted_fields_dict)
        if hasattr(temp_author, 'created_at'):
            author.created_at = temp_author.created_at
    return author
def create_authors_for_assigning_tests(self):
    """Insert one Author per sub-type using fixed guids, then commit."""
    self.autors_sub_type_author_guid_dict = {
        'private': u'6343dc3298343d4780f6242dd553a2fd',
        'company': u'0db2b25a46203c589db61818cb3bac49',
        'news_feed': u'37e0df45b746342c9c7d80a49e565354',
        'spammer': u'01b5059b6db33133a3a44fb2b8fc3cc2',
        'bot': u'042763a891aa3ec4bd613f5fe34df71c',
        'acquired': u'6be7d7a96bd43afabf40f041044fea9e',
        'crowdturfer': u'6c952b965d7d375192e0107f62cb7f38',
    }
    for sub_type, guid in self.autors_sub_type_author_guid_dict.items():
        author = Author()
        author.name = unicode(sub_type)
        author.domain = unicode(self._domain)
        author.author_guid = unicode(guid)
        self.db.add_author(author)
    self.db.session.commit()
def extract_author(self, data):
    """Map a BuzzFeed record's 'source' field onto a new Author."""
    source_name = str(data['source'])
    author = Author()
    author.name = source_name
    author.author_screen_name = source_name
    author.author_full_name = source_name
    author.domain = 'BuzzFeed'
    author.author_guid = compute_author_guid_by_author_name(data['source'])
    return author
def _create_author(self, post, dataset_affiliation):
    """Derive an Author from *post*, tagging it with the dataset affiliation."""
    screen_name = post.author
    guid = compute_author_guid_by_author_name(screen_name)
    author = Author()
    author.name = screen_name
    author.author_screen_name = screen_name
    # OSN id and guid are intentionally the same derived value
    author.author_osn_id = guid
    author.author_guid = guid
    author.domain = self._domain
    author.author_type = post.post_type
    # the affiliation label is stored in the notifications field
    author.notifications = dataset_affiliation
    return author
def _add_author(self, author_guid):
    """Persist a minimal test Author and keep a reference on ``self._author``."""
    author = Author()
    for field, value in ((u'author_guid', author_guid),
                         (u'author_full_name', u'test author'),
                         (u'name', u'test'),
                         (u'domain', u'tests'),
                         (u'statuses_count', 0)):
        setattr(author, field, value)
    self._db.add_author(author)
    self._author = author
def _create_author_by_citation(self, reblogged_from_metadata):
    """Build an Author for the blog a post was reblogged from."""
    blog_id = reblogged_from_metadata["parent_post_blog_id"]
    author = Author()
    author.author_osn_id = blog_id
    author.name = blog_id
    author.author_guid = compute_author_guid_by_author_name(author.name)
    author.author_screen_name = self._convert_to_unicode_value(reblogged_from_metadata["parent_post_blog_name"])
    author.url = self._convert_to_unicode_value(reblogged_from_metadata["parent_post_short_url"])
    author.domain = self._domain
    return author
def save_to_db(self):
    """Flush buffered posts, claim connections and redditors to the DB.

    Each redditor profile fetch is retried up to ``self._number_of_attempts``
    times; on a reddit 503 the current buffers are flushed via a recursive
    ``save_to_db()`` call and the fetch is retried after a pause. Deleted
    redditors get stub Author rows typed ``u'deleted'``. All buffers are
    cleared at the end.
    """
    self._db.addPosts(self._posts)
    self._db.add_claim_connections(self._claim_tweet_connections)
    self._db.add_claim_connections(self._post_comment_connections)
    authors = []
    reddit_authors = []
    for i, redditor in enumerate(set(self._redditors)):
        # NOTE(review): there is no `break` after a successful fetch, so each
        # redditor is attempted self._number_of_attempts times — confirm intent.
        for attempt in xrange(self._number_of_attempts):
            try:
                self._retrive_reddit_author(authors, i, reddit_authors, redditor)
                print("\rretrive reddit author {0}/{1}".format(i, len(self._redditors)), end='')
            except prawcore.exceptions.ServerError as e:
                # server overloaded — persist what we have, back off, retry
                # NOTE(review): message says 30 sec but the sleep is 5 sec
                print('Server overload code 503, save to DB and sleep 30 sec and try again')
                self.save_to_db()
                time.sleep(5)  # 30)
            except Exception as e:
                print('\r retrive reddit author {0}/{1} exception: {2}'.format(i, len(self._redditors), e.message), end='')
    print()
    for i, redditor in enumerate(set(self._deleted_redditors)):
        # stub rows for accounts that no longer exist
        author = Author()
        author.name = "deleted"
        author.author_guid = compute_author_guid_by_author_name(redditor)
        author.domain = u'reddit'
        author.author_type = u'deleted'
        authors.append(author)
    self._db.add_authors_fast(authors)
    self._db.add_reddit_authors(reddit_authors)
    # reset all buffers now that everything is persisted
    self._posts = []
    self._claim_tweet_connections = []
    self._redditors = []
    self._deleted_redditors = []
    self._post_comment_connections = []
def parse_row(self, row):
    """Parse one inspections CSV row into an Author, or None.

    Returns None when the risk type is unrecognized or the row is
    malformed (missing keys / unparseable values).

    Fix: the bare ``except:`` was narrowed to ``except Exception:`` so
    KeyboardInterrupt/SystemExit are no longer swallowed.
    """
    try:
        author = Author()
        author.author_type = self.parse_type(row['Risk'])
        if author.author_type == -1:
            # unknown risk category — skip this row
            return None
        # strip non-ascii characters from the free-text name field
        author.author_full_name = unicode(row['AKA Name']).encode('ascii', 'ignore').decode('ascii')
        author.location = unicode(row['City'] + ' ' + row['Address'])
        author.geo_enabled = unicode(row['Location'])
        author.name = row['DBA Name']
        author.created_at = unicode(row['Inspection Date'])
        return author
    except Exception:
        # malformed rows are deliberately rejected, not raised
        return None
def _set_author(self, author_guid):
    """Create and persist a test Author derived from *author_guid*."""
    derived_name = u'name' + author_guid
    author = Author()
    author.author_guid = author_guid
    author.author_full_name = derived_name
    author.name = derived_name
    author.domain = u'test'
    self._db.add_author(author)
    self._author = author
def _add_author(self, author_guid):
    """Persist a minimal test Author, commit, and track it in ``self._authors``."""
    author = Author()
    for field, value in [('author_guid', author_guid),
                         ('author_full_name', u'test author'),
                         ('name', u'test'),
                         ('domain', u'test')]:
        setattr(author, field, value)
    self._db.add_author(author)
    self._db.session.commit()
    self._authors.append(author)
def insert_suspended_accounts2(self):
    """Insert Author rows for accounts referenced by posts but missing from
    the authors table, typed ``u"bad_actor"``, then write a CSV summary.

    Tuples with neither guid nor screen name are skipped; a missing guid
    is recomputed from the screen name.

    Fix: ``csv.writer.writerow`` expects a sequence — the original passed
    plain strings, which wrote every character as its own column. Each
    summary line is now wrapped in a one-element list.
    """
    authors = []
    author_screen_names = []
    author_guids = []
    author_guid_author_screen_name_tuples = self._db.get_missing_authors_tuples()
    author_guid_author_screen_name_tuples = list(author_guid_author_screen_name_tuples)
    num_of_suspended_accounts = len(author_guid_author_screen_name_tuples)
    for i, author_guid_author_screen_name_tuple in enumerate(author_guid_author_screen_name_tuples):
        msg = "\rInserting missing authors to authors table: {0}/{1}".format(
            i, num_of_suspended_accounts)
        print(msg, end="")
        author_guid = author_guid_author_screen_name_tuple[0]
        author_screen_name = author_guid_author_screen_name_tuple[1]
        if author_guid is None and author_screen_name is None:
            # nothing usable to key the author on
            continue
        author = Author()
        author.author_screen_name = author_screen_name
        author.name = author_screen_name
        author_screen_names.append(author_screen_name)
        if author_guid is None:
            # derive the guid from the screen name when it is missing
            author_guid = compute_author_guid_by_author_name(author_screen_name)
        author.author_guid = author_guid
        author_guids.append(author_guid)
        author.author_type = u"bad_actor"
        author.domain = self._domain
        authors.append(author)
    self._db.add_authors(authors)
    with open(self._output_path + "insert_suspended_accounts.csv", 'w') as output_file:
        writer = csv.writer(output_file)
        writer.writerow(
            ["Number of suspended_users_added_to_authors_table is: " +
             str(num_of_suspended_accounts)])
        author_screen_names_str = ','.join(author_screen_names)
        writer.writerow(["author_screen_names: " + author_screen_names_str])
        author_guids_str = ','.join(author_guids)
        writer.writerow(["author_guids: " + author_guids_str])
def fill_tweet_retweet_connection(self):
    '''
    Fetches the original tweets being retweeted by our posts.
    Updates the following tables:
        * Post_Citations table with tweet-retweet connection
        * Posts table with missing tweets
        * Authors with the authors of the missing tweets
    '''
    retweets_with_no_tweet_citation = self._db.get_retweets_with_no_tweet_citation()
    logging.info("Updating tweet-retweet connection of {0} retweets".format(len(retweets_with_no_tweet_citation)))
    self._posts = []
    self._authors = []
    self._post_citatsions = []
    i = 1
    for post_guid, post_url in retweets_with_no_tweet_citation.iteritems():
        # logging.info("Analyzing retweet: {0} - {1}".format(post_guid, post_url))
        msg = "\r Analyzing retweet: {0} - {1} [{2}".format(post_guid, post_url, i) + "/" + str(
            len(retweets_with_no_tweet_citation)) + '] '
        print(msg, end="")
        i += 1
        tweet_data = self.extract_retweet_data(retweet_guid=post_guid, retweet_url=post_url)
        if tweet_data is not None:
            # add the original tweet as a Post if we have not stored it yet
            if not self._db.isPostExist(tweet_data.tweet_url):
                post = Post(guid=tweet_data.tweet_guid, post_id=tweet_data.tweet_guid,
                            url=tweet_data.tweet_url, date=str_to_date(tweet_data.tweet_date),
                            title=tweet_data.tweet_content, content=tweet_data.tweet_content,
                            post_osn_id=tweet_data.tweet_twitter_id,
                            retweet_count=tweet_data.tweet_retweet_count,
                            favorite_count=tweet_data.tweet_favorite_count,
                            author=tweet_data.tweet_author_name,
                            author_guid=tweet_data.tweet_author_guid,
                            domain=self._domain,
                            original_tweet_importer_insertion_date=unicode(get_current_time_as_string()))
                self._posts.append(post)
            # add the original tweet's author if unknown in this domain
            if not self._db.is_author_exists(tweet_data.tweet_author_guid, self._domain):
                author = Author(name=tweet_data.tweet_author_name, domain=self._domain,
                                author_guid=tweet_data.tweet_author_guid,
                                original_tweet_importer_insertion_date=unicode(get_current_time_as_string()))
                self._authors.append(author)
            # record the retweet -> original-tweet citation once
            if not self._db.is_post_citation_exist(tweet_data.retweet_guid, tweet_data.tweet_guid):
                post_citation = Post_citation(post_id_from=tweet_data.retweet_guid,
                                              post_id_to=tweet_data.tweet_guid,
                                              url_from=tweet_data.retweet_url,
                                              url_to=tweet_data.tweet_url)
                self._post_citatsions.append(post_citation)
    self.update_tables_with_tweet_retweet_data(self._posts, self._authors, self._post_citatsions)
def _convert_group_members_to_author(self, users_id_to_name_dict):
    """
    Convert the member id -> (name, screen_name) mapping into Author rows.

    :return: a list of Author objects ready to be added to DB.
    """
    authors = []
    for user_id, name_pair in users_id_to_name_dict.items():
        member = Author()
        member.name = name_pair[0]
        member.author_screen_name = name_pair[1]
        member.author_osn_id = user_id
        member.author_guid = commons.compute_author_guid_by_osn_id(user_id)
        member.domain = self._domain
        member.author_type = "User"
        authors.append(member)
    return authors
def parse_row(self, row):
    """Parse one inspections CSV row into an Author, or None.

    Rows with an unknown risk category, an empty geo location, or any
    parsing error are rejected (the error case is logged).

    Fix: the bare ``except:`` was narrowed to ``except Exception:`` so
    KeyboardInterrupt/SystemExit are no longer swallowed.
    """
    try:
        author = Author()
        author.author_type = self.parse_type(row['risk_category'])
        if author.author_type == -1:
            # unknown risk category — skip this row
            return None
        # strip non-ascii characters from the free-text name fields
        author.author_full_name = unicode(row['business_name']).encode('ascii', 'ignore').decode('ascii')
        author.name = unicode(author.author_full_name).encode('ascii', 'ignore').decode('ascii')
        author.location = unicode(row['business_city'] + ' ' + row['business_address'])
        author.geo_enabled = unicode(row['business_location'])
        if author.geo_enabled is None or author.geo_enabled == "":
            # rows without a geo location are useless downstream
            return None
        author.created_at = unicode(row['inspection_date'])
        return author
    except Exception:
        logging.info("error with row:" + str(row))
        return None
def insert_suspended_accounts(self):
    """Create ``bad_actor`` Author rows for posts whose author is missing
    from the authors table, backfill ``author_guid`` on those posts, and
    write a plain-text summary report."""
    authors = []
    author_screen_names = []
    author_guids = []
    missing_author_posts = self._db.get_posts_of_missing_authors()
    num_of_missing_posts = len(missing_author_posts)
    for i, missing_author_post in enumerate(missing_author_posts):
        msg = "\rInserting missing authors to authors table: {0}/{1}".format(
            i, num_of_missing_posts)
        print(msg, end="")
        author = Author()
        author_screen_name = missing_author_post.author
        author.author_screen_name = author_screen_name
        author.name = author_screen_name
        author_screen_names.append(author_screen_name)
        # guid is derived deterministically from the screen name
        author_guid = compute_author_guid_by_author_name(
            author_screen_name)
        author.author_guid = author_guid
        author_guids.append(author_guid)
        author.author_type = u"bad_actor"
        author.domain = self._domain
        authors.append(author)
        # update the missing guid to post
        missing_author_post.author_guid = author_guid
    self._db.add_authors(authors)
    self._db.addPosts(missing_author_posts)
    # write a human-readable summary of what was inserted
    with open(self._output_path + "insert_suspended_accounts.txt", 'w') as output_file:
        output_file.write(
            "Number of suspended_users_added_to_authors_table is: " +
            str(num_of_missing_posts))
        output_file.write("\n")
        author_screen_names_str = ','.join(author_screen_names)
        output_file.write("author_screen_names: " + author_screen_names_str)
        output_file.write("\n")
        author_guids_str = ','.join(author_guids)
        output_file.write("author_guids: " + author_guids_str)
def _convert_pages_to_authors(self, pages_id_to_name_dict):
    """
    Convert the page id -> name mapping into Author rows.

    :return: a list of Author objects ready to be added to DB
    """
    authors = []
    for page_id, page_name in pages_id_to_name_dict.items():
        page_author = Author()
        page_author.name = page_name
        page_author.author_osn_id = page_id
        page_author.author_guid = commons.compute_author_guid_by_osn_id(page_id)
        page_author.domain = self._domain
        page_author.author_type = "Page"
        authors.append(page_author)
    return authors
def add_graph_to_db(cls, graph):
    """Persist a NetworkX graph: one 'labels' Post representing the graph,
    an AuthorConnection per edge, and an Author per node.

    NOTE(review): the connections and authors are inserted through
    ``cls._db.addPosts`` — presumably a generic bulk-insert; confirm it
    accepts non-Post entities.
    """
    post = Post(post_id=str(graph.graph['name']), domain='flickr',
                post_osn_id=str(graph.graph['name']))
    post.post_type = 'labels'
    author_connections = []
    for edge in graph.edges():
        # edge endpoints are used directly as author guids
        author_connections.append(
            AuthorConnection(source_author_guid=edge[0], destination_author_guid=edge[1],
                             connection_type=graph.graph['name']))
    authors = []
    for node in graph.nodes():
        # the graph name doubles as the authors' domain
        authors.append(
            Author(name=str(node), domain=str(graph.graph['name']), author_guid=str(node)))
    cls._db.addPosts([post])
    cls._db.addPosts(author_connections)
    cls._db.addPosts(authors)
def execute(self, window_start):
    """Read screen names from the configured CSV and insert an Author row
    for each (``window_start`` is unused; kept for interface compatibility)."""
    screen_names = pd.read_csv(self._path_to_file)['author_screen_name'].tolist()
    total = len(screen_names)
    authors = []
    for i, screen_name in enumerate(screen_names):
        author = Author()
        print("\rCreate author: [{0}/{1}]".format(i, total), end="")
        author.author_screen_name = screen_name
        author.name = screen_name
        author.author_guid = compute_author_guid_by_author_name(screen_name)
        author.domain = self._domain
        authors.append(author)
    self._db.addPosts(authors)
def _json_user_to_db_author_converter(self, user, domain=u'Instagram_author'):
    """Convert an Instagram user JSON dict into an Author row."""
    username = user['username']
    author = Author()
    author.name = username
    author.author_screen_name = username
    author.author_guid = compute_author_guid_by_author_name(username)
    author.domain = domain
    author.author_type = domain
    author.author_osn_id = user['id']
    author.author_full_name = user['full_name']
    # setdefault kept deliberately: it also inserts the key into *user*
    author.description = user.setdefault('biography', None)
    author.url = u'https://www.instagram.com/' + username
    author.profile_image_url = user['profile_pic_url']
    return author
def create_authors_for_deleting_tests(self):
    """Seed five Authors (mixed bad/good actors and sub-types) with fixed
    guids for the deletion test-cases, then commit.

    NOTE(review): guids 1-2 are wrapped in str() while guids 3-5 are
    assigned raw — presumably equivalent here; confirm intended.
    """
    self.guid1 = '83d5812f-ff13-46d8-8c1c-3f17a48c239f'
    author = Author()
    author.name = 'author1'
    author.domain = str(self._domain)
    author.author_guid = str(self.guid1)
    author.author_type = 'bad_actor'
    author.author_sub_type = 'crowdturfer'
    self.db.add_author(author)
    self.guid2 = '08fffd68-52f9-45dd-a1ea-7c2a1b0206c4'
    author = Author()
    author.name = 'author2'
    author.domain = str(self._domain)
    author.author_guid = str(self.guid2)
    author.author_type = 'bad_actor'
    author.author_sub_type = None
    self.db.add_author(author)
    self.guid3 = 'a041d99d-7adc-47ad-a32b-ac24c1e43c03'
    author = Author()
    author.name = 'author3'
    author.domain = self._domain
    author.author_guid = self.guid3
    author.author_type = 'bad_actor'
    author.author_sub_type = 'bot'
    self.db.add_author(author)
    self.guid4 = '06bc3c1b-0350-428f-b66c-7d476f442643'
    author = Author()
    author.name = 'author4'
    author.domain = self._domain
    author.author_guid = self.guid4
    author.author_type = 'good_actor'
    author.author_sub_type = None
    self.db.add_author(author)
    self.guid5 = 'c5c1d938-1196-4bab-9f5e-23092c7be053'
    author = Author()
    author.name = 'author5'
    author.domain = self._domain
    author.author_guid = self.guid5
    author.author_type = 'bad_actor'
    author.author_sub_type = 'acquired'
    self.db.add_author(author)
    self.db.session.commit()
def _create_author_by_row(self, record_dict):
    """Build a Tumblr Author from a raw record dict.

    ``created_at`` falls back to the importer's start date when the epoch
    field is absent.

    Fixes: ``author.protected`` was assigned twice — first via
    ``get_boolean_value(...)``, then unconditionally overwritten by the
    TRUE/FALSE check below; the dead first assignment was removed.
    Effective behavior (unchanged): protected == 1 only when the raw field
    equals "TRUE". Also renamed a local that shadowed ``datetime``.
    """
    author = Author()
    author_osn_id = self._convert_to_unicode_value(
        record_dict["tumblog_id"])
    author.author_osn_id = author_osn_id
    author.name = author_osn_id
    author.domain = self._domain
    author.author_guid = compute_author_guid_by_author_name(author.name)
    tumblr_blog_name = self._convert_to_unicode_value(
        record_dict["tumblr_blog_name"])
    author.author_screen_name = tumblr_blog_name
    author.description = self._convert_to_unicode_value(
        record_dict["tumblr_blog_description"])
    created_time_epoch = self._convert_to_unicode_value(
        record_dict["created_time_epoch"])
    if created_time_epoch is not None:
        # only the string form is stored; the datetime object is unused
        _dt, str_datetime = convert_epoch_timestamp_to_datetime(
            created_time_epoch)
        author.created_at = str_datetime
    else:
        author.created_at = self._set_start_date()
    author.url = self._convert_to_unicode_value(
        record_dict["tumblr_blog_url"])
    author.time_zone = self._convert_to_unicode_value(
        record_dict["timezone"])
    author.language = self._convert_to_unicode_value(
        record_dict["language"])
    is_private = record_dict["is_private"]
    if is_private == "TRUE":
        author.protected = 1
    else:
        author.protected = 0
    return author
def setUp(self):
    """Create a DB with one fully-populated Author and one Post, then run
    AccountPropertiesFeatureGenerator over them."""
    self._db = DB()
    self._db.setUp()
    self.author_guid = u"author_guid"
    author = Author()
    author.author_guid = self.author_guid
    author.author_full_name = u'author'
    author.name = u'author_name'
    author.author_screen_name = u'author_screen_name'
    author.domain = u'Microblog'
    author.statuses_count = 10
    author.friends_count = 5
    author.followers_count = 6
    author.favourites_count = 8
    author.author_sub_type = u"bot"
    author.author_type = u"bad"
    author.created_at = u"2017-06-17 05:00:00"
    author.default_profile = True
    author.default_profile_image = True
    author.verified = True
    self._db.add_author(author)
    post = Post()
    post.author = self.author_guid
    post.author_guid = self.author_guid
    post.content = u"content"
    post.title = u"title"
    post.domain = u"domain"
    post.post_id = u"post_id"
    post.guid = post.post_id
    post.date = convert_str_to_unicode_datetime("2017-06-14 05:00:00")
    post.created_at = post.date
    self._db.addPost(post)
    self._db.session.commit()
    self.feature_prefix = u"AccountPropertiesFeatureGenerator_"
    # generator input: authors list plus a guid -> posts mapping
    self.account_properties_feature_generator = AccountPropertiesFeatureGenerator(
        self._db, **{
            'authors': [author],
            'posts': {
                self.author_guid: [post]
            }
        })
    self.account_properties_feature_generator.execute()
def commenter_to_author(self, commenter, photo_id):
    """Convert a Flickr comment record into an Author; the photo id is
    stored in the domain field."""
    field_values = {
        'name': str(commenter['authorname']),
        'author_screen_name': str(commenter['path_alias']),
        'author_full_name': str(commenter.get('realname', "")),
        'url': str(commenter['permalink']),
        'domain': str(photo_id),
        'created_at': str(commenter["datecreate"]),
        'author_osn_id': str(commenter['author']),
        'author_type': 'comment',
    }
    author = Author()
    for field, value in field_values.items():
        setattr(author, field, value)
    # guid is derived from the OSN id, not the display name
    author.author_guid = compute_author_guid_by_author_name(author.author_osn_id)
    return author
def get_about_info_from_users(self):
    """Log in to Facebook and scrape name/screen name for each configured
    OSN id, defaulting every profile detail field to 'User Blocked', then
    enrich and persist the authors.

    Fix: ``author.gender = 'User Blocked'`` was assigned twice in a row;
    the duplicate line was removed (no behavior change).
    """
    self._facebook_login()
    authors = []
    for author_osn_id in self.osn_ids:
        author = Author()
        author.domain = self._domain
        author.author_osn_id = author_osn_id
        author.author_type = 'User'
        # defaults in case the profile details cannot be scraped
        author.education = 'User Blocked'
        author.professional_skills = 'User Blocked'
        author.past_residence = 'User Blocked'
        author.birth_day = 'User Blocked'
        author.gender = 'User Blocked'
        author.email = 'User Blocked'
        # Need to add the rest of the features with User Blocked as default.
        author.work = 'User Blocked'
        self.driver.get('https://www.facebook.com/' + author_osn_id)
        a_element = self.driver.find_element_by_xpath(
            "//a[@class='_2nlw _2nlv']")
        href_attribute = a_element.get_attribute('href')
        name = a_element.text  # Extracting name
        unique_user_name = self._parse_unique_user_name(href_attribute)
        author.name = name
        author.author_screen_name = unique_user_name
        author.author_guid = commons.compute_author_guid_by_osn_id(
            author_osn_id)
        authors.append(author)
    self._get_about_info_for_authors(authors)
    self._db.addPosts(authors)
def setUp(self):
    """Seed the DB with two authors — an 'acquired' bad actor and a plain
    test user — each with ten near-identical posts, for the timeline
    overlap visualization tests."""
    self.config = getConfig()
    self._db = DB()
    self._db.setUp()
    self.timeline_overlap = TimelineOverlapVisualizationGenerator()
    # bad actor: acquired account
    author1 = Author()
    author1.name = 'acquired_user'
    author1.domain = 'Microblog'
    author1.author_guid = 'acquired_user'
    author1.author_screen_name = 'acquired_user'
    author1.author_full_name = 'acquired_user'
    author1.author_osn_id = 1
    author1.created_at = datetime.datetime.now()
    author1.missing_data_complementor_insertion_date = datetime.datetime.now(
    )
    author1.xml_importer_insertion_date = datetime.datetime.now()
    author1.author_type = 'bad_actor'
    author1.author_sub_type = 'acquired'
    self._db.add_author(author1)
    for i in range(1, 11):
        post1 = Post()
        post1.post_id = 'bad_post' + str(i)
        post1.author = 'acquired_user'
        post1.guid = 'bad_post' + str(i)
        post1.date = datetime.datetime.now()
        post1.domain = 'Microblog'
        post1.author_guid = 'acquired_user'
        post1.content = 'InternetTV love it' + str(i)
        post1.xml_importer_insertion_date = datetime.datetime.now()
        self._db.addPost(post1)
    # ordinary user with overlapping post content
    author = Author()
    author.name = 'TestUser1'
    author.domain = 'Microblog'
    author.author_guid = 'TestUser1'
    author.author_screen_name = 'TestUser1'
    author.author_full_name = 'TestUser1'
    author.author_osn_id = 2
    author.created_at = datetime.datetime.now()
    author.missing_data_complementor_insertion_date = datetime.datetime.now(
    )
    author.xml_importer_insertion_date = datetime.datetime.now()
    self._db.add_author(author)
    for i in range(1, 11):
        post = Post()
        post.post_id = 'TestPost' + str(i)
        post.author = 'TestUser1'
        post.guid = 'TestPost' + str(i)
        post.date = datetime.datetime.now()
        post.domain = 'Microblog'
        post.author_guid = 'TestUser1'
        post.content = 'InternetTV love it' + str(i)
        post.xml_importer_insertion_date = datetime.datetime.now()
        self._db.addPost(post)
    self._db.commit()