def save_results(self, scrape_result: StoryScraping, mysql_helper: MySQLHelper):
    """Persist scraped stories and their viewers.

    Stories are inserted with INSERT IGNORE (a story row never changes),
    while viewer rows are upserted.

    :return: tuple (stories inserted, viewer rows written)
    """
    cursor = mysql_helper.get_cursor()
    story_records = []
    viewer_records = []
    for story in scrape_result.stories:
        story_records.append(StoryRecord(
            story.story_id,
            story.story_owner_id,
            story.story_owner_user_name,
            story.display_url,
            # Timestamps arrive as epoch seconds; stored as datetimes.
            datetime.fromtimestamp(story.taken_at_timestamp),
            datetime.fromtimestamp(story.expiring_at_timestamp)))
        viewer_records.extend(
            ViewerRecord(scrape_result.scrape_id, scrape_result.scrape_ts,
                         story.story_id, story.story_owner_id,
                         story.story_owner_user_name, story.story_view_count,
                         viewer.user_id, viewer.username, viewer.rank)
            for viewer in story.viewers)
    story_cnt = mysql_helper.insert_ignore(self.STORIES_TABLE, story_records, cursor)
    viewer_cnt = mysql_helper.insert_on_duplicate_update(
        self.VIEWERS_TABLE, viewer_records, cursor)
    return story_cnt, viewer_cnt
def save_results(self, scrape_result: MediaLikersScraping, mysql_helper: MySQLHelper):
    """Persist media records, taggees and media interactions.

    Returns tuple with counts of inserted records"""
    cursor = mysql_helper.get_cursor()
    media_records = []
    taggees_records = []
    interaction_records = []
    sid = scrape_result.scrape_id
    sts = scrape_result.scrape_ts
    for media in scrape_result.media:
        owner = media.owner
        media_records.append(MediaRecord(
            sid, sts, media.id, media.media_type, media.taken_at_timestamp,
            owner.username, owner.user_id, media.display_url,
            media.comments_amount, media.likes_amount, len(media.taggees)))
        for taggee in media.taggees:
            taggees_records.append(TaggeeRecord(
                sid, sts, media.id, media.media_type, owner.user_id,
                owner.username, media.taken_at_timestamp,
                taggee.user_id, taggee.username))
        # Likes and comments land in the same interactions table.
        for liker in media.likers:
            interaction_records.append(MediaLikeRecord(
                sid, sts, media.id, owner.user_id, owner.username,
                liker.user_id, liker.username))
        for commenter in media.comments:
            interaction_records.append(MediaCommentRecord(
                sid, sts, media.id, owner.user_id, owner.username,
                commenter.user_id, commenter.username))
    media_cnt = taggees_cnt = operations_cnt = 0
    if media_records:
        media_cnt = mysql_helper.insert_on_duplicate_update(
            self.MEDIA_TABLE, media_records, cursor)
    if taggees_records:
        taggees_cnt = mysql_helper.insert_on_duplicate_update(
            self.TAGGEES_TABLE, taggees_records, cursor)
    if interaction_records:
        operations_cnt = mysql_helper.insert_on_duplicate_update(
            self.MEDIA_INTERACTIONS_TABLE, interaction_records, cursor)
    return media_cnt, taggees_cnt, operations_cnt
def insert_raw_followers(self, mysql: MySQLHelper, cursor: Cursor, unfollowers: Set[InstaUser], src_user: InstaUser, scrape_ts: datetime):
    """Insert one raw follow row per user in *unfollowers* for *src_user*."""
    self.logger.info("Insert raw follows...")
    records = []
    for followed in unfollowers:
        records.append(RawFollowRecord(src_user.user_id, src_user.username,
                                       followed.user_id, followed.username,
                                       scrape_ts))
    mysql.insert(self.RAW_FOLLOWS_TABLE, records, cursor)
    self.logger.info("Done updating raw table records")
def main(self, group_name: str, scrape_follows: bool = True, scrape_followers: bool = True, limit_users: Optional[int] = None, max_follows_to_scrape_amount: Optional[int] = 500):
    """Run the follows audit for every user in the given group.

    Users that no longer exist are logged and skipped.
    """
    mysql = MySQLHelper('mysql-insta-local')
    users = self.get_users(group_name, mysql,
                           max_follows=max_follows_to_scrape_amount,
                           limit=limit_users)
    self.logger.info("Found %d users for group %s", len(users), group_name)
    # One scraper instance is shared across all users (keeps session state).
    core_scraper = UserFollowsScraper()
    audit = UserFollowsAudit()
    for user in users:
        self.logger.info("Handling user %s", user.username)
        try:
            audit.main(user.username, scrape_follows=scrape_follows,
                       scrape_followers=scrape_followers,
                       scraper=core_scraper,
                       max_follow_amount=max_follows_to_scrape_amount)
        except UserDoesNotExist:
            self.logger.warning("User %s does not exist. skip.", user.username)
        self.logger.info("Done handling user %s", user.username)
    self.logger.info("Done scraping group follows")
def find_trending_event_type(
        self, mysql: MySQLHelper, from_date: Optional[datetime],
        to_date: Optional[datetime],
        days_back: Optional[int] = None) -> List[TrendingFollowEvent]:
    """Find destination users that received more than one follow event of the
    same type within the time window ("trending" users).

    :param from_date: from_date to query from
    :param to_date: to_date to query
    :param days_back: If given, ignore from_date/to_date
    """
    ts_filter, params = self.build_ts_filter(from_date, to_date, days_back)
    sql = """
        select dst_user_name, follow_type_id, min(ts) as first_ts,
               max(ts) as last_ts, count(*) cnt,
               group_concat(src_user_name) as users
        from follow_events
        where {ts_filter}
        group by dst_user_name, follow_type_id
        having count(*) > 1
        order by cnt desc;
    """.format(ts_filter=ts_filter)
    rows = mysql.query(sql, params)
    return [
        TrendingFollowEvent(row.users.split(','), row.dst_user_name,
                            row.first_ts, row.last_ts, row.follow_type_id,
                            row.cnt)
        for row in rows
    ]
def _update_agg_unfollowers(self, mysql: MySQLHelper, cursor: Cursor, unfollowers: Set[InstaUser], src_user: str, scrape_ts: datetime, follow_side: str):
    """Mark *unfollowers* as no longer following on the given side in the agg table.

    :param unfollowers: users whose follow edge should be cleared
    :param src_user: src_user_name whose rows are updated
    :param follow_side: column prefix ("src"/"dst") interpolated into the SQL
    """
    self.logger.info("Updating unfollowers in agg table (found %d)",
                     len(unfollowers))
    if not unfollowers:
        # An empty IN () clause is invalid SQL, and there is nothing to do.
        return
    # Bind the user ids as parameters instead of formatting quoted values
    # into the statement (avoids quoting/injection issues). The file already
    # uses qmark-style placeholders elsewhere.
    placeholders = ', '.join(['?'] * len(unfollowers))
    params = [scrape_ts, src_user] + [dst_user.user_id for dst_user in unfollowers]
    sql = """
        UPDATE {table}
        SET {side}_unfollows_latest_timestamp = ?, {side}_follows = 0
        WHERE src_user_name = ? and {side}_user_id in ({ids})
    """.format(table=self.FOLLOWS_TABLE, ids=placeholders, side=follow_side)
    mysql.execute(sql, params, cursor)
    self.logger.info("Done updating agg table records")
def update_agg_followers(self, mysql: MySQLHelper, cursor: Cursor, follows: List[UserFollows], analyzed: (List[dict], Dict[str, InstaUser]), users: Dict[int, InstaUser], unfollowers: Set[InstaUser], scrape_ts: datetime):
    """Upsert one FollowRecord per analyzed follow edge into the agg table.

    Edges whose destination is in *unfollowers* get scrape_ts as their
    unfollow timestamp; all others get None.
    """
    self.logger.info("Inserting following records into agg table...")
    src_user = follows[0].user
    records = []
    for edge in analyzed:
        dst_user = users[edge['dst_id']]
        unfollow_ts = scrape_ts if dst_user in unfollowers else None
        records.append(FollowRecord(
            src_user.user_id, src_user.username,
            dst_user.user_id, dst_user.username,
            edge['src_follows'], scrape_ts, scrape_ts,
            edge['dst_follows'], scrape_ts, scrape_ts, unfollow_ts))
    mysql.insert_on_duplicate_update(self.FOLLOWS_TABLE, records, cursor)
    self.logger.info("done insert follows to agg table")
def main(self):
    """Scrape story viewers and persist the results to the local MySQL DB."""
    mysql = MySQLHelper('mysql-insta-local')
    viewers_scraper = StoryViewersScraper()
    result = viewers_scraper.scrape_viewers()
    self.save_results(result, mysql)
    mysql.commit()
    mysql.close()
def find_mutual_event_type(
        self, mysql: MySQLHelper, from_date: Optional[datetime],
        to_date: Optional[datetime], mutual_event_timeframe_days: int,
        days_back: Optional[int] = None) -> Set[MutualFollowEvent]:
    """Find users that have both followed each other or unfollowed each other

    :param from_date: from_date to query from
    :param to_date: to_date to query
    :param days_back: If given, ignore from_date/to_date
    :param mutual_event_timeframe_days: Maximum amount of days for it to be considered a mutual event type
        For example, mutual unfollow is only if they have both unfollowed each other in the past 2 days.
    """
    # The same time window is applied to both aliases of the self-join, so
    # the filter (and its bind parameters) is built once per alias.
    ts_filter_1, params_1 = self.build_ts_filter(from_date, to_date, days_back, ts_col="fe1.ts")
    ts_filter_2, params_2 = self.build_ts_filter(from_date, to_date, days_back, ts_col="fe2.ts")
    ts_filter = "({}) and ({})".format(ts_filter_1, ts_filter_2)
    # Parameter order must match placeholder order in the final SQL:
    # fe1 filter params, fe2 filter params, then the timeframe appended below.
    params = params_1 + params_2
    # Self-join pairs events where A->B and B->A happened with the same
    # follow type (both followed, or both unfollowed).
    sql = """
        select fe1.src_user_name as user_name_1, fe1.src_user_id as user_id_1,
               fe2.src_user_name as user_name_2, fe2.src_user_id as user_id_2,
               fe1.ts as user_1_event_ts, fe2.ts as user_2_event_ts,
               fe1.follow_type_id as follow_type_id,
               abs(timestampdiff(day, fe1.ts, fe2.ts)) as day_diff
        from follow_events fe1
        join follow_events fe2
            on fe1.dst_user_id = fe2.src_user_id
            and fe1.src_user_id = fe2.dst_user_id
            and fe1.follow_type_id = fe2.follow_type_id
        where {ts_filter}
        and abs(timestampdiff(day, fe1.ts, fe2.ts)) < ?
    """.format(ts_filter=ts_filter)
    params.append(mutual_event_timeframe_days)
    mutual_events_records = mysql.query(sql, params)
    # The self-join yields every pair twice, once as (A, B) and once as
    # (B, A); the set presumably dedupes these -- depends on
    # MutualFollowEvent defining symmetric __eq__/__hash__; confirm.
    events = set()
    for row in mutual_events_records:
        mutual_event = MutualFollowEvent(
            UserEvent(InstaUser(row.user_id_1, row.user_name_1),
                      row.user_1_event_ts, row.follow_type_id),
            UserEvent(InstaUser(row.user_id_2, row.user_name_2),
                      row.user_2_event_ts, row.follow_type_id))
        events.add(mutual_event)
    return events
def get_new_media(self, mysql: MySQLHelper, from_date: Optional[datetime], to_date: Optional[datetime], days_back: Optional[int]):
    """Return MediaRecords taken inside the given time window.

    At least one of from_date / to_date / days_back must be provided.
    """
    assert from_date is not None or to_date is not None or days_back is not None
    ts_filter, ts_params = self.build_ts_filter(from_date, to_date, days_back,
                                                ts_col="taken_at_ts")
    query = """
        select *
        from media
        where {ts_filter}
        order by scrape_ts desc, taken_at_ts asc
    """.format(ts_filter=ts_filter)
    rows = mysql.query(query, ts_params)
    return [MediaRecord.from_row(row) for row in rows]
def get_users(self, group_name: str, mysql: MySQLHelper, limit: Optional[int] = None) -> List[InstaUser]:
    """Return the group's users, least recently scraped first.

    :param limit: optional cap appended to the query as a LIMIT clause
    """
    self.logger.debug("Getting users for group %s", group_name)
    query = self.GET_USERS_QUERY
    if limit is not None:
        query = "{} limit {}".format(query, limit)
    rows = mysql.query(query, [group_name])
    users = []
    for row in rows:
        users.append(InstaUser(row.user_id, row.user_name))
    self.logger.debug("Done querying users")
    return users
def main():
    """Replay unfollow UPDATE statements recovered from the user-follows log.

    One-off repair script: parses the log file with module-level regexes,
    rebuilds each UPDATE's timestamp/user parameters and re-executes the
    statements against the local MySQL database.
    """
    with open('/opt/InstaProfiler/logs/user-follows.log') as fp:
        txt = fp.read()
    matches = LOG_RECORD_REGEX.findall(txt)
    all_queries = []
    unfollow_users_distinct = set()
    for match in matches:
        # match[5] is the log message payload; only UPDATE records matter.
        if 'UPDATE follows' in match[5]:
            unfollow_params = UNFOLLOW_PARAMS_REGEX.search(match[5])
            ts_params = [
                unfollow_params.group('year'),
                unfollow_params.group('month'),
                unfollow_params.group('day'),
                unfollow_params.group('hour'),
                unfollow_params.group('minute'),
                unfollow_params.group('second'),
                unfollow_params.group('frac')
            ]
            # NOTE(review): 'frac' is passed as datetime's microsecond
            # argument; if the log stores fractional seconds with fewer than
            # six digits this is off by a power of ten -- confirm the format.
            unfollow_ts = datetime(*[int(x) for x in ts_params])
            src_user = unfollow_params.group('src_user')
            # The ids were logged already quoted, e.g. "'123', '456'".
            unfollow_users = UNFOLLOW_USERS_REGEX.search(
                match[5]).group('users').split(', ')
            # NOTE(review): batches of 8+ users are skipped and only printed;
            # presumably those were handled manually -- confirm.
            if len(unfollow_users) < 8:
                for u in unfollow_users:
                    unfollow_users_distinct.add(u.strip("'"))
                query = "UPDATE follows set dst_follows=0, dst_unfollows_latest_timestamp=? where src_user_name=? and dst_user_id in ({users})".format(
                    users=','.join(unfollow_users))
                params = (unfollow_ts, src_user)
                print(query)
                all_queries.append((query, params))
            else:
                print("Too much users", len(unfollow_users))
    print(','.join(unfollow_users_distinct))
    odbc_helper = MySQLHelper('mysql-insta-local')
    cursor = odbc_helper.get_cursor()
    # Replay the collected statements in log order, then commit once.
    for query in all_queries:
        odbc_helper.execute(query[0], query[1], cursor)
    odbc_helper.commit()
    odbc_helper.close()
def main(self, from_date: Optional[str] = None, to_date: Optional[str] = None, days_back: Optional[int] = None, mutual_event_timeframe_days: int = DEFAULT_MUTUAL_TIMEFRAME_DAYS):
    """Query mutual and trending follow events for the window and mail a report.

    Dates are accepted as strings (DATE_FORMAT) and parsed here; at least one
    of from_date / to_date / days_back must be given.
    """
    assert from_date is not None or to_date is not None or days_back is not None
    if to_date is not None:
        to_date = datetime.strptime(to_date, DATE_FORMAT)
    if from_date is not None:
        from_date = datetime.strptime(from_date, DATE_FORMAT)
    mysql = MySQLHelper('mysql-insta-local')
    mutual_follows, mutual_unfollows = self.get_mutual_follow_events(
        mysql, from_date, to_date, mutual_event_timeframe_days, days_back)
    trending_follows, trending_unfollows = self.get_trending_follow_events(
        mysql, from_date, to_date, days_back)
    self.logger.info(
        "found %d mutual follow events and %d mutual unfollow events",
        len(mutual_follows), len(mutual_unfollows))
    self.logger.info(
        "found %d trending follow events and %d trending unfollow events",
        len(trending_follows), len(trending_unfollows))
    has_data = (len(mutual_follows) > 0 or len(mutual_unfollows) > 0
                or len(trending_follows) > 0 or len(trending_unfollows) > 0)
    if not has_data:
        self.logger.info("No data to send by mail")
        return
    msg = self.prepare_mail(mutual_follows, mutual_unfollows,
                            trending_follows, trending_unfollows)
    self.logger.info("Exporting email")
    exporter = SendGridExporter()
    resp = exporter.send_email('*****@*****.**', ['*****@*****.**'],
                               'גללתי ומצאתי - אחד על אחד',
                               html_content=msg)
    self.logger.info("Done exporting.")
def get_current_follows(
        self, mysql: MySQLHelper, user: str,
        cursor: Optional[Cursor] = None) -> Optional[UserFollows]:
    """Load the persisted follow state for *user* from the agg table.

    Returns None when the user has no stored rows at all.
    """
    rows = mysql.query(
        "select * from {0} where src_user_name = ?".format(self.FOLLOWS_TABLE),
        [user], cursor)
    if not rows:
        return None
    followers = set()
    follows = set()
    for row in rows:
        if row.dst_follows:
            followers.add(InstaUser(row.dst_user_id, row.dst_user_name))
        if row.src_follows:
            follows.add(InstaUser(row.dst_user_id, row.dst_user_name))
    # NOTE(review): src_user_name is passed twice to InstaUser here -- the
    # third positional argument looks like it should be something else
    # (full name?); confirm against the InstaUser constructor.
    src = InstaUser(rows[0].src_user_id, rows[0].src_user_name,
                    rows[0].src_user_name)
    return UserFollows(src, followers, follows)
def main(self, from_date: Optional[str] = None, to_date: Optional[str] = None, days_back: Optional[int] = None):
    """Find media taken in the given window and mail a report if any exist.

    Dates are accepted as strings (DATE_FORMAT); at least one of
    from_date / to_date / days_back must be given.
    """
    assert from_date is not None or to_date is not None or days_back is not None
    if to_date is not None:
        to_date = datetime.strptime(to_date, DATE_FORMAT)
    if from_date is not None:
        from_date = datetime.strptime(from_date, DATE_FORMAT)
    mysql = MySQLHelper('mysql-insta-local')
    new_media = self.get_new_media(mysql, from_date, to_date, days_back)
    self.logger.info("%d new media objects were found", len(new_media))
    if not new_media:
        self.logger.info("No data to send.")
        return
    msg = self.prepare_mail(new_media)
    self.logger.info("Exporting email")
    exporter = SendGridExporter()
    resp = exporter.send_email('*****@*****.**', ['*****@*****.**'],
                               'גללתי ומצאתי - פוסטים חדשים',
                               html_content=msg)
    self.logger.info("Done exporting.")
def main(self, user: InstaUser, scraper: Optional[MediaScraper] = None, scrape_likers: bool = False, scrape_comments: bool = False, likers_scrape_threshold: Optional[int] = None, media_max_likers_amount: Optional[int] = None, max_media_limit: Optional[int] = None):
    """Scrape one user's media (optionally with likers/comments) and persist it.

    Private users not followed by the viewer are skipped with a warning.
    """
    scraper = scraper or MediaScraper()
    # Upgrade a partial user object (e.g. username only) to a full profile.
    if not user.from_full_profile:
        user = scraper.scrape_user(user.username)
    if user.is_private and not user.followed_by_viewer:
        self.logger.warning("user is private and not followed by viewer. skipping scraping...")
        return
    mysql = MySQLHelper('mysql-insta-local')
    scrape_result = scraper.scrape_media(user, scrape_likers, scrape_comments,
                                         likers_scrape_threshold,
                                         media_max_likers_amount,
                                         max_media_limit)
    self.save_results(scrape_result, mysql)
    mysql.commit()
    mysql.close()
def main(self, group_name: str, limit_users: Optional[int] = None, max_media_to_scrape_amount: Optional[int] = 500, scrape_likers: bool = False, scrape_comments: bool = False, likers_threshold: Optional[int] = None, media_likers_limit: Optional[int] = None):
    """Run the media audit for every user in the given group.

    Users that no longer exist are logged and skipped.
    """
    mysql = MySQLHelper('mysql-insta-local')
    users = self.get_users(group_name, mysql, limit=limit_users)
    self.logger.info("Found %d users for group %s", len(users), group_name)
    # One scraper instance is shared across all users (keeps session state).
    core_scraper = MediaScraper()
    audit = MediaLikersAudit()
    for user in users:
        self.logger.info("Handling user %s", user.username)
        try:
            audit.main(user, core_scraper, scrape_likers, scrape_comments,
                       likers_threshold, media_likers_limit,
                       max_media_to_scrape_amount)
        except UserDoesNotExist:
            self.logger.warning("User %s does not exist. skip.", user.username)
        self.logger.info("Done handling user %s", user.username)
    self.logger.info("Done scraping group media")
def persist_user(self, mysql: MySQLHelper, cursor: Cursor, user: InstaUser, scrape_ts: datetime):
    """Upsert a snapshot of the user's profile into the user-info table."""
    self.logger.debug("persisting user to mysql...")
    record = InstaUserRecord.from_insta_user(scrape_ts, user)
    mysql.insert_on_duplicate_update(self.USER_INFO_TABLE, [record], cursor)
def main(self, user: Union[InstaUser, str] = DEFAULT_USER_NAME, only_mutual: bool = False,
         scrape_follows: bool = True, scrape_followers: bool = True,
         max_follow_amount: Optional[int] = None,
         scraper: Optional[UserFollowsScraper] = None):
    """
    :param user: User to parse its follows.
    :param only_mutual: If set to True, will save only people that are both followers and follows.
        Useful when src has many followers. This will make sure only "relevant" people are saved
    :param scrape_follows: If given, will only scrape user's follow
    :param scrape_followers: If given, will only scrape user's followers
    :param max_follow_amount: If given, will only scrape follows (follows/followers apart) if amount is under max_follow_amount
    :return:
    """
    scraper = scraper or UserFollowsScraper()
    # A plain username is resolved to a full profile first.
    if isinstance(user, str):
        user = scraper.scrape_user(user)
    if user is None:
        raise UserDoesNotExist()
    # Each side (followers / follows) is skipped independently when its
    # count exceeds the configured cap.
    if scrape_followers and max_follow_amount is not None and user.followed_by_amount > max_follow_amount:
        self.logger.warning(
            "user is followed by too many people (followed by %d, max %d), skipping followers...",
            user.followed_by_amount, max_follow_amount)
        scrape_followers = False
    if scrape_follows and max_follow_amount is not None and user.follows_amount > max_follow_amount:
        self.logger.warning(
            "user follows too many people (follows %d, max %d), skipping follows...",
            user.follows_amount, max_follow_amount)
        scrape_follows = False
    mysql = MySQLHelper('mysql-insta-local')
    cursor = mysql.get_cursor()
    if user.is_private and not user.followed_by_viewer:
        # Private profiles we don't follow cannot be scraped; the user-info
        # snapshot below is still persisted with the current time.
        self.logger.warning(
            "user is private and not followed by viewer. skipping scraping..."
        )
        scrape_ts = datetime.now()
    else:
        follow_scrape = scraper.parse_user_follows([user], scrape_follows, scrape_followers)
        follows, scrape_id, scrape_ts = follow_scrape.follows, follow_scrape.scrape_id, follow_scrape.scrape_ts
        # Previous persisted state, used to diff out unfollows / new follows.
        current_follows = self.get_current_follows(mysql, user.username, cursor)
        analyzed, users = UserFollowsAnalyzer.analyze_follows(
            follows, only_mutual)
        # Update unfollowers
        # NOTE(review): the guard checks scrape_follows but the comment below
        # talks about followers -- confirm the intended condition.
        if scrape_follows:
            # No followers are scraped so they could all be considered as unfollowed
            dst_has_unfollowed = set(
            ) if current_follows is None else current_follows.followers.difference(
                follows[0].followers)
            if len(dst_has_unfollowed) > 0:
                self.update_agg_dst_unfollowers(mysql, cursor,
                                                dst_has_unfollowed,
                                                user.username, scrape_ts)
            src_has_unfollowed = set(
            ) if current_follows is None else current_follows.follows.difference(
                follows[0].follows)
            if len(src_has_unfollowed):
                self.handle_unfollowers(mysql, cursor, src_has_unfollowed,
                                        user, scrape_ts)
            # Insert new records
            if sum(len(x.follows) for x in follows) > 0:
                self.update_agg_followers(mysql, cursor, follows, analyzed,
                                          users, src_has_unfollowed, scrape_ts)
            # Raw rows are only added for follows not already persisted.
            new_follows = follows[
                0].follows if current_follows is None else follows[
                0].follows.difference(current_follows.follows)
            if len(new_follows) > 0:
                self.insert_raw_followers(mysql, cursor, new_follows, user,
                                          scrape_ts)
    # Update user info
    self.persist_user(mysql, cursor, user, scrape_ts)
    mysql.commit()
    cursor.close()
    mysql.close()
    self.logger.info("Done UserFollowsScraper main")