def save_results(self, scrape_result: StoryScraping,
                 mysql_helper: MySQLHelper):
    """Persist scraped stories and their viewers.

    Stories go to STORIES_TABLE (insert-ignore), viewer rows to
    VIEWERS_TABLE (upsert).

    :param scrape_result: scrape payload holding the stories and viewers
    :param mysql_helper: open DB helper used for the inserts
    :return: (story_count, viewer_count) tuple of affected row counts
    """
    cursor = mysql_helper.get_cursor()
    stories = []
    viewers = []
    for story in scrape_result.stories:
        stories.append(StoryRecord(
            story.story_id, story.story_owner_id,
            story.story_owner_user_name, story.display_url,
            datetime.fromtimestamp(story.taken_at_timestamp),
            datetime.fromtimestamp(story.expiring_at_timestamp)))
        # One row per (scrape, story, viewer) combination.
        viewers.extend(
            ViewerRecord(scrape_result.scrape_id, scrape_result.scrape_ts,
                         story.story_id, story.story_owner_id,
                         story.story_owner_user_name,
                         story.story_view_count, viewer.user_id,
                         viewer.username, viewer.rank)
            for viewer in story.viewers)

    story_cnt = mysql_helper.insert_ignore(self.STORIES_TABLE, stories,
                                           cursor)
    viewer_cnt = mysql_helper.insert_on_duplicate_update(
        self.VIEWERS_TABLE, viewers, cursor)
    return story_cnt, viewer_cnt
 def save_results(self, scrape_result: MediaLikersScraping, mysql_helper: MySQLHelper):
     """Persist media records, taggees and media interactions. Returns tuple with counts of inserted records"""
     cursor = mysql_helper.get_cursor()
     media_rows, taggee_rows, interaction_rows = [], [], []
     scrape_id, scrape_ts = scrape_result.scrape_id, scrape_result.scrape_ts
     for media in scrape_result.media:
         owner = media.owner
         media_rows.append(MediaRecord(
             scrape_id, scrape_ts, media.id, media.media_type,
             media.taken_at_timestamp, owner.username, owner.user_id,
             media.display_url, media.comments_amount, media.likes_amount,
             len(media.taggees)))
         for taggee in media.taggees:
             taggee_rows.append(TaggeeRecord(
                 scrape_id, scrape_ts, media.id, media.media_type,
                 owner.user_id, owner.username, media.taken_at_timestamp,
                 taggee.user_id, taggee.username))
         # Likes and comments share one interactions table.
         interaction_rows.extend(
             MediaLikeRecord(scrape_id, scrape_ts, media.id, owner.user_id,
                             owner.username, liker.user_id, liker.username)
             for liker in media.likers)
         interaction_rows.extend(
             MediaCommentRecord(scrape_id, scrape_ts, media.id,
                                owner.user_id, owner.username,
                                commenter.user_id, commenter.username)
             for commenter in media.comments)
     media_cnt = taggees_cnt = operations_cnt = 0
     if media_rows:
         media_cnt = mysql_helper.insert_on_duplicate_update(
             self.MEDIA_TABLE, media_rows, cursor)
     if taggee_rows:
         taggees_cnt = mysql_helper.insert_on_duplicate_update(
             self.TAGGEES_TABLE, taggee_rows, cursor)
     if interaction_rows:
         operations_cnt = mysql_helper.insert_on_duplicate_update(
             self.MEDIA_INTERACTIONS_TABLE, interaction_rows, cursor)
     return media_cnt, taggees_cnt, operations_cnt
# Beispiel #3
# 0
 def insert_raw_followers(self, mysql: MySQLHelper, cursor: Cursor,
                          unfollowers: Set[InstaUser], src_user: InstaUser,
                          scrape_ts: datetime):
     """Insert one raw follow record per user into the raw follows table."""
     self.logger.info("Insert raw follows...")
     records = []
     for dst_user in unfollowers:
         records.append(
             RawFollowRecord(src_user.user_id, src_user.username,
                             dst_user.user_id, dst_user.username,
                             scrape_ts))
     mysql.insert(self.RAW_FOLLOWS_TABLE, records, cursor)
     self.logger.info("Done updating raw table records")
 def main(self,
          group_name: str,
          scrape_follows: bool = True,
          scrape_followers: bool = True,
          limit_users: Optional[int] = None,
          max_follows_to_scrape_amount: Optional[int] = 500):
     """Run the follows audit for every user in a group.

     :param group_name: group whose users should be scraped
     :param scrape_follows: forwarded to the per-user audit
     :param scrape_followers: forwarded to the per-user audit
     :param limit_users: cap on how many users to fetch from the group
     :param max_follows_to_scrape_amount: per-user follow-count ceiling
     """
     mysql = MySQLHelper('mysql-insta-local')
     users = self.get_users(group_name,
                            mysql,
                            max_follows=max_follows_to_scrape_amount,
                            limit=limit_users)
     self.logger.info("Found %d users for group %s", len(users), group_name)
     # The group query is done; the per-user audit opens its own connection,
     # so release this one instead of leaking it (fix: was never closed).
     mysql.close()
     core_scraper = UserFollowsScraper()
     scraper = UserFollowsAudit()
     for user in users:
         self.logger.info("Handling user %s", user.username)
         try:
             scraper.main(user.username,
                          scrape_follows=scrape_follows,
                          scrape_followers=scrape_followers,
                          scraper=core_scraper,
                          max_follow_amount=max_follows_to_scrape_amount)
         except UserDoesNotExist:
             # Missing users are skipped, not fatal for the group run.
             self.logger.warning("User %s does not exist. skip.",
                                 user.username)
         self.logger.info("Done handling user %s", user.username)
     self.logger.info("Done scraping group follows")
    def find_trending_event_type(
            self,
            mysql: MySQLHelper,
            from_date: Optional[datetime],
            to_date: Optional[datetime],
            days_back: Optional[int] = None) -> List[TrendingFollowEvent]:
        """Find users that received more than one follow event of the same
        type inside the time window.

        :param mysql: open DB helper used to run the query
        :param from_date: from_date to query from
        :param to_date: to_date to query
        :param days_back: If given, ignore from_date/to_date
        :return: trending events, most active destination users first
        """
        ts_filter, params = self.build_ts_filter(from_date, to_date, days_back)
        sql = """
            select dst_user_name,
                   follow_type_id,
                   min(ts)                     as first_ts,
                   max(ts)                     as last_ts,
                   count(*)                       cnt,
                   group_concat(src_user_name) as users
            from follow_events
            where {ts_filter}
            group by dst_user_name, follow_type_id
            having count(*) > 1
            order by cnt desc;
        """.format(ts_filter=ts_filter)

        rows = mysql.query(sql, params)
        events = []
        for row in rows:
            events.append(
                TrendingFollowEvent(row.users.split(','), row.dst_user_name,
                                    row.first_ts, row.last_ts,
                                    row.follow_type_id, row.cnt))
        return events
# Beispiel #6
# 0
 def _update_agg_unfollowers(self, mysql: MySQLHelper, cursor: Cursor,
                             unfollowers: Set[InstaUser], src_user: str,
                             scrape_ts: datetime, follow_side: str):
     """Mark *unfollowers* as unfollowed in the agg table.

     Sets ``{side}_follows = 0`` and stamps
     ``{side}_unfollows_latest_timestamp`` for each matching dst user of
     *src_user*, where side is *follow_side* (e.g. 'src'/'dst').
     """
     self.logger.info("Updating unfollowers in agg table (found %d)",
                      len(unfollowers))
     if not unfollowers:
         # Fix: the original built an empty "IN ()" clause here, which is
         # invalid SQL. Nothing to do for an empty set.
         self.logger.info("No unfollowers to update")
         return
     # Fix: bind the user ids as query parameters instead of splicing
     # quoted values straight into the statement text.
     placeholders = ', '.join(['?'] * len(unfollowers))
     params = [scrape_ts, src_user] + [u.user_id for u in unfollowers]
     sql = """
         UPDATE {table}
         SET {side}_unfollows_latest_timestamp = ?, {side}_follows = 0
         WHERE src_user_name = ? and {side}_user_id in ({ids})
     """.format(table=self.FOLLOWS_TABLE,
                ids=placeholders,
                side=follow_side)
     mysql.execute(sql, params, cursor)
     self.logger.info("Done updating agg table records")
# Beispiel #7
# 0
    def update_agg_followers(self, mysql: MySQLHelper, cursor: Cursor,
                             follows: List[UserFollows],
                             analyzed: List[dict],
                             users: Dict[int, InstaUser],
                             unfollowers: Set[InstaUser], scrape_ts: datetime):
        """Upsert analyzed follow relations into the aggregated follows table.

        :param follows: scraped follow sets; the first entry's user is the
                        source user for every record
        :param analyzed: relation dicts carrying 'dst_id', 'src_follows' and
                         'dst_follows' keys (annotation fixed — the original
                         tuple literal was not a valid type annotation)
        :param users: lookup by user id used to resolve 'dst_id'
        :param unfollowers: users whose unfollow timestamp should be stamped
        :param scrape_ts: timestamp of the current scrape
        """
        self.logger.info("Inserting following records into agg table...")
        if not follows:
            # Fix: the original indexed follows[0] unconditionally and would
            # raise IndexError on an empty list.
            self.logger.warning("No follows given - nothing to insert")
            return
        src_user = follows[0].user
        records = []
        for f in analyzed:
            dst_user = users[f['dst_id']]
            unfollow_ts = scrape_ts if dst_user in unfollowers else None
            records.append(
                FollowRecord(src_user.user_id, src_user.username,
                             dst_user.user_id, dst_user.username,
                             f['src_follows'], scrape_ts, scrape_ts,
                             f['dst_follows'], scrape_ts, scrape_ts,
                             unfollow_ts))

        mysql.insert_on_duplicate_update(self.FOLLOWS_TABLE, records, cursor)
        self.logger.info("done insert follows to agg table")
 def main(self):
     """Scrape story viewers and persist the results to MySQL."""
     mysql = MySQLHelper('mysql-insta-local')
     try:
         scraper = StoryViewersScraper()
         scrape_result = scraper.scrape_viewers()
         self.save_results(scrape_result, mysql)
         mysql.commit()
     finally:
         # Fix: release the connection even when scraping/saving fails.
         mysql.close()
    def find_mutual_event_type(
            self,
            mysql: MySQLHelper,
            from_date: Optional[datetime],
            to_date: Optional[datetime],
            mutual_event_timeframe_days: int,
            days_back: Optional[int] = None) -> Set[MutualFollowEvent]:
        """Find users that have both followed each other or unfollowed each other

        :param mysql: open DB helper used to run the query
        :param from_date: from_date to query from
        :param to_date: to_date to query
        :param days_back: If given, ignore from_date/to_date
        :param mutual_event_timeframe_days: Maximum amount of days for it to be considered a mutual event type
                                            For example, mutual unfollow is only if they have both unfollowed each other
                                            in the past 2 days.
        :return: set of MutualFollowEvent instances
        """
        # The same window filter is built twice, once per side of the
        # self-join, so both events must fall inside the requested range.
        ts_filter_1, params_1 = self.build_ts_filter(from_date,
                                                     to_date,
                                                     days_back,
                                                     ts_col="fe1.ts")
        ts_filter_2, params_2 = self.build_ts_filter(from_date,
                                                     to_date,
                                                     days_back,
                                                     ts_col="fe2.ts")
        ts_filter = "({}) and ({})".format(ts_filter_1, ts_filter_2)
        # Parameter order must mirror placeholder order in the SQL:
        # fe1 filter params, fe2 filter params, then the day-diff bound
        # appended below.
        params = params_1 + params_2
        # Self-join pairs each event with its mirror (src/dst swapped) of
        # the same follow_type_id; day_diff bounds how far apart they are.
        sql = """
                select fe1.src_user_name as user_name_1,
                       fe1.src_user_id as user_id_1,
                       fe2.src_user_name as user_name_2,
                       fe2.src_user_id as user_id_2,
                       fe1.ts as user_1_event_ts,
                       fe2.ts as user_2_event_ts,
                       fe1.follow_type_id as follow_type_id,
                       abs(timestampdiff(day, fe1.ts, fe2.ts)) as day_diff
                from follow_events fe1
                         join follow_events fe2 on fe1.dst_user_id = fe2.src_user_id
                    and fe1.src_user_id = fe2.dst_user_id and fe1.follow_type_id = fe2.follow_type_id
                where {ts_filter} and abs(timestampdiff(day, fe1.ts, fe2.ts)) < ?
        """.format(ts_filter=ts_filter)

        params.append(mutual_event_timeframe_days)
        mutual_events_records = mysql.query(sql, params)
        events = set()
        for row in mutual_events_records:
            # NOTE(review): the join yields each pair twice ((A,B) and (B,A));
            # presumably MutualFollowEvent hashes symmetrically so the set
            # collapses the duplicates — confirm its __eq__/__hash__.
            mutual_event = MutualFollowEvent(
                UserEvent(InstaUser(row.user_id_1, row.user_name_1),
                          row.user_1_event_ts, row.follow_type_id),
                UserEvent(InstaUser(row.user_id_2, row.user_name_2),
                          row.user_2_event_ts, row.follow_type_id))
            events.add(mutual_event)
        return events
# Beispiel #10
# 0
 def get_new_media(self, mysql: MySQLHelper, from_date: Optional[datetime], to_date: Optional[datetime],
                   days_back: Optional[int]):
     """Fetch media rows whose taken_at_ts falls inside the time window.

     :param mysql: open DB helper used to run the query
     :param from_date: inclusive lower bound (or None)
     :param to_date: inclusive upper bound (or None)
     :param days_back: if given, used by build_ts_filter instead of the dates
     :return: list of MediaRecord built from the matching rows
     :raises ValueError: if no time bound at all is provided
     """
     if from_date is None and to_date is None and days_back is None:
         # Fix: explicit validation instead of assert (stripped under -O).
         raise ValueError(
             "One of from_date, to_date or days_back must be given")
     ts_filter, ts_params = self.build_ts_filter(from_date, to_date, days_back, ts_col="taken_at_ts")
     query = """
     select *
     from media
     where {ts_filter}
     order by scrape_ts desc, taken_at_ts asc
     """.format(ts_filter=ts_filter)
     records = mysql.query(query, ts_params)
     return [MediaRecord.from_row(record) for record in records]
 def get_users(self,
               group_name: str,
               mysql: MySQLHelper,
               limit: Optional[int] = None) -> List[InstaUser]:
     """Gets users to scrape its media objects. Ordered by ascending last_scrape_ts
     So it will start parsing users we haven't scraped lately

     :param group_name: group to fetch users for
     :param mysql: open DB helper used to run the query
     :param limit: optional cap on the number of users returned
     """
     self.logger.debug("Getting users for group %s", group_name)
     query = self.GET_USERS_QUERY
     if limit is not None:
         # Fix: int() guards the formatted LIMIT clause — LIMIT cannot be a
         # bound parameter in most drivers, so sanitize before splicing.
         query += " limit {}".format(int(limit))
     params = [group_name]
     res = mysql.query(query, params)
     users = [InstaUser(row.user_id, row.user_name) for row in res]
     self.logger.debug("Done querying users")
     return users
# Beispiel #12
# 0
    def main():
        """Replay unfollow UPDATE statements recovered from the scraper log.

        One-off repair utility: scans the user-follows log for
        'UPDATE follows' records, rebuilds the timestamp/user parameters
        from the logged text and re-executes the updates against the local
        MySQL instance.
        """
        with open('/opt/InstaProfiler/logs/user-follows.log') as fp:
            txt = fp.read()
        matches = LOG_RECORD_REGEX.findall(txt)
        all_queries = []
        unfollow_users_distinct = set()
        for match in matches:
            # match[5] is presumably the message field of LOG_RECORD_REGEX —
            # TODO confirm against the regex's group layout.
            if 'UPDATE follows' in match[5]:
                unfollow_params = UNFOLLOW_PARAMS_REGEX.search(match[5])
                ts_params = [
                    unfollow_params.group('year'),
                    unfollow_params.group('month'),
                    unfollow_params.group('day'),
                    unfollow_params.group('hour'),
                    unfollow_params.group('minute'),
                    unfollow_params.group('second'),
                    unfollow_params.group('frac')
                ]
                # NOTE(review): 'frac' feeds datetime's microsecond argument;
                # if the log stores a different fractional unit this is off —
                # confirm the logged timestamp format.
                unfollow_ts = datetime(*[int(x) for x in ts_params])
                src_user = unfollow_params.group('src_user')
                unfollow_users = UNFOLLOW_USERS_REGEX.search(
                    match[5]).group('users').split(', ')
                # Suspiciously long id lists are skipped (sanity bound).
                if len(unfollow_users) < 8:
                    for u in unfollow_users:
                        unfollow_users_distinct.add(u.strip("'"))
                    # Ids are spliced in already-quoted, exactly as captured
                    # from the original logged statement.
                    query = "UPDATE follows set dst_follows=0, dst_unfollows_latest_timestamp=? where src_user_name=? and dst_user_id in ({users})".format(
                        users=','.join(unfollow_users))
                    params = (unfollow_ts, src_user)
                    print(query)
                    all_queries.append((query, params))
                else:
                    print("Too much users", len(unfollow_users))

        print(','.join(unfollow_users_distinct))
        odbc_helper = MySQLHelper('mysql-insta-local')
        cursor = odbc_helper.get_cursor()
        for query in all_queries:
            odbc_helper.execute(query[0], query[1], cursor)
        odbc_helper.commit()
        odbc_helper.close()
    def main(self,
             from_date: Optional[str] = None,
             to_date: Optional[str] = None,
             days_back: Optional[int] = None,
             mutual_event_timeframe_days: int = DEFAULT_MUTUAL_TIMEFRAME_DAYS):
        """Find mutual and trending follow events in a window and mail a report.

        :param from_date: lower bound, formatted per DATE_FORMAT (or None)
        :param to_date: upper bound, formatted per DATE_FORMAT (or None)
        :param days_back: if given, used instead of the explicit dates
        :param mutual_event_timeframe_days: max day gap for a mutual event
        :raises ValueError: if no time bound at all is provided
        """
        if from_date is None and to_date is None and days_back is None:
            # Fix: explicit validation instead of assert (stripped under -O).
            raise ValueError(
                "One of from_date, to_date or days_back must be given")
        if to_date is not None:
            to_date = datetime.strptime(to_date, DATE_FORMAT)
        if from_date is not None:
            from_date = datetime.strptime(from_date, DATE_FORMAT)
        mysql = MySQLHelper('mysql-insta-local')
        try:
            mutual_follow_events, mutual_unfollow_events = self.get_mutual_follow_events(
                mysql, from_date, to_date, mutual_event_timeframe_days,
                days_back)
            trending_follow_events, trending_unfollow_events = self.get_trending_follow_events(
                mysql, from_date, to_date, days_back)
        finally:
            # Fix: the connection was never closed; release it even on error.
            mysql.close()
        self.logger.info(
            "found %d mutual follow events and %d mutual unfollow events",
            len(mutual_follow_events), len(mutual_unfollow_events))
        self.logger.info(
            "found %d trending follow events and %d trending unfollow events",
            len(trending_follow_events), len(trending_unfollow_events))

        if (mutual_follow_events or mutual_unfollow_events
                or trending_follow_events or trending_unfollow_events):
            msg = self.prepare_mail(mutual_follow_events,
                                    mutual_unfollow_events,
                                    trending_follow_events,
                                    trending_unfollow_events)

            self.logger.info("Exporting email")
            exporter = SendGridExporter()
            # Return value intentionally ignored (was an unused local);
            # failures surface as exceptions.
            exporter.send_email('*****@*****.**',
                                ['*****@*****.**'],
                                'גללתי ומצאתי - אחד על אחד',
                                html_content=msg)

            self.logger.info("Done exporting.")
        else:
            self.logger.info("No data to send by mail")
# Beispiel #14
# 0
 def get_current_follows(
         self,
         mysql: MySQLHelper,
         user: str,
         cursor: Optional[Cursor] = None) -> Optional[UserFollows]:
     """Load the persisted follow state for *user* from the agg table.

     :param mysql: open DB helper used to run the query
     :param user: source user name to look up
     :param cursor: optional existing cursor to reuse
     :return: UserFollows with followers/follows sets, or None when no
              rows exist for the user
     """
     res = mysql.query(
         "select * from {0} where src_user_name = ?".format(
             self.FOLLOWS_TABLE), [user], cursor)
     followers = set()
     follows = set()
     if len(res) == 0:
         return None
     for r in res:
         # dst_follows: the dst user follows src, i.e. a follower of src.
         if r.dst_follows:
             followers.add(InstaUser(r.dst_user_id, r.dst_user_name))
         # src_follows: src follows the dst user.
         if r.src_follows:
             follows.add(InstaUser(r.dst_user_id, r.dst_user_name))
     # NOTE(review): src_user_name is passed twice to InstaUser below — the
     # third argument looks like it should be a different field (full name?).
     # Confirm InstaUser's constructor signature.
     return UserFollows(
         InstaUser(res[0].src_user_id, res[0].src_user_name,
                   res[0].src_user_name), followers, follows)
# Beispiel #15
# 0
    def main(self, from_date: Optional[str] = None, to_date: Optional[str] = None, days_back: Optional[int] = None):
        """Mail a report of newly scraped media in the given time window.

        :param from_date: lower bound, formatted per DATE_FORMAT (or None)
        :param to_date: upper bound, formatted per DATE_FORMAT (or None)
        :param days_back: if given, used instead of the explicit dates
        :raises ValueError: if no time bound at all is provided
        """
        if from_date is None and to_date is None and days_back is None:
            # Fix: explicit validation instead of assert (stripped under -O).
            raise ValueError(
                "One of from_date, to_date or days_back must be given")
        if to_date is not None:
            to_date = datetime.strptime(to_date, DATE_FORMAT)
        if from_date is not None:
            from_date = datetime.strptime(from_date, DATE_FORMAT)
        mysql = MySQLHelper('mysql-insta-local')
        try:
            new_media = self.get_new_media(mysql, from_date, to_date,
                                           days_back)
        finally:
            # Fix: the connection was never closed; release it even on error.
            mysql.close()
        self.logger.info("%d new media objects were found", len(new_media))
        if len(new_media) > 0:
            msg = self.prepare_mail(new_media)

            self.logger.info("Exporting email")
            exporter = SendGridExporter()
            # Return value intentionally ignored (was an unused local).
            exporter.send_email('*****@*****.**', ['*****@*****.**'], 'גללתי ומצאתי - פוסטים חדשים',
                                html_content=msg)

            self.logger.info("Done exporting.")
        else:
            self.logger.info("No data to send.")
# Beispiel #16
# 0
    def main(self, user: InstaUser, scraper: Optional[MediaScraper] = None, scrape_likers: bool = False,
             scrape_comments: bool = False, likers_scrape_threshold: Optional[int] = None,
             media_max_likers_amount: Optional[int] = None, max_media_limit: Optional[int] = None):
        """Scrape a single user's media (optionally likers/comments) and persist.

        Private users that the viewer does not follow are skipped entirely.
        """
        if scraper is None:
            scraper = MediaScraper()

        # Re-fetch the user when only a partial profile was supplied.
        if not user.from_full_profile:
            user = scraper.scrape_user(user.username)

        # Guard clause: nothing to scrape for an inaccessible private profile.
        if user.is_private and not user.followed_by_viewer:
            self.logger.warning("user is private and not followed by viewer. skipping scraping...")
            return

        mysql = MySQLHelper('mysql-insta-local')
        scrape_result = scraper.scrape_media(user, scrape_likers, scrape_comments, likers_scrape_threshold,
                                             media_max_likers_amount, max_media_limit)
        self.save_results(scrape_result, mysql)
        mysql.commit()
        mysql.close()
 def main(self,
          group_name: str,
          limit_users: Optional[int] = None,
          max_media_to_scrape_amount: Optional[int] = 500,
          scrape_likers: bool = False,
          scrape_comments: bool = False,
          likers_threshold: Optional[int] = None,
          media_likers_limit: Optional[int] = None):
     """Run the media audit for every user in a group.

     :param group_name: group whose users should be scraped
     :param limit_users: cap on how many users to fetch from the group
     :param max_media_to_scrape_amount: per-user cap on media objects
     :param scrape_likers: forwarded to the per-user audit
     :param scrape_comments: forwarded to the per-user audit
     :param likers_threshold: forwarded to the per-user audit
     :param media_likers_limit: forwarded to the per-user audit
     """
     mysql = MySQLHelper('mysql-insta-local')
     users = self.get_users(group_name, mysql, limit=limit_users)
     self.logger.info("Found %d users for group %s", len(users), group_name)
     # The group query is done; the per-user audit opens its own connection,
     # so release this one instead of leaking it (fix: was never closed).
     mysql.close()
     core_scraper = MediaScraper()
     scraper = MediaLikersAudit()
     for user in users:
         self.logger.info("Handling user %s", user.username)
         try:
             scraper.main(user, core_scraper, scrape_likers,
                          scrape_comments, likers_threshold,
                          media_likers_limit, max_media_to_scrape_amount)
         except UserDoesNotExist:
             # Missing users are skipped, not fatal for the group run.
             self.logger.warning("User %s does not exist. skip.",
                                 user.username)
         self.logger.info("Done handling user %s", user.username)
     self.logger.info("Done scraping group media")
# Beispiel #18
# 0
 def persist_user(self, mysql: MySQLHelper, cursor: Cursor, user: InstaUser,
                  scrape_ts: datetime):
     """Upsert a snapshot of *user*'s profile into the user-info table."""
     self.logger.debug("persisting user to mysql...")
     record = InstaUserRecord.from_insta_user(scrape_ts, user)
     mysql.insert_on_duplicate_update(self.USER_INFO_TABLE, [record], cursor)
# Beispiel #19
# 0
    def main(self,
             user: Union[InstaUser, str] = DEFAULT_USER_NAME,
             only_mutual: bool = False,
             scrape_follows: bool = True,
             scrape_followers: bool = True,
             max_follow_amount: Optional[int] = None,
             scraper: Optional[UserFollowsScraper] = None):
        """Scrape a user's follows/followers, diff against the stored state and persist.

        :param user: User to parse its follows. A plain username string is
                     resolved to a full InstaUser first.
        :param only_mutual: If set to True, will save only people that are both followers and follows.
                            Useful when src has many followers. This will make sure only "relevant" people are saved
        :param scrape_follows: If given, will only scrape user's follow
        :param scrape_followers: If given, will only scrape user's followers
        :param max_follow_amount: If given, will only scrape follows (follows/followers apart) if amount is
                                  under max_follow_amount
        :raises UserDoesNotExist: if the username cannot be resolved
        :return:
        """
        scraper = scraper or UserFollowsScraper()
        if isinstance(user, str):
            user = scraper.scrape_user(user)

        if user is None:
            raise UserDoesNotExist()
        # Disable each side independently when it exceeds the ceiling.
        if scrape_followers and max_follow_amount is not None and user.followed_by_amount > max_follow_amount:
            self.logger.warning(
                "user is followed by too many people (followed by %d, max %d), skipping followers...",
                user.followed_by_amount, max_follow_amount)
            scrape_followers = False

        if scrape_follows and max_follow_amount is not None and user.follows_amount > max_follow_amount:
            self.logger.warning(
                "user follows too many people (follows %d, max %d), skipping follows...",
                user.follows_amount, max_follow_amount)
            scrape_follows = False

        mysql = MySQLHelper('mysql-insta-local')
        cursor = mysql.get_cursor()

        if user.is_private and not user.followed_by_viewer:
            self.logger.warning(
                "user is private and not followed by viewer. skipping scraping..."
            )
            # Nothing was scraped, but the user-info upsert below still
            # needs a timestamp.
            scrape_ts = datetime.now()
        else:
            follow_scrape = scraper.parse_user_follows([user], scrape_follows,
                                                       scrape_followers)
            # scrape_id is currently unused below.
            follows, scrape_id, scrape_ts = follow_scrape.follows, follow_scrape.scrape_id, follow_scrape.scrape_ts

            # Previously persisted state, used to diff out un/new follows.
            current_follows = self.get_current_follows(mysql, user.username,
                                                       cursor)
            analyzed, users = UserFollowsAnalyzer.analyze_follows(
                follows, only_mutual)

            # Update unfollowers
            # NOTE(review): this branch diffs *followers* but is guarded by
            # scrape_follows — it looks like it should check scrape_followers
            # instead. Confirm the intended guard.
            if scrape_follows:
                # No followers are scraped so they could all be considered as unfollowed
                dst_has_unfollowed = set(
                ) if current_follows is None else current_follows.followers.difference(
                    follows[0].followers)
                if len(dst_has_unfollowed) > 0:
                    self.update_agg_dst_unfollowers(mysql, cursor,
                                                    dst_has_unfollowed,
                                                    user.username, scrape_ts)

            # Users previously recorded as followed by src but missing now.
            src_has_unfollowed = set(
            ) if current_follows is None else current_follows.follows.difference(
                follows[0].follows)
            if len(src_has_unfollowed):
                self.handle_unfollowers(mysql, cursor, src_has_unfollowed,
                                        user, scrape_ts)

            # Insert new records
            if sum(len(x.follows) for x in follows) > 0:
                self.update_agg_followers(mysql, cursor, follows, analyzed,
                                          users, src_has_unfollowed, scrape_ts)
                # Follows seen now but absent from the stored state.
                new_follows = follows[
                    0].follows if current_follows is None else follows[
                        0].follows.difference(current_follows.follows)
                if len(new_follows) > 0:
                    self.insert_raw_followers(mysql, cursor, new_follows, user,
                                              scrape_ts)

        # Update user info
        self.persist_user(mysql, cursor, user, scrape_ts)

        mysql.commit()
        cursor.close()
        mysql.close()
        self.logger.info("Done UserFollowsScraper main")