예제 #1
0
 def from_dict(cls, attr_dict: dict) -> 'Media':
     """Build a ``Media`` from a raw attribute dictionary.

     The raw values are normalised (id cast to ``int``, nested user
     dictionaries turned into ``InstaUser`` objects, counters flattened,
     timestamp converted to ``datetime``) and the result is handed to the
     parent class' ``from_dict``.

     :param attr_dict: raw media attributes. Must contain the keys ``id``,
                       ``owner``, ``edge_media_to_comment``,
                       ``edge_media_preview_like``, ``taken_at_timestamp``
                       and ``edge_media_to_tagged_user``. It is not modified.
     :return: whatever the parent class' ``from_dict`` builds from the
              normalised attributes
     :raises KeyError: if one of the required keys is missing
     """
     # Normalise a shallow copy instead of the caller's dictionary: writing
     # the converted values back in place would replace the caller's raw data
     # and make a second call on the same dictionary operate on
     # already-converted objects.
     attrs = dict(attr_dict)
     attrs['id'] = int(attrs['id'])
     attrs['owner'] = InstaUser.from_dict(attrs['owner'])
     attrs['comments_amount'] = attrs['edge_media_to_comment']['count']
     attrs['likes_amount'] = attrs['edge_media_preview_like']['count']
     # NOTE(review): fromtimestamp() without a tz yields a naive local-time
     # datetime - confirm that is what the rest of the project expects.
     attrs['taken_at_timestamp'] = datetime.fromtimestamp(attrs['taken_at_timestamp'])
     attrs['taggees'] = [InstaUser.from_dict(edge['node']['user'])
                         for edge in attrs['edge_media_to_tagged_user']['edges']]
     return super().from_dict(attrs)
    def find_mutual_event_type(
            self,
            mysql: MySQLHelper,
            from_date: Optional[datetime],
            to_date: Optional[datetime],
            mutual_event_timeframe_days: int,
            days_back: Optional[int] = None) -> Set[MutualFollowEvent]:
        """Find pairs of users with the same follow event type towards each other.

        ``follow_events`` is joined with itself so that the source of one
        event is the destination of the other (and vice versa) and both events
        share the same ``follow_type_id``.

        :param mysql: helper used to run the query
        :param from_date: from_date to query from
        :param to_date: to_date to query
        :param mutual_event_timeframe_days: the day difference between the two
                                            events must be strictly smaller
                                            than this value for the pair to
                                            be returned
        :param days_back: If given, ignore from_date/to_date
        :return: a set with one ``MutualFollowEvent`` per returned row
        """
        # The same time window is applied to both sides of the self-join.
        first_filter, first_params = self.build_ts_filter(
            from_date, to_date, days_back, ts_col="fe1.ts")
        second_filter, second_params = self.build_ts_filter(
            from_date, to_date, days_back, ts_col="fe2.ts")

        # Placeholder order: first filter, second filter, then the day limit.
        query_params = first_params + second_params
        query_params.append(mutual_event_timeframe_days)

        combined_filter = "({}) and ({})".format(first_filter, second_filter)
        query = """
                select fe1.src_user_name as user_name_1,
                       fe1.src_user_id as user_id_1,
                       fe2.src_user_name as user_name_2,
                       fe2.src_user_id as user_id_2,
                       fe1.ts as user_1_event_ts,
                       fe2.ts as user_2_event_ts,
                       fe1.follow_type_id as follow_type_id,
                       abs(timestampdiff(day, fe1.ts, fe2.ts)) as day_diff
                from follow_events fe1
                         join follow_events fe2 on fe1.dst_user_id = fe2.src_user_id
                    and fe1.src_user_id = fe2.dst_user_id and fe1.follow_type_id = fe2.follow_type_id
                where {ts_filter} and abs(timestampdiff(day, fe1.ts, fe2.ts)) < ?
        """.format(ts_filter=combined_filter)

        def to_mutual_event(row):
            # Both sides carry the same follow type: the join enforces it.
            first_side = UserEvent(
                InstaUser(row.user_id_1, row.user_name_1),
                row.user_1_event_ts, row.follow_type_id)
            second_side = UserEvent(
                InstaUser(row.user_id_2, row.user_name_2),
                row.user_2_event_ts, row.follow_type_id)
            return MutualFollowEvent(first_side, second_side)

        return {to_mutual_event(row) for row in mysql.query(query, query_params)}
예제 #3
0
 def scrape_follow_type(self, driver: WebDriver, user_name: str,
                        query_hash: str) -> Set[InstaUser]:
     """Collect every user listed under one follow relation of ``user_name``.

     Pages are requested one after another until the response reports that
     there is no next page.

     :param driver: web driver used to open the profile page and, on a
                    parsing failure, to dump the page body into the log
     :param user_name: profile whose relation is scraped
     :param query_hash: ``QueryHashes.FOLLOWERS`` selects the
                        ``edge_followed_by`` data, any other value selects
                        ``edge_follow``
     :return: the set of users gathered from all pages
     """
     self.init_driver()
     self.logger.info('Scraping follow type')
     driver.get("http://www.instagram.com/{0}".format(user_name))

     if query_hash == QueryHashes.FOLLOWERS:
         data_key = 'edge_followed_by'
     else:
         data_key = 'edge_follow'
     body_validator = partial(self.is_valid_body, data_key=data_key)

     user_id = self.parse_user_id_from_profile(driver)
     page_url = self.create_url(query_hash, user_id)
     collected = set()  # type: Set[InstaUser]
     while True:
         body = self.get_url_data(page_url, body_validator)
         try:
             page = json.loads(body)['data']['user'][data_key]
             page_info = page['page_info']
             if page_info['has_next_page']:
                 end_cursor = page_info['end_cursor']
             else:
                 end_cursor = None
             page_users = {InstaUser.from_dict(edge['node']) for edge in page['edges']}
         except Exception as e:
             # Dump what the browser currently shows before propagating.
             page_text = driver.find_element_by_tag_name('body').text
             self.logger.exception("driver body: {0}".format(page_text))
             raise e

         self.logger.info('Currently scraped %d users', len(page_users))
         collected |= page_users
         if end_cursor is None:
             break
         page_url = self.create_url(query_hash, user_id, end_cursor)

     self.logger.info('Done scraping users. found %d users', len(collected))
     return collected
예제 #4
0
    def scrape_media_likers(
            self,
            media: Media,
            scrape_id: Optional[str] = None,
            scrape_ts: Optional[datetime] = None,
            batch_size: Optional[int] = DEFAULT_BATCH_SIZE,
            max_likers_amount: Optional[int] = None) -> LikersScraping:
        """Scrape the users who liked a media object, page by page.

        :param media: Media object for which we scrape its likers
        :param scrape_id: Identifier of this scrape; a random UUID4 string is
                          generated when not given
        :param scrape_ts: Timestamp of this scrape; ``datetime.now()`` is used
                          when not given
        :param batch_size: Amount of users to request in every API request (limits to 50)
        :param max_likers_amount: Once this amount has been scraped, finish.
                                  The check runs after a whole page has been
                                  merged, so the result may hold more likers
                                  than this. If not set, will scrape everyone
        :return: LikersScraping holding the unique likers (one per user_id,
                 first occurrence wins), the scrape id and the scrape timestamp
        """
        self.init_driver()
        scrape_id = scrape_id or str(uuid.uuid4())
        scrape_ts = scrape_ts or datetime.now()
        self.logger.info('Scraping likers (scrape id %s) for media %s at %s',
                         scrape_id, media.id, media.display_url)
        self.to_home_page()
        request_url = self.create_url(QueryHashes.MEDIA_LIKES,
                                      media.shortcode,
                                      batch_size=batch_size)
        all_users = {}  # type: Dict[int, InstaUser]
        while True:
            body = self.get_url_data(request_url, self._is_failed_response,
                                     MAX_ALLOWED_RETRIES,
                                     RATE_LIMIT_REACHED_WAIT_SECONDS)
            self.logger.info("Parsing likers from page...")

            try:
                data = json.loads(
                    body)['data']['shortcode_media']['edge_liked_by']
            except Exception:
                self.logger.exception("Failed loading json. body: %s", body)
                # Bare raise re-raises the active exception unchanged.
                raise
            likers_objects = [edge['node'] for edge in data['edges']]
            page_info = data['page_info']

            for item in likers_objects:
                liker = InstaUser.from_dict(item)
                # Keep the first object seen for every user id.
                all_users.setdefault(liker.user_id, liker)
            self.logger.info("Parsed %d liker objects", len(likers_objects))

            # Truthiness check (not ``is False``): a null/absent-like value
            # must end the pagination too, otherwise the loop would carry on
            # with whatever end cursor the response reports. This matches how
            # scrape_follow_type treats the same field.
            if not page_info['has_next_page'] or (
                    max_likers_amount is not None
                    and len(all_users) >= max_likers_amount):
                break
            request_url = self.create_url(QueryHashes.MEDIA_LIKES,
                                          media.shortcode,
                                          page_info['end_cursor'],
                                          batch_size)

        return LikersScraping(list(all_users.values()), scrape_id, scrape_ts)
예제 #5
0
 def get_current_follows(
         self,
         mysql: MySQLHelper,
         user: str,
         cursor: Optional[Cursor] = None) -> Optional[UserFollows]:
     """Load the stored follow relations of ``user``.

     :param mysql: helper used to run the query
     :param user: value matched against the ``src_user_name`` column
     :param cursor: optional cursor forwarded to ``mysql.query``
     :return: ``None`` when no row matches; otherwise a ``UserFollows`` built
              from the first row's source user, the destination users whose
              row has ``dst_follows`` set (followers) and those whose row has
              ``src_follows`` set (follows)
     """
     query = "select * from {0} where src_user_name = ?".format(
         self.FOLLOWS_TABLE)
     rows = mysql.query(query, [user], cursor)
     if len(rows) == 0:
         return None

     followers = {
         InstaUser(row.dst_user_id, row.dst_user_name)
         for row in rows if row.dst_follows
     }
     follows = {
         InstaUser(row.dst_user_id, row.dst_user_name)
         for row in rows if row.src_follows
     }

     first = rows[0]
     # NOTE(review): the user name is passed twice (second and third
     # argument) - confirm that the third argument is meant to be the name.
     source_user = InstaUser(first.src_user_id, first.src_user_name,
                             first.src_user_name)
     return UserFollows(source_user, followers, follows)
 def get_users(self,
               group_name: str,
               mysql: MySQLHelper,
               limit: Optional[int] = None) -> List[InstaUser]:
     """Fetch the users of a group whose media objects should be scraped.

     Runs ``GET_USERS_QUERY`` with ``group_name`` as its only parameter.
     According to the original note the query orders by ascending
     last_scrape_ts, so users that were not scraped lately come first
     (the query text is not visible here).

     :param group_name: group whose users are requested
     :param mysql: helper used to run the query
     :param limit: when given, `` limit <value>`` is appended to the query
     :return: one ``InstaUser`` per returned row
     """
     self.logger.debug("Getting users for group %s", group_name)
     limit_clause = "" if limit is None else " limit {}".format(limit)
     rows = mysql.query(self.GET_USERS_QUERY + limit_clause, [group_name])
     found = [InstaUser(record.user_id, record.user_name) for record in rows]
     self.logger.debug("Done querying users")
     return found
예제 #7
0
 def scrape_user(self, user_name: str, driver: Optional[WebDriver] = None,
                 max_retries: int = 100, wait_seconds: int = 150) -> Optional[InstaUser]:
     """Scrape the data of a single user.

     :param user_name: profile name inserted into the requested URL
     :param driver: driver used to dump the page body when parsing fails
                    unexpectedly; ``self.driver`` (after ``init_driver``) is
                    used when not given
     :param max_retries: forwarded to ``get_url_data``
     :param wait_seconds: forwarded to ``get_url_data``
     :return: the parsed ``InstaUser``; ``None`` when the body is not valid
              JSON or an expected key is missing (per the original note:
              the user doesn't exist)
     """
     self.logger.info('Scraping user data for %s', user_name)
     if driver is None:
         self.init_driver()
         driver = self.driver

     def body_check(text):
         # False only for a body of at most 1000 characters that contains
         # the word 'minutes'.
         return 'minutes' not in text or len(text) > 1000

     profile_url = '{0}/{1}/?__a=1'.format(self.INSTA_URL, user_name)
     body = self.get_url_data(profile_url, body_check, max_retries, wait_seconds)
     try:
         user_data = json.loads(body)['graphql']['user']
         return InstaUser.from_dict(user_data)
     except (KeyError, JSONDecodeError):
         return None
     except Exception as e:
         page_text = driver.find_element_by_tag_name('body').text
         self.logger.error("Failed parsing following: %s", page_text)
         raise e
 def from_dict_and_rank(cls, attr_dict: dict, rank: int):
     """Parse ``attr_dict`` into an ``InstaUser`` and attach ``rank`` to it.

     :param attr_dict: raw user attributes accepted by ``InstaUser.from_dict``
     :param rank: rank passed on to ``RankedUser.from_user_and_rank``
     :return: the result of ``RankedUser.from_user_and_rank``
     """
     return RankedUser.from_user_and_rank(InstaUser.from_dict(attr_dict), rank)
예제 #9
0
 def from_dict(cls, attr_dict: dict) -> 'UserFollows':
     """Build a ``UserFollows`` from its dictionary form.

     :param attr_dict: must hold ``user`` (one raw user dictionary) plus
                       ``followers`` and ``follows`` (iterables of raw user
                       dictionaries)
     :return: ``UserFollows`` with the parsed user and the two sets of
              parsed users
     """
     parse_user = InstaUser.from_dict
     return UserFollows(
         parse_user(attr_dict['user']),
         set(map(parse_user, attr_dict['followers'])),
         set(map(parse_user, attr_dict['follows'])),
     )