@classmethod
def from_dict(cls, attr_dict: dict) -> 'Media':
    # Normalize the raw GraphQL media node into the attributes Media expects.
    attr_dict['id'] = int(attr_dict['id'])
    attr_dict['owner'] = InstaUser.from_dict(attr_dict['owner'])
    attr_dict['comments_amount'] = attr_dict['edge_media_to_comment']['count']
    attr_dict['likes_amount'] = attr_dict['edge_media_preview_like']['count']
    attr_dict['taken_at_timestamp'] = datetime.fromtimestamp(
        attr_dict['taken_at_timestamp'])
    attr_dict['taggees'] = [
        InstaUser.from_dict(obj['node']['user'])
        for obj in attr_dict['edge_media_to_tagged_user']['edges']
    ]
    return super().from_dict(attr_dict)
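# Hedged sketch of the input shape Media.from_dict expects: a raw GraphQL
# media node roughly like the dict below. The field values (and the exact
# keys inside 'owner') are illustrative assumptions, not captured API output.
#
#     node = {
#         'id': '1789000000000000000',
#         'shortcode': 'Bx0000000',
#         'display_url': 'https://example.com/img.jpg',
#         'owner': {'id': '123', 'username': 'some_user'},
#         'edge_media_to_comment': {'count': 4},
#         'edge_media_preview_like': {'count': 120},
#         'taken_at_timestamp': 1546300800,
#         'edge_media_to_tagged_user': {'edges': []},
#     }
#     media = Media.from_dict(node)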
def find_mutual_event_type(
        self,
        mysql: MySQLHelper,
        from_date: Optional[datetime],
        to_date: Optional[datetime],
        mutual_event_timeframe_days: int,
        days_back: Optional[int] = None) -> Set[MutualFollowEvent]:
    """Find pairs of users that have either both followed or both unfollowed each other.

    :param from_date: start of the time range to query
    :param to_date: end of the time range to query
    :param mutual_event_timeframe_days: maximum number of days between the two
        events for them to count as mutual. For example, a mutual unfollow only
        counts if both users unfollowed each other within, say, the past 2 days.
    :param days_back: if given, overrides from_date/to_date
    """
    # Both sides of the self-join need the same time filter.
    ts_filter_1, params_1 = self.build_ts_filter(
        from_date, to_date, days_back, ts_col="fe1.ts")
    ts_filter_2, params_2 = self.build_ts_filter(
        from_date, to_date, days_back, ts_col="fe2.ts")
    ts_filter = "({}) and ({})".format(ts_filter_1, ts_filter_2)
    params = params_1 + params_2
    # Self-join follow_events on reversed (src, dst) pairs with the same
    # event type, keeping only pairs whose events are close enough in time.
    sql = """
        select fe1.src_user_name as user_name_1,
               fe1.src_user_id as user_id_1,
               fe2.src_user_name as user_name_2,
               fe2.src_user_id as user_id_2,
               fe1.ts as user_1_event_ts,
               fe2.ts as user_2_event_ts,
               fe1.follow_type_id as follow_type_id,
               abs(timestampdiff(day, fe1.ts, fe2.ts)) as day_diff
        from follow_events fe1
        join follow_events fe2
          on fe1.dst_user_id = fe2.src_user_id
         and fe1.src_user_id = fe2.dst_user_id
         and fe1.follow_type_id = fe2.follow_type_id
        where {ts_filter}
          and abs(timestampdiff(day, fe1.ts, fe2.ts)) < ?
    """.format(ts_filter=ts_filter)
    params.append(mutual_event_timeframe_days)
    mutual_events_records = mysql.query(sql, params)
    events = set()
    for row in mutual_events_records:
        mutual_event = MutualFollowEvent(
            UserEvent(InstaUser(row.user_id_1, row.user_name_1),
                      row.user_1_event_ts, row.follow_type_id),
            UserEvent(InstaUser(row.user_id_2, row.user_name_2),
                      row.user_2_event_ts, row.follow_type_id))
        events.add(mutual_event)
    return events
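# Hedged usage sketch (assumes a configured MySQLHelper and an instance of
# the enclosing class; 'tracker' and 'mysql' are illustrative names):
#
#     mutual_events = tracker.find_mutual_event_type(
#         mysql, from_date=None, to_date=None,
#         mutual_event_timeframe_days=2, days_back=7)
#     for event in mutual_events:
#         print(event)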
def scrape_follow_type(self, driver: WebDriver, user_name: str,
                       query_hash: str) -> Set[InstaUser]:
    self.init_driver()
    self.logger.info('Scraping follow type')
    driver.get("http://www.instagram.com/{0}".format(user_name))
    # The response key depends on which relation we page through.
    data_key = ('edge_followed_by'
                if query_hash == QueryHashes.FOLLOWERS else 'edge_follow')
    validation_func = partial(self.is_valid_body, data_key=data_key)
    user_id = self.parse_user_id_from_profile(driver)
    request_url = self.create_url(query_hash, user_id)
    all_users = set()  # type: Set[InstaUser]
    while True:
        body = self.get_url_data(request_url, validation_func)
        try:
            data = json.loads(body)['data']['user'][data_key]
            end_cursor = (data['page_info']['end_cursor']
                          if data['page_info']['has_next_page'] else None)
            users = {
                InstaUser.from_dict(user['node'])
                for user in data['edges']
            }
        except Exception:
            self.logger.exception("driver body: {0}".format(
                driver.find_element_by_tag_name('body').text))
            raise
        all_users.update(users)
        self.logger.info('Scraped %d users so far', len(all_users))
        if end_cursor is None:
            break
        request_url = self.create_url(query_hash, user_id, end_cursor)
    self.logger.info('Done scraping users. found %d users', len(all_users))
    return all_users
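# Hedged usage sketch (assumes a live, logged-in WebDriver session;
# QueryHashes.FOLLOWERS comes from the surrounding module):
#
#     followers = scraper.scrape_follow_type(
#         driver, 'some_user', QueryHashes.FOLLOWERS)
#     print('%s has %d followers' % ('some_user', len(followers)))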
def scrape_media_likers(
        self,
        media: Media,
        scrape_id: Optional[str] = None,
        scrape_ts: Optional[datetime] = None,
        batch_size: Optional[int] = DEFAULT_BATCH_SIZE,
        max_likers_amount: Optional[int] = None) -> LikersScraping:
    """
    :param media: Media object whose likers we scrape
    :param scrape_id: identifier for this scrape; generated if not given
    :param scrape_ts: timestamp for this scrape; defaults to now
    :param batch_size: number of users to request per API request (capped at 50)
    :param max_likers_amount: stop once this many likers have been scraped.
        If not set, scrape everyone.
    :return: LikersScraping with the deduplicated likers, the scrape id
        and the scrape timestamp
    """
    self.init_driver()
    scrape_id = scrape_id or str(uuid.uuid4())
    scrape_ts = scrape_ts or datetime.now()
    self.logger.info('Scraping likers (scrape id %s) for media %s at %s',
                     scrape_id, media.id, media.display_url)
    self.to_home_page()
    request_url = self.create_url(QueryHashes.MEDIA_LIKES, media.shortcode,
                                  batch_size=batch_size)
    all_users = {}  # type: Dict[int, InstaUser]
    while True:
        body = self.get_url_data(request_url, self._is_failed_response,
                                 MAX_ALLOWED_RETRIES,
                                 RATE_LIMIT_REACHED_WAIT_SECONDS)
        self.logger.info("Parsing likers from page...")
        try:
            data = json.loads(body)['data']['shortcode_media']['edge_liked_by']
        except Exception:
            self.logger.exception("Failed loading json. body: %s", body)
            raise
        likers_objects = [
            media_object['node'] for media_object in data['edges']
        ]
        page_info = data['page_info']
        # Deduplicate likers by user id across pages.
        for item in likers_objects:
            liker = InstaUser.from_dict(item)
            if liker.user_id not in all_users:
                all_users[liker.user_id] = liker
        self.logger.info("Parsed %d liker objects", len(likers_objects))
        if not page_info['has_next_page'] or (
                max_likers_amount is not None
                and len(all_users) >= max_likers_amount):
            break
        next_cursor = page_info['end_cursor']
        request_url = self.create_url(QueryHashes.MEDIA_LIKES,
                                      media.shortcode, next_cursor,
                                      batch_size)
    return LikersScraping(list(all_users.values()), scrape_id, scrape_ts)
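# Hedged usage sketch (assumes a logged-in scraper and a Media object, e.g.
# one built via Media.from_dict; the attribute names on the returned
# LikersScraping are not shown here, so none are accessed):
#
#     likers_scraping = scraper.scrape_media_likers(media, max_likers_amount=500)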
def get_current_follows(
        self,
        mysql: MySQLHelper,
        user: str,
        cursor: Optional[Cursor] = None) -> Optional[UserFollows]:
    res = mysql.query(
        "select * from {0} where src_user_name = ?".format(
            self.FOLLOWS_TABLE), [user], cursor)
    if len(res) == 0:
        return None
    followers = set()
    follows = set()
    for r in res:
        if r.dst_follows:
            followers.add(InstaUser(r.dst_user_id, r.dst_user_name))
        if r.src_follows:
            follows.add(InstaUser(r.dst_user_id, r.dst_user_name))
    return UserFollows(
        InstaUser(res[0].src_user_id, res[0].src_user_name),
        followers, follows)
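# Hedged usage sketch: with the returned UserFollows, mutual and one-way
# relations fall out of plain set operations (the attribute names
# 'followers' and 'follows' are assumed from UserFollows.from_dict below):
#
#     uf = repo.get_current_follows(mysql, 'some_user')
#     if uf is not None:
#         mutual = uf.followers & uf.follows   # follow each other
#         fans = uf.followers - uf.follows     # follow the user, not followed back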
def get_users(self, group_name: str, mysql: MySQLHelper,
              limit: Optional[int] = None) -> List[InstaUser]:
    """Get users whose media objects should be scraped.

    Ordered by ascending last_scrape_ts, so users we have not scraped
    recently are handled first.
    """
    self.logger.debug("Getting users for group %s", group_name)
    query = self.GET_USERS_QUERY
    if limit is not None:
        query += " limit {}".format(limit)
    params = [group_name]
    res = mysql.query(query, params)
    users = [InstaUser(row.user_id, row.user_name) for row in res]
    self.logger.debug("Done querying users")
    return users
def scrape_user(self, user_name: str,
                driver: Optional[WebDriver] = None,
                max_retries: int = 100,
                wait_seconds: int = 150) -> Optional[InstaUser]:
    """Scrape InstaUser data. Returns None if the user doesn't exist."""
    self.logger.info('Scraping user data for %s', user_name)
    if driver is None:
        self.init_driver()
        driver = self.driver
    # Reject short bodies mentioning 'minutes' -- these are likely
    # rate-limit pages rather than profile JSON.
    body = self.get_url_data(
        '{0}/{1}/?__a=1'.format(self.INSTA_URL, user_name),
        lambda x: not ('minutes' in x and len(x) <= 1000),
        max_retries, wait_seconds)
    try:
        user_data = json.loads(body)['graphql']['user']
        return InstaUser.from_dict(user_data)
    except (KeyError, JSONDecodeError):
        # Missing user key or non-JSON body: treat as a nonexistent user.
        return None
    except Exception:
        self.logger.error("Failed parsing user data: %s",
                          driver.find_element_by_tag_name('body').text)
        raise
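# Hedged usage sketch (assumes an initialized scraper):
#
#     user = scraper.scrape_user('some_user')
#     if user is None:
#         print('some_user does not exist (or the response was unparseable)')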
@classmethod
def from_dict_and_rank(cls, attr_dict: dict, rank: int) -> 'RankedUser':
    user = InstaUser.from_dict(attr_dict)
    return RankedUser.from_user_and_rank(user, rank)
@classmethod
def from_dict(cls, attr_dict: dict) -> 'UserFollows':
    user = InstaUser.from_dict(attr_dict['user'])
    followers = {InstaUser.from_dict(x) for x in attr_dict['followers']}
    follows = {InstaUser.from_dict(x) for x in attr_dict['follows']}
    return UserFollows(user, followers, follows)