def logout(self):
    """Log out of the current session, if currently logged in.

    Sends a GET to the logout endpoint; success is reported once the
    'sessionid' cookie is gone from the session's cookie jar.
    """
    if not self.logined:
        return
    send_request(API_LOGOUT,
                 method='get',
                 session=self.session,
                 headers=self.logined_headers,
                 proxy=PROXY_GLOBAL)
    remaining = self.session.cookies.get_dict()
    if not remaining.get('sessionid'):
        logger.info('Logout successfully~')
def wrapper(self, username=None, **kwargs):
    """Decorator body for logged-in operations aimed at a user or a tag.

    Relies on closure variables from the enclosing decorator factory:
    ``opt`` (operation key into APIS), ``mode`` ('user' or 'tag'),
    ``_data`` (default POST payload) and ``func`` (the wrapped method).

    Args:
        username: target user name or tag text, depending on ``mode``.
        **kwargs: operation-specific options (e.g. ``default``/``keywords``
            for the 'set_filter' operation).

    Returns:
        Whatever the wrapped ``func`` returns; the raw JSON response is
        exposed to it through ``self.__opt__`` for the duration of the call.
    """
    API = APIS[opt]
    # NOTE(review): self.logined_headers is mutated in place (no copy);
    # presumably intentional so the csrf token persists across calls -- confirm.
    headers = self.logined_headers
    cookies = self.session.cookies.get_dict()
    headers['x-csrftoken'] = cookies['csrftoken']
    data = _data
    if username and isinstance(username, str):
        if mode == 'user':
            target = self.get_user(username)
            url = API.format(userid=target.Id)
        elif mode == 'tag':
            url = API.format(tag=username)
        # NOTE(review): any other mode leaves `url` unbound and would raise
        # UnboundLocalError below -- verify mode is always 'user' or 'tag'.
    else:
        url = API
    if opt == 'set_filter':
        if kwargs.get('default'):
            data = {'config_value': '1'}
            logger.debug('set comments filter to default.')
        elif kwargs.get('keywords'):
            url = API_SET_COMMENT_FILTER_kEYWORDS
            k = kwargs['keywords']
            # The endpoint expects a comma-separated list; a single keyword
            # still needs a trailing comma.
            _ = ','.join([str(i) for i in k]) if len(k) > 1 else str(k[0]) + ','
            data = {'keywords': _}
            logger.debug(f'set comments filter keywords to {k}.')
            # Disable the default filter before applying custom keywords.
            send_request(API_SET_COMMENT_FILTER, session=self.session,
                         headers=headers, method='post',
                         data={'config_value': '0'}, proxy=PROXY_GLOBAL)
        else:
            data = {'config_value': '0'}
            logger.debug('set comments filter keywords not in default mode.')
    response = send_request(url, session=self.session, headers=headers,
                            method='post', data=data, proxy=PROXY_GLOBAL)
    res = response.json()
    self.__opt__ = res
    if username is not None:
        ret = func(self, username)
    else:
        ret = func(self, **kwargs)
    self.__opt__ = None
    return ret
def get_homepage(self, reget=False):
    """Return the homepage HTML, downloading and caching it on first use.

    Args:
        reget: when True, bypass the cache and fetch a fresh copy.

    Returns:
        str: the homepage response body.
    """
    if self._homepage and not reget:
        return self._homepage
    response = send_request(self.url, headers=COMMON_HEADERS, proxy=PROXY_GLOBAL)
    self._homepage = response.text
    return self._homepage
def get_page_comments(self, shortcode, delay=DELAY, count=-1, save=False, path=None, tname=None):
    """Crawl comments of the media page identified by ``shortcode``.

    Args:
        shortcode: media shortcode of the target page.
        delay: seconds to wait between paginated requests.
        count: maximum number of comments to collect; -1 means all.
        save: when True, persist each comment edge via ``self.db.save``.
        path: unused here; kept for signature compatibility.
        tname: table name passed through to ``self.db.save``.

    Returns:
        list: raw comment edges as returned by the GraphQL API.
    """
    results = []
    _count = 0
    page = self.get_page_info(shortcode)
    comment_card = page['graphql']['shortcode_media']['edge_media_to_comment']
    total = comment_card['count']
    page_info = comment_card['page_info']
    top_comments = comment_card['edges']
    end_cursor = page_info['end_cursor']
    has_next = page_info['has_next_page']
    # Work on a copy so the module-level COMMON_HEADERS template is not
    # polluted with per-call values ('x-ig-app-id', 'referer',
    # 'x-instagram-gis') -- matches get_media_likers/get_comment_likers.
    headers = copy.deepcopy(COMMON_HEADERS)
    headers['x-ig-app-id'] = self.app_id
    headers['referer'] = API_PICTURE_PAGE.format(shortcode=shortcode)
    _check = count if count > 0 else total
    for i in top_comments:
        if save:
            self.db.save(i, tname=tname)
        results.append(i)
        _count += 1
        if (_count >= count or _count >= total) and (count > 0):
            logger.info(f'[Done]Get crawled comments of page:"{shortcode}":{len(results)}.[Total({total})]')
            return results
    if not has_next:
        logger.info(f'[Done]Get crawled comments of page:"{shortcode}":{len(results)}.[Total({total})]')
        return results
    while 1:
        if not end_cursor:
            logger.info(f'[Done]Get crawled comments of page:"{shortcode}":{len(results)}.[Total({total})]')
            break
        params = copy.deepcopy(COMMENTS_PARAMS)
        params['query_hash'] = self.comment_hash
        # '$' / '%' are placeholders for cursor / shortcode in the template.
        params['variables'] = params['variables'].replace('$', end_cursor).replace('%', shortcode)
        md5ed = md5(self.rhx_gis + ":" + params['variables'])
        headers['x-instagram-gis'] = md5ed
        response = send_request(API_USER_POSTS, params=params, headers=headers,
                                delay=delay, proxy=PROXY_GLOBAL, json=True)
        json_data = response.json()
        data = json_data['data']['shortcode_media']['edge_media_to_comment']['edges']
        page_info = json_data['data']['shortcode_media']['edge_media_to_comment']['page_info']
        for i in data:
            if save:
                self.db.save(i, tname=tname)
            results.append(i)
            _count += 1
            if (_count >= count or _count >= total) and (count > 0):
                logger.info(f'[Done]Get crawled comments of page:"{shortcode}"'
                            f':{len(results)}.[Total({total})]')
                return results
        logger.info(f'Current crawled comments of page "{shortcode}"'
                    f':{len(results)}.[{round(len(results)/_check,4)*100 if _check else 0}%]')
        end_cursor = page_info['end_cursor']
        if not page_info['has_next_page']:
            logger.info(f'[Done]Get crawled comments of page:"{shortcode}"'
                        f':{len(results)}.[Total({total})]')
            break
    return results
def get_posts(self, delay=DELAY, count=-1, save=False, path=None, tname=None):
    """Crawl timeline posts of this user via the GraphQL pagination API.

    Args:
        delay: seconds to wait between paginated requests.
        count: maximum number of posts to collect; -1 means all.
        save: when True, persist each post edge via ``self.db.save``.
        path: unused here; kept for signature compatibility.
        tname: table name passed through to ``self.db.save``.

    Returns:
        list: raw post edges as returned by the API.
    """
    _count = 0
    results = []
    _check = count if count > 0 else self.posts_count
    top_posts_card = self.info['edge_owner_to_timeline_media']
    top_posts = top_posts_card['edges']
    end_cursor = top_posts_card['page_info']['end_cursor']
    posts_query_id = self.queryIds[2]
    # Copy the template so the shared COMMON_HEADERS dict is not mutated
    # with per-call values ('x-ig-app-id', 'x-instagram-gis').
    headers = copy.deepcopy(COMMON_HEADERS)
    headers['x-ig-app-id'] = self.app_id
    for i in top_posts:
        if save:
            self.db.save(i, tname=tname)
        _count += 1
        results.append(i)
        if (_count >= count or _count >= self.posts_count) and (count > 0):
            logger.info(f'[Done]The length of crawled data of user "{self.name}"'
                        f':{len(results)}.[Total({self.posts_count})]')
            return results
    logger.info(f'Total posts of user "{self.name}":{self.posts_count}.')
    while 1:
        if not end_cursor:
            logger.info(f'[Done]The length of crawled data of user "{self.name}"'
                        f':{len(results)}.[Total({self.posts_count})]')
            break
        params = {}
        params['query_hash'] = posts_query_id
        params['variables'] = (r'{"id":"' + self.Id + '","first":"'
                               + str(USER_POSTS_MAX) + '","after":"' + end_cursor + '"}')
        md5ed = md5(self.rhx_gis + ":" + params['variables'])
        headers['x-instagram-gis'] = md5ed
        response = send_request(API_USER_POSTS, params=params, headers=headers,
                                delay=delay, json=True, proxy=PROXY_GLOBAL)
        json_data = response.json()
        data = json_data['data']['user']['edge_owner_to_timeline_media']['edges']
        page_info = json_data['data']['user']['edge_owner_to_timeline_media']['page_info']
        for i in data:
            if save:
                self.db.save(i, tname=tname)
            results.append(i)
            _count += 1
            if (_count >= count or _count >= self.posts_count) and (count > 0):
                logger.info(f'[Done]The length of crawled data of user "{self.name}"'
                            f':{len(results)}.[Total({self.posts_count})]')
                return results
        logger.info(f'Current amount of posts of user "{self.name}"'
                    f':{len(results)}.[{round(len(results)/_check,4)*100 if _check else 0}%]')
        end_cursor = page_info['end_cursor']
        if not page_info['has_next_page']:
            logger.info(f'[Done]The length of crawled data of user "{self.name}"'
                        f':{len(results)}.[Total({self.posts_count})]')
            break
    return results
def get_channel_posts(self, delay=DELAY, count=-1, save=False, path=None, tname=None):
    """Crawl channel (video timeline) posts of this user.

    Args:
        delay: seconds to wait between paginated requests.
        count: maximum number of posts to collect; -1 means all.
        save: when True, persist each post edge via ``self.db.save``.
        path: unused here; kept for signature compatibility.
        tname: table name passed through to ``self.db.save``.

    Returns:
        list: raw 'edge_felix_video_timeline' edges.
    """
    _count = 0
    results = []
    _check = count if count > 0 else self.channel_posts_count
    top_posts_card = self.info['edge_felix_video_timeline']
    top_posts = top_posts_card['edges']
    end_cursor = top_posts_card['page_info']['end_cursor']
    # Copy the template so the shared COMMON_HEADERS dict is not mutated
    # with per-call values ('x-ig-app-id', 'x-instagram-gis').
    headers = copy.deepcopy(COMMON_HEADERS)
    headers['x-ig-app-id'] = self.app_id
    for i in top_posts:
        if save:
            self.db.save(i, tname=tname)
        _count += 1
        results.append(i)
        if (_count >= count or _count >= self.channel_posts_count) and (count > 0):
            logger.info(f'[Done]The amount of crawled channel posts data of user "{self.name}":{len(results)}.'
                        f'[Total({self.channel_posts_count})]')
            return results
    logger.info(f'Total channel posts of user "{self.name}":{self.channel_posts_count}.')
    while 1:
        if not end_cursor:
            logger.info(f'[Done]The amount of crawled channel posts data of user "{self.name}":{len(results)}.'
                        f'[Total({self.channel_posts_count})]')
            break
        params = copy.deepcopy(CHANNEL_PARAMS)
        # '%' / '$' are placeholders for user id / cursor in the template.
        params['variables'] = params['variables'].replace('%', self.Id).replace('$', end_cursor)
        params['query_hash'] = self.channel_hash
        md5ed = md5(self.rhx_gis + ":" + params['variables'])
        headers['x-instagram-gis'] = md5ed
        response = send_request(API_USER_POSTS, session=self.instagram.session,
                                params=params, headers=headers, delay=delay,
                                json=True, proxy=PROXY_GLOBAL)
        json_data = response.json()
        posts = json_data['data']['user']['edge_felix_video_timeline']['edges']
        page_info = json_data['data']['user']['edge_felix_video_timeline']['page_info']
        has_next_page = page_info['has_next_page']
        end_cursor = page_info['end_cursor']
        for i in posts:
            if save:
                self.db.save(i, tname=tname)
            results.append(i)
            _count += 1
            if (_count >= count or _count >= self.channel_posts_count) and (count > 0):
                logger.info(f'[Done]The amount of crawled channel posts data of user "{self.name}"'
                            f':{len(results)}.[Total({self.channel_posts_count})]')
                return results
        logger.info(f'Current amount of crawled channel posts data of user "{self.name}"'
                    f':{len(results)}.[{round(len(results)/_check,4)*100 if _check else 0}%]')
        if not has_next_page:
            logger.info(f'[Done]The amount of crawled channel posts data of user "{self.name}"'
                        f':{len(results)}.[Total({self.channel_posts_count})]')
            break
    return results
def get_query_hashs(self):
    """Collect all known GraphQL query hashes from the site's JS bundle.

    Downloads the query JS file referenced by the homepage and scrapes
    the hashes for fans/follow, picture page, posts, hashtag and comment
    liker queries, caching the combined list on the instance.

    Returns:
        list: every extracted query hash, in extraction order.
    """
    script_url = HOST + from_pattern(self.homepage, PATTERN_QUERY_JS)
    script = send_request(script_url, proxy=PROXY_GLOBAL).text
    hashes = list(from_pattern(script, PATTERN_FANS_FOLLOW))
    hashes.extend(list(from_pattern(script, PATTERN_PICTURE_PAGE)))
    hashes.extend(from_pattern(script, PATTERN_POSTS, allget=True))
    hashes.extend(from_pattern(script, PATTERN_HASHTAG, allget=True))
    hashes.extend(from_pattern(script, PATTERN_LIKER, allget=True))
    self._queryHashs = hashes
    return self._queryHashs
def to_binary(self):
    """Return the picture bytes referenced by ``self.string``.

    ``self.string`` may be a path to an existing file on disk or an
    HTTP(S) URL; files are read directly, URLs are downloaded.

    Raises:
        TypeError: when the string is neither an existing file nor a URL.
    """
    source = self.string
    if os.path.isfile(source):
        with open(source, 'rb') as fp:
            return fp.read()
    if w3lib.url.is_url(source):
        return send_request(source).content
    raise TypeError(f'Expected a url or disk path,got "{source}".')
def get_media_likers(self, short_code, save=False, count=-1, delay=DELAY, tname=None, path=None):
    """Crawl the users who liked the media identified by ``short_code``.

    Args:
        short_code: media shortcode.
        save: when True, persist each liker edge via ``self.db.save``.
        count: maximum number of likers to collect; -1 means all.
        delay: seconds to wait between paginated requests.
        tname: table name passed through to ``self.db.save``.
        path: unused here; kept for signature compatibility.

    Returns:
        list: raw 'edge_liked_by' edges.
    """
    _count = 0
    results = []
    end_cursor = ''
    total = 0
    _check = 0
    while 1:
        params = copy.deepcopy(MEDIA_LIKER_PARAMS)
        headers = copy.deepcopy(COMMON_HEADERS)
        params['query_hash'] = self.liker_hash
        params['variables'] = params['variables'].replace('$', short_code).replace('%', end_cursor)
        md5ed = md5(self.rhx_gis + ":" + params['variables'])
        headers['x-instagram-gis'] = md5ed
        response = send_request(API_USER_POSTS, json=True, delay=delay,
                                headers=headers, params=params)
        data = response.json()
        liker_card = data['data']['shortcode_media']['edge_liked_by']
        if _count == 0:
            total = liker_card['count']
            _check = count if count > 0 else total
            logger.info(f'Total amount of users who liked media({short_code}) : {total}')
        likers = liker_card['edges']
        page_info = liker_card['page_info']
        end_cursor = page_info['end_cursor']
        has_next_page = page_info['has_next_page']
        # Guard the percentage against a media with zero likes (sibling
        # crawlers use the same `if _check else 0` guard).
        logger.info(f'Current grabbed users who liked media({short_code}):{len(likers)}.'
                    f'[{round(len(results)/_check,4)*100 if _check else 0}%]')
        for i in likers:
            # Save BEFORE the count check so the edge that satisfies the
            # requested count is persisted too (consistent with get_posts
            # and get_page_comments, which save first).
            if save:
                self.db.save(i, tname=tname)
            _count += 1
            results.append(i)
            if (_count >= count or _count >= total) and (count > 0):
                logger.info(f'[Done]Total crawled users who liked media({short_code}) :{len(results)}')
                return results
        if not has_next_page:
            logger.info(f'[Done]Total crawled users who liked media({short_code}) :{len(results)}')
            return results
def get_tagged_posts(self, delay=DELAY, count=-1, save=False, path=None, tname=None):
    """Crawl posts in which this user is tagged ('photos of you').

    Args:
        delay: seconds to wait between paginated requests.
        count: maximum number of posts to collect; -1 means all.
        save: when True, persist each post edge via ``self.db.save``.
        path: unused here; kept for signature compatibility.
        tname: table name passed through to ``self.db.save``.

    Returns:
        list: raw 'edge_user_to_photos_of_you' edges.
    """
    _count = 0
    results = []
    end_cursor = ''
    # Copy the template once, outside the loop: avoids mutating the shared
    # module-level COMMON_HEADERS dict and re-setting the constant app id
    # every iteration. 'x-instagram-gis' is overwritten per page below.
    headers = copy.deepcopy(COMMON_HEADERS)
    headers['x-ig-app-id'] = self.app_id
    while 1:
        params = copy.deepcopy(CHANNEL_PARAMS)
        params['variables'] = params['variables'].replace('%', self.Id).replace('$', end_cursor)
        params['query_hash'] = self.marked_id
        md5ed = md5(self.rhx_gis + ":" + params['variables'])
        headers['x-instagram-gis'] = md5ed
        response = send_request(API_USER_POSTS, params=params, headers=headers,
                                delay=delay, json=True, proxy=PROXY_GLOBAL)
        json_data = response.json()
        posts = json_data['data']['user']['edge_user_to_photos_of_you']['edges']
        page_info = json_data['data']['user']['edge_user_to_photos_of_you']['page_info']
        has_next_page = page_info['has_next_page']
        end_cursor = page_info['end_cursor']
        for i in posts:
            if save:
                self.db.save(i, tname=tname)
            results.append(i)
            _count += 1
            if _count >= count and count > 0:
                logger.info(f'[Done]The amount of crawled tagged posts by user "{self.name}":{len(results)}.')
                return results
        logger.info(f'Current amount of crawled tagged posts by user "{self.name}":{len(results)}.')
        if not has_next_page:
            logger.info(f'[Done]The amount of crawled tagged posts by user "{self.name}":{len(results)}.')
            break
    return results
def get_comment_likers(self, comment_id, save=False, count=-1, delay=DELAY, tname=None, path=None):
    """Crawl the users who liked the comment identified by ``comment_id``.

    Args:
        comment_id: numeric id of the comment.
        save: when True, persist each liker edge via ``self.db.save``.
        count: maximum number of likers to collect; -1 means all.
        delay: seconds to wait between paginated requests.
        tname: table name passed through to ``self.db.save``.
        path: unused here; kept for signature compatibility.

    Returns:
        list: raw 'edge_liked_by' edges.
    """
    _count = 0
    results = []
    end_cursor = ''
    while 1:
        params = copy.deepcopy(COMMENT_LIKER_PARAMS)
        headers = copy.deepcopy(COMMON_HEADERS)
        params['query_hash'] = self.comment_liker_hash
        params['variables'] = params['variables'].replace('$', comment_id).replace('%', end_cursor)
        md5ed = md5(self.rhx_gis + ":" + params['variables'])
        headers['x-instagram-gis'] = md5ed
        response = send_request(API_USER_POSTS, session=self.session, json=True,
                                delay=delay, headers=headers, params=params)
        data = response.json()
        liker_card = data['data']['comment']['edge_liked_by']
        likers = liker_card['edges']
        page_info = liker_card['page_info']
        end_cursor = page_info['end_cursor']
        has_next_page = page_info['has_next_page']
        logger.info(
            f'Current grabbed users who liked comment({comment_id}):{len(likers)}.')
        for i in likers:
            # Save BEFORE the count check so the edge that satisfies the
            # requested count is persisted too (consistent with the other
            # crawlers, which save first).
            if save:
                self.db.save(i, tname=tname)
            _count += 1
            results.append(i)
            if _count >= count and (count > 0):
                logger.info(f'[Done]Total crawled users who liked comment({comment_id}) :{len(results)}')
                return results
        if not has_next_page:
            logger.info(f'[Done]Total crawled users who liked comment({comment_id}) :{len(results)}')
            return results
def get_channel_hash(self):
    """Extract and cache the channel query hash from the app-id JS bundle."""
    script_url = HOST + from_pattern(self.homepage, PATTERN_APP_ID_JS)
    script = send_request(script_url, proxy=PROXY_GLOBAL).text
    self._channel_hash = from_pattern(script, PATTERN_CHANNEL)
    return self._channel_hash
def get_liker_hash(self):
    """Extract and cache the media-liker query hash from the app-id JS bundle."""
    script_url = HOST + from_pattern(self.homepage, PATTERN_APP_ID_JS)
    script = send_request(script_url, proxy=PROXY_GLOBAL).text
    self._liker_hash = from_pattern(script, PATTERN_LIKER)
    return self._liker_hash
def get_web_app_id(self):
    """Extract and cache the web application id from the app-id JS bundle."""
    script_url = HOST + from_pattern(self.homepage, PATTERN_APP_ID_JS)
    script = send_request(script_url, proxy=PROXY_GLOBAL).text
    self._web_app_id = from_pattern(script, PATTERN_WEB_APP_ID)
    return self._web_app_id
def get_posts_by_tag(self, tag, delay=DELAY, top_only=True, count=-1, save=False, tname=None, path=None):
    """Crawl posts published under a hashtag.

    Args:
        tag: hashtag text (without the leading '#').
        delay: seconds to wait between paginated requests.
        top_only: when True, return only the tag's top posts.
        count: maximum number of posts to collect; -1 means all.
        save: when True, persist each post edge via ``self.db.save``.
        tname: table name passed through to ``self.db.save``.
        path: unused here; kept for signature compatibility.

    Returns:
        list: top-post edges when ``top_only``, otherwise paginated
        'edge_hashtag_to_media' edges.
    """
    url = API_TAG_POSTS.format(tag=tag)
    response = send_request(url, json=True)
    data = response.json()
    hashtags = data['graphql']['hashtag']
    media_posts = hashtags['edge_hashtag_to_media']
    top_posts = hashtags['edge_hashtag_to_top_posts']['edges']
    total = media_posts['count']
    current_posts = media_posts['edges']
    page_info = media_posts['page_info']
    end_cursor = page_info['end_cursor']
    has_next_page = page_info['has_next_page']
    results = []
    _count = 0
    _check = count if count > 0 else total
    # Copy the template so the shared COMMON_HEADERS dict is not mutated
    # with per-call values ('x-ig-app-id', 'x-instagram-gis').
    headers = copy.deepcopy(COMMON_HEADERS)
    headers['x-ig-app-id'] = self.app_id
    logger.info(f'Total posts of tag "{tag}":{total}')
    if top_only:
        for i in top_posts:
            if save:
                self.db.save(i, tname=tname)
        return top_posts
    for i in current_posts:
        # Save BEFORE the count check so the post that reaches the limit
        # is persisted too (consistent with the other crawlers).
        if save:
            self.db.save(i, tname=tname)
        _count += 1
        results.append(i)
        if (_count >= count or _count >= total) and (count > 0):
            logger.info(f'[Done]Total crawled posts of tag "{tag}":{len(results)}')
            return results
    while 1:
        if not has_next_page:
            return results
        params = copy.deepcopy(TAG_PARAMS)
        params['query_hash'] = self.tag_hash
        # '$' / '%' are placeholders for tag / cursor in the template.
        params['variables'] = params['variables'].replace('$', tag).replace('%', end_cursor)
        md5ed = md5(self.rhx_gis + ":" + params['variables'])
        headers['x-instagram-gis'] = md5ed
        response = send_request(API_USER_POSTS, params=params, delay=delay,
                                headers=headers, json=True)
        data = response.json()
        hashtags = data['data']['hashtag']
        media_posts = hashtags['edge_hashtag_to_media']
        current_posts = media_posts['edges']
        page_info = media_posts['page_info']
        end_cursor = page_info['end_cursor']
        has_next_page = page_info['has_next_page']
        logger.info(f'Amount of current crawled posts of tag "{tag}"'
                    f':{len(results)}.[{round(len(results)/_check,4)*100 if _check else 0}%]')
        for i in current_posts:
            if save:
                self.db.save(i, tname=tname)
            _count += 1
            results.append(i)
            if (_count >= count or _count >= total) and (count > 0):
                logger.info(f'[Done]Total crawled posts of tag "{tag}":{len(results)}')
                return results
def get_query_ids(self):
    """Extract and cache all post query ids from the posts JS bundle."""
    script_url = HOST + from_pattern(self.homepage, PATTERN_POSTS_JS)
    script = send_request(script_url, proxy=PROXY_GLOBAL).text
    self._queryIds = from_pattern(script, PATTERN_POSTS, allget=True)
    return self._queryIds
def download_image(self, url, path=None):
    """Download the image at ``url`` and write its bytes to ``path``.

    The request is performed BEFORE the destination file is opened, so a
    failed download no longer leaves behind an empty/truncated file (the
    original opened the file first, truncating it even on request failure).

    Args:
        url: direct link to the image.
        path: destination file path. NOTE(review): the None default is kept
            for signature compatibility, but open(None) would raise --
            callers appear to always pass a path.
    """
    response = send_request(url)
    with open(path, 'wb') as f:
        f.write(response.content)
def wrapper(self, *args, **kwargs):
    """Decorator body for logged-in API operations.

    Relies on closure variables from the enclosing decorator factory:
    ``opt`` (operation key into APIS), ``api``, ``login``, ``produce``,
    ``method``, ``data``, ``params``, ``func``, ``callback`` and ``out``.
    NOTE(review): the exact semantics of those flags cannot be confirmed
    from this chunk -- verify against the decorator definition.
    """
    url = APIS[opt] if api is None else api
    if login:
        # NOTE(review): mutates self.logined_headers in place (no copy);
        # presumably intentional so the csrf token persists -- confirm.
        headers = self.logined_headers
        cookies = self.session.cookies.get_dict()
        headers['x-csrftoken'] = cookies['csrftoken']
    else:
        headers = copy.deepcopy(COMMON_HEADERS)
    if not produce:
        # Simple mode: fire the request first, expose the JSON to the
        # wrapped function through self.__opt__, then call it.
        response = send_request(url, session=self.session, headers=headers,
                                method=method, data=data, params=params,
                                json=True, delay=DELAY, proxy=PROXY_GLOBAL)
        res = response.json()
        self.__opt__ = res
        ret = func(self, *args, **kwargs)
    else:
        # Produce mode: the wrapped function builds the request payload
        # (data_dict) and the log messages (tips) BEFORE the request.
        ret = True
        data_dict, tips = func(self, *args, **kwargs)
        url = data_dict.get('url') if data_dict.get('url') else url
        headers.update(data_dict.get('headers', {}))
        if opt == 'create_post':
            # Let the HTTP client regenerate these for the multipart body.
            headers.pop('Content-Type')
            headers.pop('content-length')
        response = send_request(url, session=self.session, headers=headers,
                                method=method,
                                params=data_dict.get('params', None),
                                data=data_dict.get('data', None),
                                proxy=PROXY_GLOBAL, json=True, delay=DELAY,
                                **data_dict.get('http_kwargs', {}))
        if response is None:
            return
        res = response.json()
        if callback and callable(callback):
            # A callback short-circuits all of the generic result handling.
            cb_args = data_dict.get('cb_kwargs', {})
            return callback(self, res, **cb_args)
        if res and (res.get('status', '') == 'ok' or res.get('graphql')):
            if opt == 'reset_password':
                # Remember the new password for subsequent logins.
                self.pwd = data_dict['data']['new_password1']
            if opt == 'upload_pic':
                if not res.get('has_profile_pic'):
                    if data_dict['data']:
                        if not res.get('upload_id'):
                            logger.info(tips['failed'])
                        else:
                            logger.info(tips['ok'])
                            return res.get('upload_id')
                    else:
                        logger.info(tips['ok'])
                        return ret
            logger.info(tips['ok'])
            if opt == 'create_post':
                logger.info(
                    f"Posted media id:{res.get('media').get('pk')}")
            return res
        else:
            logger.info(tips['failed'])
            logger.info(f"error:{res['message']}")
            ret = False
    self.__opt__ = None
    if out:
        return res
    return ret