def scrapeVideos(username="", password="", output_folder="", days=1):
    """Download recent videos from every account the given user follows.

    Logs into Instagram twice: once with instaloader (to enumerate the
    followee list) and once per followee with instalooter's ProfileLooter
    (to download their videos).

    Args:
        username: Instagram username to log in as.
        password: Password for *username*.
        output_folder: Destination directory for downloaded videos.
        days: Lookback window; only posts from the last *days* days
            are downloaded (default 1).
    """
    print("Starting Scraping")
    loader = instaloader.Instaloader()
    # Login or load session for loader
    loader.login(username, password)
    profile = instaloader.Profile.from_username(loader.context, username)
    following = profile.get_followees()
    print(following)

    # instalooter expects the timeframe as (newest, oldest); it is the
    # same for every followee, so compute it once outside the loop.
    today = datetime.date.today()
    timeframe = (today, today - dateutil.relativedelta.relativedelta(days=days))

    # NOTE: renamed the loop variable — the original reused `profile`,
    # shadowing the logged-in user's profile above.
    for followee in following:
        acc = followee.username
        looter = ProfileLooter(acc, videos_only=True, template="{id}-{username}-{width}-{height}")
        if not looter.logged_in():
            looter.login(username, password)
        print(f"Scraping From Account: {acc}")
        num_downloaded = looter.download(output_folder, media_count=30, timeframe=timeframe)
        print(f"Downloaded {num_downloaded} videos successfully")
        print("")
class TestLogin(unittest.TestCase):
    """Integration tests for ProfileLooter session handling and download."""

    def setUp(self):
        self.looter = ProfileLooter(USERNAME, template="test")
        self.destfs = fs.memoryfs.MemoryFS()

    def tearDown(self):
        self.destfs.close()

    def test_login(self):
        looter = self.looter
        cookie = looter._COOKIE_FILE

        # Fresh looter: logged out, no cached cookie, medias() refused.
        self.assertFalse(looter.logged_in())
        with self.assertRaises(RuntimeError):
            looter.medias()
        self.assertFalse(looter._cachefs.exists(cookie))

        try:
            looter.login(USERNAME, PASSWORD)
            # Logging in must both flip the state and persist the cookie.
            self.assertTrue(looter.logged_in())
            self.assertTrue(looter._cachefs.exists(cookie))
            self.assertTrue(next(looter.medias()))
        finally:
            looter.logout()

        # Logout must remove the cached cookie again.
        self.assertFalse(looter._cachefs.exists(cookie))

    def test_download(self):
        try:
            self.looter.login(USERNAME, PASSWORD)
            self.looter.download(self.destfs)
            self.assertTrue(self.destfs.exists('test.jpg'))
            # Bytes 6-9 of a JFIF-encoded JPEG spell out b'JFIF'.
            header = self.destfs.getbytes('test.jpg')
            self.assertEqual(header[6:10], b'JFIF')
        finally:
            self.looter.logout()
class TestLogin(unittest.TestCase):
    """Login/download tests that pin the looter's User-Agent to requests'."""

    @classmethod
    def setUpClass(cls):
        # Borrow the stock requests User-Agent for every looter in the class.
        cls.session = requests.Session()
        InstaLooter._user_agent = cls.session.headers["User-Agent"]

    @classmethod
    def tearDownClass(cls):
        cls.session.close()
        # Restore the class attribute we overrode in setUpClass.
        del InstaLooter._user_agent

    def setUp(self):
        self.looter = ProfileLooter(USERNAME, template="test")
        self.destfs = fs.memoryfs.MemoryFS()

    def tearDown(self):
        self.destfs.close()

    def _cookie_cached(self):
        # True when the looter has persisted its session cookie to the cache fs.
        return self.looter._cachefs.exists(self.looter._COOKIE_FILE)

    def test_login(self):
        # Initially anonymous: no session, no cookie, medias() rejected.
        self.assertFalse(self.looter.logged_in())
        self.assertRaises(RuntimeError, self.looter.medias)
        self.assertFalse(self._cookie_cached())
        try:
            self.looter.login(USERNAME, PASSWORD)
            self.assertTrue(self.looter.logged_in())
            self.assertTrue(self._cookie_cached())
            self.assertTrue(next(self.looter.medias()))
        finally:
            self.looter.logout()
        # Logging out discards the persisted cookie.
        self.assertFalse(self._cookie_cached())

    def test_download(self):
        try:
            self.looter.login(USERNAME, PASSWORD)
            self.looter.download(self.destfs)
            self.assertTrue(self.destfs.exists('test.jpg'))
            # JFIF magic lives at offsets 6..9 of the JPEG header.
            header = self.destfs.getbytes('test.jpg')
            self.assertEqual(header[6:10], b'JFIF')
        finally:
            self.looter.logout()
import datetime

import dateutil.relativedelta
from instalooter.looters import ProfileLooter

# instalooter_test: download videos posted by daquan during the last 28 days.

# The timeframe tuple runs from the newest date down to the oldest.
end = datetime.date.today()
start = end - dateutil.relativedelta.relativedelta(days=28)

# Build a video-only looter for the account and grab up to 50 posts.
meme_looter = ProfileLooter("daquan", videos_only=True, template="{id}-{username}-{width}-{height}")
meme_looter.login("", "")
meme_looter.download('./Memes_December_4', media_count=50, timeframe=(end, start))
class InstagramFeedMediaChannelMixin(object):
    """Mixin for an Instagram-profile-backed media channel.

    Wraps an instalooter ``ProfileLooter`` to page through a profile's
    timeline, translating GraphQL post nodes into listing dicts and
    persisting a resume cursor in ``self.attrs``.
    """

    LISTING_CLASS = InstagramMediaListing

    # GraphQL __typename -> internal media-type tag.
    POST_TYPE_MAP = {
        "GraphImage": "image",
        "GraphVideo": "video",
        "GraphSidecar": "carousel"
    }

    # Lazily-built ProfileLooter cache (see the `looter` property).
    looter_: typing.Any = None

    @property
    @db_session
    def end_cursor(self):
        # [timestamp, cursor] pair saved by save_end_cursor, or None.
        return self.attrs.get("end_cursor", None)

    @db_session
    def save_end_cursor(self, timestamp, end_cursor):
        """Persist the pagination cursor together with its post timestamp."""
        self.attrs["end_cursor"] = [timestamp, end_cursor]
        commit()

    @property
    def looter(self):
        """Return a ProfileLooter for this channel, (re)building it when the
        locator changes. Logs in when credentials are configured."""
        # The redundant hasattr() check was dropped: looter_ is a class
        # attribute, so it always exists.
        if not self.looter_ or self.looter_._username != self.locator[1:]:
            self.looter_ = ProfileLooter(self.locator[1:])
            # FIX: logged_in is callable (called as logged_in() elsewhere in
            # this codebase); `not self.looter_.logged_in` tested the bound
            # method object itself and was always False, so login never ran.
            if self.provider.config.credentials and not self.looter_.logged_in():
                self.looter_.login(**self.provider.session_params)
        return self.looter_

    def get_post_info(self, shortcode):
        return self.looter.get_post_info(shortcode)

    @property
    def posts(self):
        """Total post count reported by Instagram's profile JSON endpoint."""
        url = f"https://www.instagram.com/{self.locator[1:]}/?__a=1"
        data = self.looter.session.get(url).json()
        return data["graphql"]["user"]["edge_owner_to_timeline_media"]["count"]

    def extract_content(self, post):
        """Convert a post node into a list of source dicts, one per media.

        Raises Exception for unmapped media types.
        """
        media_type = self.POST_TYPE_MAP[post["__typename"]]
        if media_type == "image":
            content = [
                dict(
                    url = post.display_url,
                    media_type = media_type,
                    shortcode = post.shortcode
                )
            ]
        elif media_type == "video":
            if post.get("video_url"):
                content = [
                    dict(
                        url = post.video_url,
                        url_thumbnail = post.display_url,
                        media_type = media_type,
                        shortcode = post.shortcode
                    )
                ]
            else:
                # No direct video URL yet: keep a thumbnail-only placeholder.
                content = [
                    dict(
                        url = None,
                        url_thumbnail = post.display_url,
                        media_type = media_type,
                        shortcode = post.shortcode
                    )
                ]
        elif media_type == "carousel":
            if post.get('edge_sidecar_to_children'):
                content = [
                    dict(
                        url = s.video_url if s.is_video else s.display_url,
                        url_thumbnail = s.display_url,
                        media_type = "video" if s.is_video else "image",
                        shortcode = post.shortcode
                    )
                    for s in [AttrDict(e['node']) for e in post['edge_sidecar_to_children']['edges']]
                ]
            else:
                content = [
                    dict(
                        url = None,
                        url_thumbnail = post.display_url,
                        media_type = media_type
                    )
                ]
        else:
            raise Exception(f"invalid media type: {media_type}")
        return content

    async def fetch(self, limit=None, resume=False, replace=False):
        """Yield listing dicts for new posts on this channel.

        Args:
            limit: stop after considering this many posts (None = unbounded).
            resume: continue from the persisted end_cursor.
            replace: also yield posts already present in the database.
        """
        logger.info(f"fetching {self.locator} {resume}, {replace}")
        # update cached post count
        with db_session:
            self.attrs["posts"] = self.posts

        # EAFP: end_cursor is [timestamp, cursor] or None; unpacking None
        # raises TypeError, which falls back to a fresh (cursorless) fetch.
        try:
            (_, end_cursor) = self.end_cursor if resume else None
        except TypeError:
            end_cursor = None
        logger.info(f"cursor: {end_cursor}")

        try:
            self.pages = self.looter.pages(cursor=end_cursor)
        except ValueError:
            # Stale session: force a re-login and retry once.
            self.looter_.logout()
            self.looter_.login(
                username=self.provider.session_params["username"],
                password=self.provider.session_params["password"],
            )
            self.pages = self.looter.pages(cursor=end_cursor)

        def get_posts(pages):
            # Yield (cursor, post) pairs; bail out quietly on malformed JSON.
            try:
                for page in pages:
                    cursor = page["edge_owner_to_timeline_media"]["page_info"]["end_cursor"]
                    for media in self.looter._medias(iter([page])):
                        yield (cursor, AttrDict(media))
            except json.decoder.JSONDecodeError:
                logger.error("".join(traceback.format_exc()))
                # FIX: `raise StopIteration` inside a generator is a
                # RuntimeError under PEP 479; `return` ends iteration cleanly.
                return

        count = 0
        new_count = 0
        # NOTE(review): run_in_executor only *creates* the generator in the
        # worker thread; the iteration below still runs on the event loop —
        # confirm this off-loading is intentional.
        posts = state.event_loop.run_in_executor(
            None, get_posts, self.pages
        )
        for end_cursor, post in await posts:
            count += 1
            logger.info(f"cursor: {end_cursor}")
            logger.debug(f"{count} {new_count} {limit}")
            # FIX: guard against limit=None (the default), which previously
            # raised TypeError on the first comparison.
            if limit is not None and (new_count >= limit or (new_count == 0 and count >= limit)):
                break
            created_timestamp = post.get(
                "date", post.get("taken_at_timestamp")
            )
            # Persist the oldest-seen cursor so a later resume starts there.
            if end_cursor and (self.end_cursor is None or created_timestamp < self.end_cursor[0]):
                logger.info(f"saving end_cursor: {created_timestamp}, {self.end_cursor[0] if self.end_cursor else None}")
                self.save_end_cursor(created_timestamp, end_cursor)
            created = datetime.utcfromtimestamp(created_timestamp)
            i = self.items.select(lambda i: i.guid == post.shortcode).first()
            if i and not replace:
                # Already ingested; assume the feed is chronological and stop.
                logger.debug(f"old: {created}")
                return
            else:
                logger.debug(f"new: {created}")
            caption = (
                post["edge_media_to_caption"]["edges"][0]["node"]["text"]
                if "edge_media_to_caption" in post
                and post["edge_media_to_caption"]["edges"]
                else post["caption"] if "caption" in post
                else None
            )
            try:
                media_type = self.POST_TYPE_MAP[post["__typename"]]
            except KeyError:
                # FIX: narrowed the bare except to the lookup failure it
                # handles, and replaced deprecated logger.warn.
                logger.warning(f"unknown post type: {post.__typename}")
                continue
            content = self.extract_content(post)
            i = dict(
                channel = self,
                guid = post.shortcode,
                title = (caption or "(no caption)").replace("\n", " "),
                created = created,
                media_type = media_type,
                sources = content,
                attrs = dict(
                    short_code = post.shortcode
                ),
                is_inflated = media_type == "image"
            )
            new_count += 1
            yield i

    @db_session
    def reset(self):
        """Clear any saved per-channel pagination state."""
        super().reset()
        if "post_iter" in self.attrs:
            del self.attrs["post_iter"]
        commit()
os.makedirs(ThumbsFilePath) UserFilePath='./users/' if not os.path.exists(UserFilePath): os.makedirs(UserFilePath) img_src=[] #Grab all the thumbnails for i in range(0,len(img)): img_src.append(img[i].get_attribute('src')) # print(img_src) os.system('wget -q -O '+ThumbsFilePath+followinglist[i]+'.jpg '+img_src[i]+' &') #Close selenium driver.quit() #Login into instagram looter=ProfileLooter("instagram") looter.login(username_,password_) #Loop through all the people who are being followed and grab their photo urls for i in followinglist: try: print(i) i = i.strip() looter=ProfileLooter(i) with open(UserFilePath+i+".txt", "a") as output: for media in looter.medias(): for link in instalinks(media,looter): if not (os.path.isfile(UserFilePath+i+"/"+link.split('/')[-1])): print(link) output.write("{}\n".format(link)) else: print("Image already exists") #Wget from the file