def setUp(self):
    fixtures_path = os.path.join(os.path.dirname(__file__), 'fixtures')
    fixture_files = glob.glob(os.path.join(fixtures_path, '*'))

    for file_path in fixture_files:
        basename = os.path.splitext(os.path.basename(file_path))[0]
        self.__dict__[basename] = open(file_path).read()

    # This is a max id of the last item in response_first_page.json.
    self.max_id = "1369793132326237681_50955533"

    self.test_dir = tempfile.mkdtemp()

    args = {
        'usernames': ['test'],
        'destination': self.test_dir,
        'login_user': None,
        'login_pass': None,
        'quiet': True,
        'maximum': 0,
        'retain_username': False,
        'media_metadata': False,
        'media_types': ['image', 'video', 'story'],
        'latest': False
    }

    self.scraper = InstagramScraper(**args)
def IG_train(logo_brand, maxImages, outDir):
    '''
    Scrapes up to maxImages images for the logo_brand hashtag and saves them
    to a directory named <logo_brand>.
    This only needs to scrape pictures.
    '''
    destinationFolder = './datasets/'  # append this with another argument, which will separate the images into folders
    args = {
        'username': [logo_brand],
        'verbose': 0,
        'login_user': None,
        'usernames': [logo_brand],
        'quiet': True,
        'tag': True,
        'retain_username': True,
        'media_types': ['image'],
        'media_metadata': False,
        'login_only': False,
        'destination': outDir,
        'maximum': maxImages,
        'filename': None,
        'filter': None,
        'location': False,
        'login_pass': None,
        'latest': False,
        'logo_name': logo_brand
    }
    scraper = InstagramScraper(**args)
    scraper.scrape_hashtag()
def IG_train(logo_brand, maxImages):
    '''
    Scrapes up to maxImages images for the logo_brand hashtag and saves them
    to a directory named <logo_brand>.
    This only needs to scrape pictures.
    '''
    destinationFolder = './'
    args = {
        'username': [logo_brand],
        'verbose': 0,
        'login_user': None,
        'usernames': [logo_brand],
        'quiet': False,
        'tag': True,
        'retain_username': True,
        'media_types': ['image'],
        'media_metadata': False,
        'login_only': False,
        'destination': destinationFolder,
        'maximum': maxImages,
        'filename': None,
        'filter': None,
        'location': False,
        'login_pass': None,
        'latest': False,
        'logo_name': logo_brand
    }
    scraper = InstagramScraper(**args)
    scraper.scrape_hashtag()
    print(str(maxImages) + " pictures from #" + logo_brand + " saved in " + logo_brand + " folder")
    print("Please ensure all pictures in " + logo_brand + " dir contain the " + logo_brand + " logo")
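# A minimal usage sketch for the IG_train helper above. The brand name and
# image count are hypothetical; it assumes the instagram_scraper package is
# importable and the hashtag is publicly accessible:
#
#     IG_train('examplebrand', 50)
#
# With tag=True and retain_username=True, the scraped images should land in a
# subdirectory named after the hashtag (here './examplebrand').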
class InstagramTests(unittest.TestCase):

    def setUp(self):
        fixtures_path = os.path.join(os.path.dirname(__file__), 'fixtures')
        self.response_first_page = open(
            os.path.join(fixtures_path, 'response_first_page.json')).read()
        self.response_second_page = open(
            os.path.join(fixtures_path, 'response_second_page.json')).read()
        self.test_dir = tempfile.mkdtemp()
        # This is a max id of the last item in response_first_page.json.
        self.max_id = "1369793132326237681_50955533"
        self.scraper = InstagramScraper("test", dst=self.test_dir, quiet=True)

    def tearDown(self):
        shutil.rmtree(self.test_dir)

    def test_scrape(self):
        with requests_mock.Mocker() as m:
            m.get(MEDIA_URL.format(self.scraper.username),
                  text=self.response_first_page)
            m.get(MEDIA_URL.format(self.scraper.username) + '?&max_id=' + self.max_id,
                  text=self.response_second_page)
            m.get('https://fake-url.com/photo1.jpg', text="image1")
            m.get('https://fake-url.com/photo2.jpg', text="image2")
            m.get('https://fake-url.com/photo3.jpg', text="image3")

            self.scraper.scrape()

            # First page has photo1 and photo2, while second page has photo3.
            # If photo3 is opened, the generator successfully traversed both pages.
            self.assertEqual(
                open(os.path.join(self.test_dir, 'photo3.jpg')).read(), "image3")
class InstagramTests(unittest.TestCase):

    def setUp(self):
        fixtures_path = os.path.join(os.path.dirname(__file__), 'fixtures')
        fixture_files = glob.glob(os.path.join(fixtures_path, '*'))

        for file_path in fixture_files:
            basename = os.path.splitext(os.path.basename(file_path))[0]
            self.__dict__[basename] = open(file_path).read()

        # This is a max id of the last item in response_first_page.json.
        self.max_id = "1369793132326237681_50955533"

        self.test_dir = tempfile.mkdtemp()

        args = {
            'usernames': ['test'],
            'destination': self.test_dir,
            'login_user': None,
            'login_pass': None,
            'quiet': True,
            'maximum': 0,
            'retain_username': False,
            'media_metadata': False,
            'media_types': ['image', 'video', 'story'],
            'latest': False
        }

        self.scraper = InstagramScraper(**args)

    def tearDown(self):
        shutil.rmtree(self.test_dir)

    def test_scrape(self):
        with requests_mock.Mocker() as m:
            m.get(BASE_URL + self.scraper.usernames[0],
                  text=self.response_user_metadata)
            m.get(MEDIA_URL.format(self.scraper.usernames[0]),
                  text=self.response_first_page)
            m.get(MEDIA_URL.format(self.scraper.usernames[0]) + '?max_id=' + self.max_id,
                  text=self.response_second_page)
            m.get('https://fake-url.com/photo1.jpg', text="image1")
            m.get('https://fake-url.com/photo2.jpg', text="image2")
            m.get('https://fake-url.com/photo3.jpg', text="image3")

            self.scraper.scrape()

            # First page has photo1 and photo2, while second page has photo3.
            # If photo3 is opened, the generator successfully traversed both pages.
            self.assertEqual(open(os.path.join(self.test_dir, 'photo3.jpg')).read(), "image3")

    def test_scrape_hashtag(self):
        with requests_mock.Mocker() as m:
            m.get(QUERY_HASHTAG.format(self.scraper.usernames[0], ''),
                  text=self.response_query_hashtag_first_page, status_code=200)
            m.get(QUERY_HASHTAG.format(self.scraper.usernames[0], 'J0'),
                  text=self.response_query_hashtag_second_page, status_code=200)
            m.get('https://fake-url.com/photo4.jpg', text="image4")

            self.scraper.scrape_hashtag()

            self.assertEqual(open(os.path.join(self.test_dir, 'photo4.jpg')).read(), "image4")

        shutil.rmtree(self.test_dir)
def setUp(self):
    fixtures_path = os.path.join(os.path.dirname(__file__), 'fixtures')
    self.response_first_page = open(
        os.path.join(fixtures_path, 'response_first_page.json')).read()
    self.response_second_page = open(
        os.path.join(fixtures_path, 'response_second_page.json')).read()
    self.test_dir = tempfile.mkdtemp()
    # This is a max id of the last item in response_first_page.json.
    self.max_id = "1369793132326237681_50955533"
    self.scraper = InstagramScraper("test", dst=self.test_dir, quiet=True)
def scrape_tag(tag, directory, num_images=50):
    """
    Scrape Instagram and download hashtag photos.

    :param tag: Instagram hashtag
    :param directory: destination directory
    :param num_images: number of images
    """
    args = {
        'usernames': [tag],
        'destination': directory,
        'login_user': None,
        'login_pass': None,
        'quiet': True,
        'maximum': num_images,
        'retain_username': False,
        'media_metadata': False,
        'media_types': ['image'],
        'latest': False
    }
    scraper = InstagramScraper(**args)
    scraper.authenticate_as_guest()
    scraper.scrape_hashtag()
    scraper.save_cookies()
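# A minimal usage sketch for scrape_tag above. The tag and destination path
# are hypothetical; it assumes anonymous (guest) access to the hashtag
# endpoint still works without logging in:
#
#     scrape_tag('sunset', './downloads/sunset', num_images=25)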
def IG_operate(logo_brand, hashtagList, maxImages):
    '''
    Scrapes maxImages images from each hashtag in hashtagList.
    Includes relevant metadata.
    Builds a dictionary with constants defined at the top of the file.
    DOES NOT save on the hard drive; compresses and serializes.
    Calls Lucas's functions.
    '''
    config = NFS_Controller_Config(STORAGE_ACCOUNT_NAME, STORAGE_ACCOUNT_KEY)
    ic = InputController(config)
    print "max in here" + str(maxImages)

    # saves it to directory
    ipe = InstagramPostEntities(isClassification=True)
    for tag in hashtagList:
        args = {  # use a list here instead of a loop
            'username': [str(tag)],
            'verbose': 0,
            'login_user': None,
            'usernames': [str(tag)],
            'quiet': False,
            'tag': True,
            'retain_username': True,
            'include_location': True,
            'media_types': ['image'],
            'media_metadata': True,
            'search_location': False,
            'login_only': False,
            'destination': './',
            'maximum': int(maxImages),
            'comments': False,
            'filename': None,
            'filter': None,
            'location': False,
            'login_pass': None,
            'latest': False,
            'logo_name': logo_brand
        }
        print "before scraper"
        scraper = InstagramScraper(**args)
        print "after scraper"
        ipe.extend(scraper.logout())
        print "after extend"

    print("Operate complete")
    # print(ipe.serialize())
    # lsc.upload_brand_operational_input_data(logo_brand, ipe.serialize(), isProcessed=False)
    ic.upload_brand_operational_input_IPE(logo_brand, ipe, False)
def IG_operate(logo_brand, hashtagList, maxImages):
    '''
    Scrapes maxImages images from each hashtag in hashtagList.
    Includes relevant metadata.
    Builds a dictionary with constants defined at the top of the file.
    DOES NOT save on the hard drive; compresses and serializes.
    Calls Lucas's functions.
    '''
    # saves it to directory
    picList = []  # list of dictionaries of metadata + string version of list
    destinationFolder = './'
    for tag in hashtagList:
        args = {  # use a list here instead of a loop
            'username': [tag],
            'verbose': 0,
            'login_user': None,
            'usernames': [tag],
            'quiet': False,
            'tag': True,
            'retain_username': True,
            'include_location': True,
            'media_types': ['image'],
            'media_metadata': True,
            'search_location': False,
            'login_only': False,
            'destination': destinationFolder,
            'maximum': maxImages,
            'comments': False,
            'filename': None,
            'filter': None,
            'location': False,
            'login_pass': None,
            'latest': False,
            'logo_name': logo_brand
        }
        scraper = InstagramScraper(**args)
        listScraperReturned = scraper.scrape_hashtag_operate()
        if listScraperReturned is not None:
            picList.extend(listScraperReturned)

    pickledPicList = data_transformer.toPickle(picList)

    # for testing, call this to unserialize
    data_transformer.unserialize_operate(pickledPicList)

    # call Luca's function here
    print "Operate complete"
def do_scrape(self):
    self.log.info('Scraping IG as ' + self.app.conf['SCRAPE_IG_USER'])
    scraper = InstagramScraper(
        login_user=self.app.conf['SCRAPE_IG_USER'],
        login_pass=self.app.conf['SCRAPE_IG_PASS'],
        usernames=self.app.conf['SCRAPE_IG_FRIENDS'],
        destination=self.app.conf['IMAGES_REVIEW_DIR'],
        retain_username=True,
        media_types=['image', 'story-image'],
        maximum=self.app.conf['SCRAPE_IG_MAX_PER_FRIEND'],
        latest_stamps=os.path.join(self.app.conf['STORAGE_ROOT'], 'ig_user_stamps.ini'),
    )
    scraper.scrape()
    self.log.info('Done scraping IG without errors as ' + self.app.conf['SCRAPE_IG_USER'])
def setUp(self):
    fixtures_path = os.path.join(os.path.dirname(__file__), 'fixtures')
    self.response_user_metadata = open(
        os.path.join(fixtures_path, 'response_user_metadata.json')).read()
    self.response_first_page = open(
        os.path.join(fixtures_path, 'response_first_page.json')).read()
    self.response_second_page = open(
        os.path.join(fixtures_path, 'response_second_page.json')).read()
    self.response_explore_tags = open(
        os.path.join(fixtures_path, 'response_explore_tags.json')).read()
    self.response_query_hashtag_first_page = open(
        os.path.join(fixtures_path, 'response_query_hashtag_first_page.json')).read()
    self.response_query_hashtag_second_page = open(
        os.path.join(fixtures_path, 'response_query_hashtag_second_page.json')).read()
    self.response_view_media_video = open(
        os.path.join(fixtures_path, 'response_view_media_video.json')).read()

    # This is a max id of the last item in response_first_page.json.
    self.max_id = "1369793132326237681_50955533"

    self.test_dir = tempfile.mkdtemp()

    args = {
        'usernames': ['test'],
        'destination': self.test_dir,
        'login_user': None,
        'login_pass': None,
        'quiet': True,
        'maximum': 0,
        'retain_username': False,
        'media_metadata': False,
        'media_types': ['image', 'video', 'story'],
        'latest': False
    }

    self.scraper = InstagramScraper(**args)
class InstagramTests(unittest.TestCase):

    def setUp(self):
        fixtures_path = os.path.join(os.path.dirname(__file__), 'fixtures')
        self.response_user_metadata = open(os.path.join(fixtures_path, 'response_user_metadata.json')).read()
        self.response_first_page = open(os.path.join(fixtures_path, 'response_first_page.json')).read()
        self.response_second_page = open(os.path.join(fixtures_path, 'response_second_page.json')).read()
        self.test_dir = tempfile.mkdtemp()
        # This is a max id of the last item in response_first_page.json.
        self.max_id = "1369793132326237681_50955533"
        self.scraper = InstagramScraper("test", dst=self.test_dir, quiet=True)

    def tearDown(self):
        shutil.rmtree(self.test_dir)

    def test_scrape(self):
        with requests_mock.Mocker() as m:
            m.get(BASE_URL + self.scraper.usernames[0], text=self.response_user_metadata)
            m.get(MEDIA_URL.format(self.scraper.usernames[0]), text=self.response_first_page)
            m.get(MEDIA_URL.format(self.scraper.usernames[0]) + '?max_id=' + self.max_id,
                  text=self.response_second_page)
            m.get('https://fake-url.com/photo1.jpg', text="image1")
            m.get('https://fake-url.com/photo2.jpg', text="image2")
            m.get('https://fake-url.com/photo3.jpg', text="image3")

            self.scraper.scrape()

            # First page has photo1 and photo2, while second page has photo3.
            # If photo3 is opened, the generator successfully traversed both pages.
            self.assertEqual(open(os.path.join(self.test_dir, 'photo3.jpg')).read(), "image3")
def setUp(self):
    fixtures_path = os.path.join(os.path.dirname(__file__), 'fixtures')
    self.response_user_metadata = open(os.path.join(fixtures_path, 'response_user_metadata.json')).read()
    self.response_first_page = open(os.path.join(fixtures_path, 'response_first_page.json')).read()
    self.response_second_page = open(os.path.join(fixtures_path, 'response_second_page.json')).read()
    self.test_dir = tempfile.mkdtemp()
    # This is a max id of the last item in response_first_page.json.
    self.max_id = "1369793132326237681_50955533"
    self.scraper = InstagramScraper("test", dst=self.test_dir, quiet=True)
from Margatsni import application
from instagram_scraper import InstagramScraper
from flask import Flask, request, render_template, session, redirect, flash, send_file
from bs4 import BeautifulSoup
import os, requests, shutil, json, concurrent.futures, tqdm, re

LOGIN_URL = "https://www.instagram.com/accounts/login/ajax/"
logged_in = False
api = InstagramScraper(media_types=['image', 'story', 'video'], maximum=100)

'''------------------------------------------------------- page views ----------------------------------------------------'''


# main page
@application.route('/')
def index():
    return render_template('index.html')


# log-in page, will detect invalid logins
@application.route('/login', methods=['GET', 'POST'])
def login():
    session['logged_in'] = False
    if request.method == 'POST':
        session['login_user'] = request.form['username']
        session['login_pass'] = request.form['password']
        login_text, login = validateUser()
        if login_text.get('authenticated') and login.status_code == 200:
            api.login_user = session['login_user']
            api.login_pass = session['login_pass']
            api.login()
def make_dat_for_json(datas):
    if type(datas['picture_list']) == list:
        datas['picture_list'] = ','.join(datas['picture_list'])
    if type(datas['hashtag']) == list:
        datas['hashtag'] = ''.join(datas['hashtag'])
    return datas


if __name__ == '__main__':
    with open('privacy.txt', 'r') as f:
        id = f.readline()
        password = f.readline()
        feed_post_url = f.readline()
        image_post_url = f.readline()

    insta_scrap = InstagramScraper()
    ctl = InstagramScraperController()
    insta_scrap.login(id, password)

    with open('hash_tag.txt', 'r', encoding='UTF8') as f:
        tag_list = f.readlines()
        tag_list = list(map(lambda s: s.strip(), tag_list))

    for tag in tag_list:
        search_tag = tag
        now_time = datetime.now()
        insta_scrap.search_by_tag(search_tag)
        error_cnt = 0
        cnt = 0
        # Collect the feed information for this hashtag.
def main():
    parser = argparse.ArgumentParser(
        description="instagram-scraper scrapes and downloads an instagram user's photos and videos.",
        epilog=textwrap.dedent("""
        You can hide your credentials from the history, by reading your
        username from a local file:

        $ instagram-scraper @insta_args.txt user_to_scrape

        with insta_args.txt looking like this:
        -u=my_username
        -p=my_password

        You can add all arguments you want to that file, just remember to
        have one argument per line.

        Customize filename:
        by adding option --template or -T
        Default is: {urlname}
        And there are some option:
        {username}: Instagram user(s) to scrape.
        {shortcode}: post shortcode, but profile_pic and story are none.
        {urlname}: filename form url.
        {mediatype}: type of media.
        {datetime}: date and time that photo/video post on, format is: 20180101 01h01m01s
        {date}: date that photo/video post on, format is: 20180101
        {year}: format is: 2018
        {month}: format is: 01-12
        {day}: format is: 01-31
        {h}: hour, format is: 00-23h
        {m}: minute, format is 00-59m
        {s}: second, format is 00-59s
        """),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        fromfile_prefix_chars='@')

    parser.add_argument('username', help='Instagram user(s) to scrape', nargs='*')
    parser.add_argument('-limit', '-l', type=int,
                        help='Number of files to generate (default: %s)' % DEFAULT_DOWNLOAD_LIMIT)
    parser.add_argument('--destination', '-d', default='./', help='Download destination')
    parser.add_argument('--login-user', '--login_user', '-u', default=None, help='Instagram login user')
    parser.add_argument('--login-pass', '--login_pass', '-p', default=None, help='Instagram login password')
    parser.add_argument('--followings-input', '--followings_input', action='store_true', default=False,
                        help='Compile list of profiles followed by login-user to use as input')
    parser.add_argument('--followings-output', '--followings_output',
                        help='Output followings-input to file in destination')
    parser.add_argument('--filename', '-f', help='Path to a file containing a list of users to scrape')
    parser.add_argument('--quiet', '-q', default=False, action='store_true', help='Be quiet while scraping')
    parser.add_argument('--maximum', '-m', type=int, default=0, help='Maximum number of items to scrape')
    parser.add_argument('--retain-username', '--retain_username', '-n', action='store_true', default=False,
                        help='Creates username subdirectory when destination flag is set')
    parser.add_argument('--media-metadata', '--media_metadata', action='store_true', default=False,
                        help='Save media metadata to json file')
    parser.add_argument('--profile-metadata', '--profile_metadata', action='store_true', default=False,
                        help='Save profile metadata to json file')
    parser.add_argument('--proxies', default={},
                        help='Enable use of proxies, add a valid JSON with http or/and https urls.')
    parser.add_argument('--include-location', '--include_location', action='store_true', default=False,
                        help='Include location data when saving media metadata')
    parser.add_argument('--media-types', '--media_types', '-t', nargs='+', default=['image', 'video', 'story'],
                        help='Specify media types to scrape')
    parser.add_argument('--latest', action='store_true', default=False,
                        help='Scrape new media since the last scrape')
    parser.add_argument('--latest-stamps', '--latest_stamps', default=None,
                        help='Scrape new media since timestamps by user in specified file')
    parser.add_argument('--cookiejar', '--cookierjar', default=None,
                        help='File in which to store cookies so that they can be reused between runs.')
    parser.add_argument('--tag', action='store_true', default=False, help='Scrape media using a hashtag')
    parser.add_argument('--filter', default=None, help='Filter by tags in user posts', nargs='*')
    parser.add_argument('--location', action='store_true', default=False, help='Scrape media using a location-id')
    parser.add_argument('--search-location', action='store_true', default=False, help='Search for locations by name')
    parser.add_argument('--comments', action='store_true', default=False, help='Save post comments to json file')
    parser.add_argument('--no-check-certificate', action='store_true', default=False,
                        help='Do not use ssl on transaction')
    parser.add_argument('--interactive', '-i', action='store_true', default=False,
                        help='Enable interactive login challenge solving')
    parser.add_argument('--retry-forever', action='store_true', default=False,
                        help='Retry download attempts endlessly when errors are received')
    parser.add_argument('--verbose', '-v', type=int, default=0, help='Logging verbosity level')
    parser.add_argument('--template', '-T', type=str, default='{urlname}', help='Customize filename template')

    args = parser.parse_args()

    if (args.login_user and args.login_pass is None) or (args.login_user is None and args.login_pass):
        parser.print_help()
        raise ValueError('Must provide login user AND password')

    if not args.username and args.filename is None and not args.followings_input:
        parser.print_help()
        raise ValueError('Must provide username(s) OR a file containing a list of username(s) OR pass --followings-input')
    elif (args.username and args.filename) or (args.username and args.followings_input) or (args.filename and args.followings_input):
        parser.print_help()
        raise ValueError('Must provide only one of the following: username(s) OR a filename containing username(s) OR --followings-input')

    if args.tag and args.location:
        parser.print_help()
        raise ValueError('Must provide only one of the following: hashtag OR location')

    if args.tag and args.filter:
        parser.print_help()
        raise ValueError('Filters apply to user posts')

    if args.filename:
        args.usernames = InstagramScraper.parse_file_usernames(args.filename)
    else:
        args.usernames = InstagramScraper.parse_delimited_str(','.join(args.username))

    if args.media_types and len(args.media_types) == 1 and re.compile(r'[,;\s]+').findall(args.media_types[0]):
        args.media_types = InstagramScraper.parse_delimited_str(args.media_types[0])

    if args.retry_forever:
        global MAX_RETRIES
        MAX_RETRIES = sys.maxsize

    scraper = InstagramScraper(**vars(args))

    if args.login_user and args.login_pass:
        scraper.authenticate_with_login()
    else:
        scraper.authenticate_as_guest()

    if args.followings_input:
        scraper.usernames = list(scraper.query_followings_gen(scraper.login_user))
        if args.followings_output:
            with open(scraper.destination + scraper.followings_output, 'w') as file:
                for username in scraper.usernames:
                    file.write(username + "\n")
            # If not requesting anything else, exit
            if args.media_types == ['none'] and args.media_metadata is False:
                scraper.logout()
                return

    if args.tag:
        scraper.scrape_hashtag()
    elif args.location:
        scraper.scrape_location()
    elif args.search_location:
        scraper.search_locations()
    else:
        scraper.scrape()

    scraper.save_cookies()

    with open("imgurls.txt", "a", encoding="utf8") as f:
        f.write("{")

    number = args.limit
    if number is None or number == 0:
        number = 1000

    for username in args.usernames:
        org_path = username
        all_img = glob.glob(org_path + "/*.jpg")
        for org_img in all_img:
            outfile = org_img.replace(username + "\\", "")
            path = r'emoji'  # use your path
            all_emoji = glob.glob(path + "/*.png")
            all_count = number
            while all_count > 0:
                img = cv2.imread(org_img)
                img_cnt = random.randint(1, 6)
                height, width, channels = img.shape
                while img_cnt > 0:
                    id = randrange(33)
                    choose_emoji = all_emoji[id]
                    overlay_t = cv2.imread(choose_emoji, -1)
                    img = overlay_transparent(img, overlay_t,
                                              random.randint(0, width - 75),
                                              random.randint(0, height - 75),
                                              (75, 75))
                    img_cnt -= 1
                output = "result/" + str(outfile) + "-" + str(all_count) + ".png"
                cv2.imwrite(output, img)
                all_count -= 1

            org_path = 'result'
            all_images = glob.glob(org_path + "/*.png")
            arr = np.array_split(all_images, 5)

            with open("imgurls.txt", "a", encoding="utf8") as f:
                f.write("{")

            with Pool(processes=5) as pool:
                pool.map(UploadingImage, arr)

            files = glob.glob('result/*')
            for f in files:
                os.remove(f)

            with open("imgurls.txt", 'rb+') as filehandle:
                filehandle.seek(-1, os.SEEK_END)
                filehandle.truncate()

            with open("imgurls.txt", "a", encoding="utf8") as f:
                f.write("}|")

            print("image done:" + org_img)

    with open("imgurls.txt", 'rb+') as filehandle:
        filehandle.seek(-1, os.SEEK_END)
        filehandle.truncate()

    with open("imgurls.txt", "a", encoding="utf8") as f:
        f.write("}")
def initInstagramScraper(self, user_name):
    args = {'username': user_name}
    self.scraper = InstagramScraper(**args)
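# A minimal usage sketch for initInstagramScraper above. The username is
# hypothetical, and it assumes the surrounding class later drives the scrape
# through self.scraper (scrape() being the entry point used in the other
# snippets here):
#
#     self.initInstagramScraper('some_user')
#     self.scraper.scrape()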
def main():
    parser = argparse.ArgumentParser(
        description="instagram-scraper scrapes and downloads an instagram user's photos and videos.",
        epilog=textwrap.dedent("""
        You can hide your credentials from the history, by reading your
        username from a local file:

        $ instagram-scraper @insta_args.txt user_to_scrape

        with insta_args.txt looking like this:
        -u=my_username
        -p=my_password

        You can add all arguments you want to that file, just remember to
        have one argument per line.
        """),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        fromfile_prefix_chars='@')

    parser.add_argument('username', help='Instagram user(s) to scrape', nargs='*')
    parser.add_argument('--destination', '-d', default='./', help='Download destination')
    parser.add_argument('--login-user', '--login_user', '-u', default=None, help='Instagram login user')
    parser.add_argument('--login-pass', '--login_pass', '-p', default=None, help='Instagram login password')
    parser.add_argument('--login-only', '--login_only', '-l', default=False, action='store_true',
                        help='Disable anonymous fallback if login fails')
    parser.add_argument('--filename', '-f', help='Path to a file containing a list of users to scrape')
    parser.add_argument('--quiet', '-q', default=False, action='store_true', help='Be quiet while scraping')
    parser.add_argument('--maximum', '-m', type=int, default=0, help='Maximum number of items to scrape')
    parser.add_argument('--retain-username', '--retain_username', '-n', action='store_true', default=False,
                        help='Creates username subdirectory when destination flag is set')
    parser.add_argument('--media-metadata', '--media_metadata', action='store_true', default=False,
                        help='Save media metadata to json file')
    parser.add_argument('--include-location', '--include_location', action='store_true', default=False,
                        help='Include location data when saving media metadata')
    parser.add_argument('--media-types', '--media_types', '-t', nargs='+', default=['image', 'video', 'story'],
                        help='Specify media types to scrape')
    parser.add_argument('--latest', action='store_true', default=False,
                        help='Scrape new media since the last scrape')
    parser.add_argument('--latest-stamps', '--latest_stamps', default=None,
                        help='Scrape new media since timestamps by user in specified file')
    parser.add_argument('--tag', action='store_true', default=False, help='Scrape media using a hashtag')
    parser.add_argument('--filter', default=None, help='Filter by tags in user posts', nargs='*')
    parser.add_argument('--location', action='store_true', default=False, help='Scrape media using a location-id')
    parser.add_argument('--search-location', action='store_true', default=False, help='Search for locations by name')
    parser.add_argument('--comments', action='store_true', default=False, help='Save post comments to json file')
    parser.add_argument('--interactive', '-i', action='store_true', default=False,
                        help='Enable interactive login challenge solving')
    parser.add_argument('--retry-forever', action='store_true', default=False,
                        help='Retry download attempts endlessly when errors are received')
    parser.add_argument('--verbose', '-v', type=int, default=0, help='Logging verbosity level')

    args = parser.parse_args()

    if (args.login_user and args.login_pass is None) or (args.login_user is None and args.login_pass):
        parser.print_help()
        raise ValueError('Must provide login user AND password')

    if not args.username and args.filename is None:
        parser.print_help()
        raise ValueError('Must provide username(s) OR a file containing a list of username(s)')
    elif args.username and args.filename:
        parser.print_help()
        raise ValueError('Must provide only one of the following: username(s) OR a filename containing username(s)')

    if args.tag and args.location:
        parser.print_help()
        raise ValueError('Must provide only one of the following: hashtag OR location')

    if args.tag and args.filter:
        parser.print_help()
        raise ValueError('Filters apply to user posts')

    if args.filename:
        args.usernames = InstagramScraper.parse_file_usernames(args.filename)
    else:
        args.usernames = InstagramScraper.parse_delimited_str(','.join(args.username))

    if args.media_types and len(args.media_types) == 1 and re.compile(r'[,;\s]+').findall(args.media_types[0]):
        args.media_types = InstagramScraper.parse_delimited_str(args.media_types[0])

    if args.retry_forever:
        global MAX_RETRIES
        MAX_RETRIES = sys.maxsize

    scraper = InstagramScraper(**vars(args))

    # if args.tag:
    #     scraper.scrape_hashtag()
    # elif args.location:
    #     scraper.scrape_location()
    # elif args.search_location:
    #     scraper.search_locations()
    # else:
    #     scraper.scrape()

    get_simple_summary(scraper)
class InstagramTests(unittest.TestCase):

    def setUp(self):
        fixtures_path = os.path.join(os.path.dirname(__file__), 'fixtures')
        self.response_user_metadata = open(
            os.path.join(fixtures_path, 'response_user_metadata.json')).read()
        self.response_first_page = open(
            os.path.join(fixtures_path, 'response_first_page.json')).read()
        self.response_second_page = open(
            os.path.join(fixtures_path, 'response_second_page.json')).read()
        self.response_explore_tags = open(
            os.path.join(fixtures_path, 'response_explore_tags.json')).read()
        self.response_query_hashtag_first_page = open(
            os.path.join(fixtures_path, 'response_query_hashtag_first_page.json')).read()
        self.response_query_hashtag_second_page = open(
            os.path.join(fixtures_path, 'response_query_hashtag_second_page.json')).read()
        self.response_view_media_video = open(
            os.path.join(fixtures_path, 'response_view_media_video.json')).read()

        # This is a max id of the last item in response_first_page.json.
        self.max_id = "1369793132326237681_50955533"

        self.test_dir = tempfile.mkdtemp()

        args = {
            'usernames': ['test'],
            'destination': self.test_dir,
            'login_user': None,
            'login_pass': None,
            'quiet': True,
            'maximum': 0,
            'retain_username': False,
            'media_metadata': False,
            'media_types': ['image', 'video', 'story'],
            'latest': False
        }

        self.scraper = InstagramScraper(**args)

    def tearDown(self):
        shutil.rmtree(self.test_dir)

    def test_scrape(self):
        with requests_mock.Mocker() as m:
            m.get(BASE_URL + self.scraper.usernames[0], text=self.response_user_metadata)
            m.get(MEDIA_URL.format(self.scraper.usernames[0]), text=self.response_first_page)
            m.get(MEDIA_URL.format(self.scraper.usernames[0]) + '?max_id=' + self.max_id,
                  text=self.response_second_page)
            m.get('https://fake-url.com/photo1.jpg', text="image1")
            m.get('https://fake-url.com/photo2.jpg', text="image2")
            m.get('https://fake-url.com/photo3.jpg', text="image3")

            self.scraper.scrape()

            # First page has photo1 and photo2, while second page has photo3.
            # If photo3 is opened, the generator successfully traversed both pages.
            self.assertEqual(
                open(os.path.join(self.test_dir, 'photo3.jpg')).read(), "image3")

    def test_scrape_hashtag(self):
        with requests_mock.Mocker() as m:
            m.get(TAGS_URL.format('test'), text=self.response_explore_tags,
                  cookies={'csrftoken': 'token'})
            m.post(QUERY_URL, [
                {'text': self.response_query_hashtag_first_page, 'status_code': 200},
                {'text': self.response_query_hashtag_second_page, 'status_code': 200}
            ])
            m.get(VIEW_MEDIA_URL.format('code4'), text=self.response_view_media_video)
            m.get('https://fake-url.com/video.mp4', text="video")

            self.scraper.scrape_hashtag()

            self.assertEqual(
                open(os.path.join(self.test_dir, 'video.mp4')).read(), "video")
args = {
    'usernames': [userName],
    'destination': userName,
    # underscore keys so the dict can be unpacked as keyword arguments
    'login_user': login_user,
    'login_pass': login_pass,
    'quiet': True,
    'maximum': 0,
    'retain_username': False,
    'media_metadata': True,
    'media_types': ['image'],
    'latest': False,
    'profile_metadata': True
}
scraper = InstagramScraper(**args)
scraper.scrape()

with open(userName + "\\" + userName + ".json", encoding="utf8") as f:
    data = json.load(f)

document = Document()
document.add_heading('@' + userName, 0)
document.add_heading(data["GraphProfileInfo"]["info"]["biography"], 1)

cnt = 0
for i in range(len(data["GraphImages"])):
    if data["GraphImages"][i]["__typename"] == "GraphImage" or data["GraphImages"][i]["__typename"] == "GraphSidecar":
        cnt += 1
def scrape_photos(sourceUserFolder):
    parser = argparse.ArgumentParser(
        description="instagram-scraper scrapes and downloads an instagram user's photos and videos.",
        epilog=textwrap.dedent("""
        You can hide your credentials from the history, by reading your
        username from a local file:

        $ instagram-scraper @insta_args.txt user_to_scrape

        with insta_args.txt looking like this:
        -u=my_username
        -p=my_password

        You can add all arguments you want to that file, just remember to
        have one argument per line.

        Customize filename:
        by adding option --template or -T
        Default is: {urlname}
        And there are some option:
        {username}: Instagram user(s) to scrape.
        {shortcode}: post shortcode, but profile_pic and story are none.
        {urlname}: filename form url.
        {mediatype}: type of media.
        {datetime}: date and time that photo/video post on, format is: 20180101 01h01m01s
        {date}: date that photo/video post on, format is: 20180101
        {year}: format is: 2018
        {month}: format is: 01-12
        {day}: format is: 01-31
        {h}: hour, format is: 00-23h
        {m}: minute, format is 00-59m
        {s}: second, format is 00-59s
        """),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        fromfile_prefix_chars='@')

    parser.add_argument('username', help='Instagram user(s) to scrape', nargs='*')
    parser.add_argument('--destination', '-d', default=sourceUserFolder, help='Download destination')
    parser.add_argument('--login-user', '--login_user', '-u', default=None, help='Instagram login user')
    parser.add_argument('--login-pass', '--login_pass', '-p', default=None, help='Instagram login password')
    parser.add_argument('--followings-input', '--followings_input', action='store_true', default=False,
                        help='Compile list of profiles followed by login-user to use as input')
    parser.add_argument('--followings-output', '--followings_output',
                        help='Output followings-input to file in destination')
    parser.add_argument('--filename', '-f', help='Path to a file containing a list of users to scrape')
    parser.add_argument('--quiet', '-q', default=False, action='store_true', help='Be quiet while scraping')
    parser.add_argument('--maximum', '-m', type=int, default=0, help='Maximum number of items to scrape')
    parser.add_argument('--retain-username', '--retain_username', '-n', action='store_true', default=False,
                        help='Creates username subdirectory when destination flag is set')
    parser.add_argument('--media-metadata', '--media_metadata', action='store_true', default=False,
                        help='Save media metadata to json file')
    parser.add_argument('--profile-metadata', '--profile_metadata', action='store_true', default=False,
                        help='Save profile metadata to json file')
    parser.add_argument('--proxies', default={},
                        help='Enable use of proxies, add a valid JSON with http or/and https urls.')
    parser.add_argument('--include-location', '--include_location', action='store_true', default=False,
                        help='Include location data when saving media metadata')
    parser.add_argument('--media-types', '--media_types', '-t', nargs='+', default=['image', 'video', 'story'],
                        help='Specify media types to scrape')
    parser.add_argument('--latest', action='store_true', default=False,
                        help='Scrape new media since the last scrape')
    parser.add_argument('--latest-stamps', '--latest_stamps', default=None,
                        help='Scrape new media since timestamps by user in specified file')
    parser.add_argument('--cookiejar', '--cookierjar', default=None,
                        help='File in which to store cookies so that they can be reused between runs.')
    parser.add_argument('--tag', action='store_true', default=False, help='Scrape media using a hashtag')
    parser.add_argument('--filter', default=None, help='Filter by tags in user posts', nargs='*')
    parser.add_argument('--filter_location', default=None, nargs="*",
                        help='filter query by only accepting media with location filter as the location id')
    parser.add_argument('--filter_location_file', default=None, type=str,
                        help='file containing list of locations to filter query by')
    parser.add_argument('--location', action='store_true', default=False, help='Scrape media using a location-id')
    parser.add_argument('--search-location', action='store_true', default=False, help='Search for locations by name')
    parser.add_argument('--comments', action='store_true', default=False, help='Save post comments to json file')
    parser.add_argument('--no-check-certificate', action='store_true', default=False,
                        help='Do not use ssl on transaction')
    parser.add_argument('--interactive', '-i', action='store_true', default=False,
                        help='Enable interactive login challenge solving')
    parser.add_argument('--retry-forever', action='store_true', default=False,
                        help='Retry download attempts endlessly when errors are received')
    parser.add_argument('--verbose', '-v', type=int, default=0, help='Logging verbosity level')
    parser.add_argument('--template', '-T', type=str, default='{urlname}', help='Customize filename template')
    parser.add_argument('--log_destination', '-l', type=str, default='',
                        help='destination folder for the instagram-scraper.log file')

    args = parser.parse_args()

    # Need to set destination folder
    if (args.login_user and args.login_pass is None) or (args.login_user is None and args.login_pass):
        parser.print_help()
        raise ValueError('Must provide login user AND password')

    if not args.username and args.filename is None and not args.followings_input:
        parser.print_help()
        raise ValueError('Must provide username(s) OR a file containing a list of username(s) OR pass --followings-input')
    elif (args.username and args.filename) or (args.username and args.followings_input) or (args.filename and args.followings_input):
        parser.print_help()
        raise ValueError('Must provide only one of the following: username(s) OR a filename containing username(s) OR --followings-input')

    if args.tag and args.location:
        parser.print_help()
        raise ValueError('Must provide only one of the following: hashtag OR location')

    if args.tag and args.filter:
        parser.print_help()
        raise ValueError('Filters apply to user posts')

    if (args.filter_location or args.filter_location_file) and not args.include_location:
        parser.print_help()
        raise ValueError('Location filter needs locations in metadata to filter properly')

    if args.filename:
        args.usernames = InstagramScraper.get_values_from_file(args.filename)
    else:
        args.usernames = InstagramScraper.parse_delimited_str(','.join(args.username))

    if args.filter_location_file:
        args.filter_locations = InstagramScraper.get_values_from_file(args.filter_location_file)
    elif args.filter_location:
        args.filter_locations = InstagramScraper.parse_delimited_str(','.join(args.filter_location))

    if args.media_types and len(args.media_types) == 1 and re.compile(r'[,;\s]+').findall(args.media_types[0]):
        args.media_types = InstagramScraper.parse_delimited_str(args.media_types[0])

    if args.retry_forever:
        global MAX_RETRIES
        MAX_RETRIES = sys.maxsize

    scraper = InstagramScraper(**vars(args))

    if args.login_user and args.login_pass:
        scraper.authenticate_with_login()
    else:
        scraper.authenticate_as_guest()

    if args.followings_input:
        scraper.usernames = list(scraper.query_followings_gen(scraper.login_user))
        if args.followings_output:
            with open(scraper.destination + scraper.followings_output, 'w') as file:
                for username in scraper.usernames:
                    file.write(username + "\n")
            # If not requesting anything else, exit
            if args.media_types == ['none'] and args.media_metadata is False:
                scraper.logout()
                return

    if args.tag:
        scraper.scrape_hashtag()
    elif args.location:
        scraper.scrape_location()
    elif args.search_location:
        scraper.search_locations()
    else:
        scraper.scrape()

    scraper.save_cookies()