Code Example #1
    def setUp(self):
        fixtures_path = os.path.join(os.path.dirname(__file__), 'fixtures')

        fixture_files = glob.glob(os.path.join(fixtures_path, '*'))

        for file_path in fixture_files:
            basename = os.path.splitext(os.path.basename(file_path))[0]
            self.__dict__[basename] = open(file_path).read()

        # This is a max id of the last item in response_first_page.json.
        self.max_id = "1369793132326237681_50955533"

        self.test_dir = tempfile.mkdtemp()

        args = {
            'usernames': ['test'],
            'destination': self.test_dir,
            'login_user': None,
            'login_pass': None,
            'quiet': True,
            'maximum': 0,
            'retain_username': False,
            'media_metadata': False,
            'media_types': ['image', 'video', 'story'],
            'latest': False
        }

        self.scraper = InstagramScraper(**args)
Code Example #2
def IG_train(logo_brand, maxImages, outDir):
    '''
        Scrapes up to maxImages images for the logo_brand hashtag and saves them
        to a directory named <logo_brand>.
        This only needs to scrape pictures.
    '''
    destinationFolder = './datasets/'  # append another argument here to separate the images into folders
    args = {
        'username': [logo_brand],
        'verbose': 0,
        'login_user': None,
        'usernames': [logo_brand],
        'quiet': True,
        'tag': True,
        'retain_username': True,
        'media_types': ['image'],
        'media_metadata': False,
        'login_only': False,
        'destination': outDir,
        'maximum': maxImages,
        'filename': None,
        'filter': None,
        'location': False,
        'login_pass': None,
        'latest': False,
        'logo_name': logo_brand
    }

    scraper = InstagramScraper(**args)
    scraper.scrape_hashtag()
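A hypothetical call to the IG_train helper above; the brand name, image count, and output directory are placeholders. Because 'tag': True and 'retain_username': True are set, the downloads land in a subfolder named after the hashtag.

# Hypothetical usage: scrape up to 50 image posts tagged #nike into ./datasets/nike
IG_train('nike', 50, './datasets/nike')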
Code Example #3
def IG_train(logo_brand, maxImages):
    '''
        Scrapes up to maxImages images for the logo_brand hashtag and saves them
        to a directory named <logo_brand>.
        This only needs to scrape pictures.
    '''
    destinationFolder = './'
    args = {
        'username': [logo_brand],
        'verbose': 0,
        'login_user': None,
        'usernames': [logo_brand],
        'quiet': False,
        'tag': True,
        'retain_username': True,
        'media_types': ['image'],
        'media_metadata': False,
        'login_only': False,
        'destination': destinationFolder,
        'maximum': maxImages,
        'filename': None,
        'filter': None,
        'location': False,
        'login_pass': None,
        'latest': False,
        'logo_name': logo_brand
    }

    scraper = InstagramScraper(**args)
    scraper.scrape_hashtag()
    print(str(maxImages) + " pictures from #" + logo_brand + " saved in " + logo_brand + " folder")
    print("Please ensure all pictures in " + logo_brand + " dir contain the " + logo_brand + " logo")
Code Example #4
class InstagramTests(unittest.TestCase):
    def setUp(self):
        fixtures_path = os.path.join(os.path.dirname(__file__), 'fixtures')
        self.response_first_page = open(
            os.path.join(fixtures_path, 'response_first_page.json')).read()
        self.response_second_page = open(
            os.path.join(fixtures_path, 'response_second_page.json')).read()

        self.test_dir = tempfile.mkdtemp()

        # This is a max id of the last item in response_first_page.json.
        self.max_id = "1369793132326237681_50955533"

        self.scraper = InstagramScraper("test", dst=self.test_dir, quiet=True)

    def tearDown(self):
        shutil.rmtree(self.test_dir)

    def test_scrape(self):
        with requests_mock.Mocker() as m:
            m.get(MEDIA_URL.format(self.scraper.username),
                  text=self.response_first_page)
            m.get(MEDIA_URL.format(self.scraper.username) + '?&max_id=' +
                  self.max_id,
                  text=self.response_second_page)
            m.get('https://fake-url.com/photo1.jpg', text="image1")
            m.get('https://fake-url.com/photo2.jpg', text="image2")
            m.get('https://fake-url.com/photo3.jpg', text="image3")
            self.scraper.scrape()

            # First page has photo1 and photo2, while second page has photo3. If photo3
            # is opened, generator successfully traversed both pages.
            self.assertEqual(
                open(os.path.join(self.test_dir, 'photo3.jpg')).read(),
                "image3")
Code Example #5
class InstagramTests(unittest.TestCase):
    def setUp(self):
        fixtures_path = os.path.join(os.path.dirname(__file__), 'fixtures')
        fixture_files = glob.glob(os.path.join(fixtures_path, '*'))
        for file_path in fixture_files:
            basename = os.path.splitext(os.path.basename(file_path))[0]
            self.__dict__[basename] = open(file_path).read()
        # This is a max id of the last item in response_first_page.json.
        self.max_id = "1369793132326237681_50955533"
        self.test_dir = tempfile.mkdtemp()
        args = {
            'usernames': ['test'],
            'destination': self.test_dir,
            'login_user': None,
            'login_pass': None,
            'quiet': True,
            'maximum': 0,
            'retain_username': False,
            'media_metadata': False,
            'media_types': ['image', 'video', 'story'],
            'latest': False
        }
        self.scraper = InstagramScraper(**args)

    def tearDown(self):
        shutil.rmtree(self.test_dir)


    def test_scrape(self):
        with requests_mock.Mocker() as m:
            m.get(BASE_URL + self.scraper.usernames[0], text=self.response_user_metadata)
            m.get(MEDIA_URL.format(self.scraper.usernames[0]), text=self.response_first_page)
            m.get(MEDIA_URL.format(self.scraper.usernames[0]) + '?max_id=' + self.max_id,
                  text=self.response_second_page)
            m.get('https://fake-url.com/photo1.jpg', text="image1")
            m.get('https://fake-url.com/photo2.jpg', text="image2")
            m.get('https://fake-url.com/photo3.jpg', text="image3")

            self.scraper.scrape()

            # First page has photo1 and photo2, while second page has photo3. If photo3
            # is opened, generator successfully traversed both pages.
            self.assertEqual(open(os.path.join(self.test_dir, 'photo3.jpg')).read(),
                             "image3")

    def test_scrape_hashtag(self):
        with requests_mock.Mocker() as m:
            m.get(QUERY_HASHTAG.format(self.scraper.usernames[0], ''), text=self.response_query_hashtag_first_page, status_code=200)
            m.get(QUERY_HASHTAG.format(self.scraper.usernames[0], 'J0'), text=self.response_query_hashtag_second_page, status_code=200)

            m.get('https://fake-url.com/photo4.jpg', text="image4")

            self.scraper.scrape_hashtag()

            self.assertEqual(open(os.path.join(self.test_dir, 'photo4.jpg')).read(), "image4")
Code Example #6
    def setUp(self):
        fixtures_path = os.path.join(os.path.dirname(__file__), 'fixtures')
        self.response_first_page = open(
            os.path.join(fixtures_path, 'response_first_page.json')).read()
        self.response_second_page = open(
            os.path.join(fixtures_path, 'response_second_page.json')).read()

        self.test_dir = tempfile.mkdtemp()

        # This is a max id of the last item in response_first_page.json.
        self.max_id = "1369793132326237681_50955533"

        self.scraper = InstagramScraper("test", dst=self.test_dir, quiet=True)
Code Example #7
def scrape_tag(tag, directory, num_images=50):
    """
    Scrape instagram and download hash tag photos
    :param tag: instagram hash tag
    :param directory: destination directory
    :param num_images: number of images
    """

    args = {
        'usernames': [tag],
        'destination': directory,
        'login_user': None,
        'login_pass': None,
        'quiet': True,
        'maximum': num_images,
        'retain_username': False,
        'media_metadata': False,
        'media_types': ['image'],
        'latest': False
    }

    scraper = InstagramScraper(**args)
    scraper.authenticate_as_guest()
    scraper.scrape_hashtag()
    scraper.save_cookies()
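A hypothetical call to scrape_tag as defined above; the tag, destination directory, and image count are placeholders.

# Hypothetical usage: download about 25 images tagged #sunset into ./downloads/sunset
scrape_tag('sunset', './downloads/sunset', num_images=25)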
Code Example #8
def IG_operate(logo_brand, hashtagList, maxImages):
    '''
        Scrapes maxImages images from each hashtag in hashtagList.
        Includes relevant metadata.
        Builds a dictionary with constants defined at the top of the file.
        DOES NOT save to the hard drive.
        Compresses and serializes.
        Calls Lucas's functions.
    '''
    config = NFS_Controller_Config(STORAGE_ACCOUNT_NAME, STORAGE_ACCOUNT_KEY)
    ic = InputController(config)
    print "max in here" + str(maxImages)
    #saves it to director
    ipe = InstagramPostEntities(isClassification=True)

    for tag in hashtagList:
        args = {
            #use a list here instead of a loop
            'username': [str(tag)],
            'verbose': 0,
            'login_user': None,
            'usernames': [str(tag)],
            'quiet': False,
            'tag': True,
            'retain_username': True,
            'include_location': True,
            'media_types': ['image'],
            'media_metadata': True,
            'search_location': False,
            'login_only': False,
            'destination': './',
            'maximum': int(maxImages),
            'comments': False,
            'filename': None,
            'filter': None,
            'location': False,
            'login_pass': None,
            'latest': False,
            'logo_name': logo_brand
        }
        print "before scraper"
        scraper = InstagramScraper(**args)
        print "after scraper"
        ipe.extend(scraper.logout())
        print "after extend"
    print("Operate complete")
    #print(ipe.serialize())
    # lsc.upload_brand_operational_input_data(logo_brand, ipe.serialize(), isProcessed = False)
    ic.upload_brand_operational_input_IPE(logo_brand, ipe, False)
Code Example #9
def IG_operate(logo_brand, hashtagList, maxImages):
    '''
        Scrapes maxImages images from each hashtag in hashtagList.
        Includes relevant metadata.
        Builds a dictionary with constants defined at the top of the file.
        DOES NOT save to the hard drive.
        Compresses and serializes.
        Calls Lucas's functions.
    '''
    # saves it to directory

    picList = []  # list of dictionaries of metadata + string version of list
    destinationFolder = './'

    for tag in hashtagList:
        args = {
            #use a list here instead of a loop
            'username': [tag],
            'verbose': 0,
            'login_user': None,
            'usernames': [tag],
            'quiet': False,
            'tag': True,
            'retain_username': True,
            'include_location': True,
            'media_types': ['image'],
            'media_metadata': True,
            'search_location': False,
            'login_only': False,
            'destination': destinationFolder,
            'maximum': maxImages,
            'comments': False,
            'filename': None,
            'filter': None,
            'location': False,
            'login_pass': None,
            'latest': False,
            'logo_name': logo_brand
        }
        scraper = InstagramScraper(**args)
        listScraperReturned = scraper.scrape_hashtag_operate()
        if listScraperReturned is not None:
            picList.extend(listScraperReturned)
        pickledPicList = data_transformer.toPickle(picList)
        #for testing, call this to unserialize
    data_transformer.unserialize_operate(pickledPicList)
    # call Lucas's function here
    print("Operate complete")
Code Example #10
File: __init__.py  Project: Skinner927/ig-review
def do_scrape(self):
    self.log.info('Scraping IG as ' + self.app.conf['SCRAPE_IG_USER'])
    scraper = InstagramScraper(
        login_user=self.app.conf['SCRAPE_IG_USER'],
        login_pass=self.app.conf['SCRAPE_IG_PASS'],
        usernames=self.app.conf['SCRAPE_IG_FRIENDS'],
        destination=self.app.conf['IMAGES_REVIEW_DIR'],
        retain_username=True,
        media_types=['image', 'story-image'],
        maximum=self.app.conf['SCRAPE_IG_MAX_PER_FRIEND'],
        latest_stamps=os.path.join(self.app.conf['STORAGE_ROOT'],
                                   'ig_user_stamps.ini'),
    )
    scraper.scrape()
    self.log.info('Done scraping IG without errors as ' +
                  self.app.conf['SCRAPE_IG_USER'])
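do_scrape pulls every setting from self.app.conf. A minimal sketch of the configuration it assumes; the key names come from the snippet, the values are placeholders only.

# Illustrative configuration only; real values depend on the deployment.
conf = {
    'SCRAPE_IG_USER': 'my_login',                   # account used to log in
    'SCRAPE_IG_PASS': 'my_password',                # its password
    'SCRAPE_IG_FRIENDS': ['friend_a', 'friend_b'],  # profiles to scrape
    'IMAGES_REVIEW_DIR': '/data/review',            # where downloads land
    'SCRAPE_IG_MAX_PER_FRIEND': 50,                 # per-profile item cap
    'STORAGE_ROOT': '/data',                        # holds ig_user_stamps.ini
}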
Code Example #11
    def setUp(self):
        fixtures_path = os.path.join(os.path.dirname(__file__), 'fixtures')
        self.response_user_metadata = open(
            os.path.join(fixtures_path, 'response_user_metadata.json')).read()
        self.response_first_page = open(
            os.path.join(fixtures_path, 'response_first_page.json')).read()
        self.response_second_page = open(
            os.path.join(fixtures_path, 'response_second_page.json')).read()

        self.response_explore_tags = open(
            os.path.join(fixtures_path, 'response_explore_tags.json')).read()

        self.response_query_hashtag_first_page = open(
            os.path.join(fixtures_path,
                         'response_query_hashtag_first_page.json')).read()

        self.response_query_hashtag_second_page = open(
            os.path.join(fixtures_path,
                         'response_query_hashtag_second_page.json')).read()

        self.response_view_media_video = open(
            os.path.join(fixtures_path,
                         'response_view_media_video.json')).read()

        # This is a max id of the last item in response_first_page.json.
        self.max_id = "1369793132326237681_50955533"

        self.test_dir = tempfile.mkdtemp()

        args = {
            'usernames': ['test'],
            'destination': self.test_dir,
            'login_user': None,
            'login_pass': None,
            'quiet': True,
            'maximum': 0,
            'retain_username': False,
            'media_metadata': False,
            'media_types': ['image', 'video', 'story'],
            'latest': False
        }

        self.scraper = InstagramScraper(**args)
Code Example #12
class InstagramTests(unittest.TestCase):

    def setUp(self):
        fixtures_path = os.path.join(os.path.dirname(__file__), 'fixtures')
        self.response_user_metadata = open(os.path.join(fixtures_path,
                                                        'response_user_metadata.json')).read()
        self.response_first_page = open(os.path.join(fixtures_path,
                                                     'response_first_page.json')).read()
        self.response_second_page = open(os.path.join(fixtures_path,
                                                      'response_second_page.json')).read()

        self.test_dir = tempfile.mkdtemp()

        # This is a max id of the last item in response_first_page.json.
        self.max_id = "1369793132326237681_50955533"

        self.scraper = InstagramScraper("test", dst=self.test_dir, quiet=True)

    def tearDown(self):
        shutil.rmtree(self.test_dir)

    def test_scrape(self):
        with requests_mock.Mocker() as m:
            m.get(BASE_URL + self.scraper.usernames[0], text=self.response_user_metadata)
            m.get(MEDIA_URL.format(self.scraper.usernames[0]), text=self.response_first_page)
            m.get(MEDIA_URL.format(self.scraper.usernames[0]) + '?max_id=' + self.max_id,
                  text=self.response_second_page)
            m.get('https://fake-url.com/photo1.jpg', text="image1")
            m.get('https://fake-url.com/photo2.jpg', text="image2")
            m.get('https://fake-url.com/photo3.jpg', text="image3")

            self.scraper.scrape()

            # First page has photo1 and photo2, while second page has photo3. If photo3
            # is opened, generator successfully traversed both pages.
            self.assertEqual(open(os.path.join(self.test_dir, 'photo3.jpg')).read(),
                             "image3")
Code Example #13
    def setUp(self):
        fixtures_path = os.path.join(os.path.dirname(__file__), 'fixtures')
        self.response_user_metadata = open(os.path.join(fixtures_path,
                                                        'response_user_metadata.json')).read()
        self.response_first_page = open(os.path.join(fixtures_path,
                                                     'response_first_page.json')).read()
        self.response_second_page = open(os.path.join(fixtures_path,
                                                      'response_second_page.json')).read()

        self.test_dir = tempfile.mkdtemp()

        # This is a max id of the last item in response_first_page.json.
        self.max_id = "1369793132326237681_50955533"

        self.scraper = InstagramScraper("test", dst=self.test_dir, quiet=True)
Code Example #14
File: views.py  Project: ieabbas/Margatsni
from Margatsni import application
from instagram_scraper import InstagramScraper
from flask import Flask, request, render_template, session, redirect, flash, send_file
from bs4 import BeautifulSoup
import os, requests, shutil, json, concurrent.futures, tqdm, re

LOGIN_URL = "https://www.instagram.com/accounts/login/ajax/"
logged_in = False
api = InstagramScraper(media_types=['image', 'story', 'video'], maximum=100)
'''------------------------------------------------------- page views ----------------------------------------------------'''


# main page
@application.route('/')
def index():
    return render_template('index.html')


# log-in page, will detect invalid logins
@application.route('/login', methods=['GET', 'POST'])
def login():
    session['logged_in'] = False
    if request.method == 'POST':
        session['login_user'] = request.form['username']
        session['login_pass'] = request.form['password']
        login_text, login = validateUser()

        if login_text.get('authenticated') and login.status_code == 200:
            api.login_user = session['login_user']
            api.login_pass = session['login_pass']
            api.login()
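validateUser is not shown in this excerpt. A minimal sketch of what it might do, assuming the usual flow against LOGIN_URL (fetch a CSRF token, post the credentials stored in the session, return the parsed JSON plus the raw response). The payload Instagram expects changes over time, so treat this purely as an illustration, not the project's actual helper.

def validateUser():
    # Hypothetical helper, not part of the original project.
    s = requests.Session()
    s.headers.update({'Referer': 'https://www.instagram.com/'})
    # Hitting the home page sets the csrftoken cookie needed for the login POST.
    s.get('https://www.instagram.com/')
    s.headers.update({'X-CSRFToken': s.cookies.get('csrftoken', '')})
    payload = {
        'username': session['login_user'],
        'password': session['login_pass'],
    }
    login = s.post(LOGIN_URL, data=payload, allow_redirects=True)
    return login.json(), login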
Code Example #15
def make_dat_for_json(datas):
    if type(datas['picture_list']) == list:
        datas['picture_list'] = ','.join(datas['picture_list'])
    if type(datas['hashtag']) == list:
        datas['hashtag'] = ''.join(datas['hashtag'])
    return datas


if __name__ == '__main__':
    with open('privacy.txt', 'r') as f:
        id = f.readline()
        password = f.readline()
        feed_post_url = f.readline()
        image_post_url = f.readline()
    insta_scrap = InstagramScraper()
    ctl = InstagramScraperController()
    insta_scrap.login(id, password)

    with open('hash_tag.txt', 'r', encoding='UTF8') as f:
        tag_list = f.readlines()
        tag_list = list(map(lambda s: s.strip(), tag_list))

    for tag in tag_list:
        search_tag = tag
        now_time = datetime.now()
        insta_scrap.search_by_tag(search_tag)
        error_cnt = 0

        cnt = 0
        # Collect feed information from the given hashtag.
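For reference, make_dat_for_json defined at the top of this example flattens list-valued fields into strings so the record can be written as flat JSON; the field values below are placeholders.

# Hypothetical input record; only picture_list and hashtag are transformed.
record = {'picture_list': ['a.jpg', 'b.jpg'], 'hashtag': ['#cafe', '#seoul']}
print(make_dat_for_json(record))
# -> {'picture_list': 'a.jpg,b.jpg', 'hashtag': '#cafe#seoul'}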
Code Example #16
def main():
    parser = argparse.ArgumentParser(
        description=
        "instagram-scraper scrapes and downloads an instagram user's photos and videos.",
        epilog=textwrap.dedent("""
        You can hide your credentials from the history, by reading your
        username from a local file:

        $ instagram-scraper @insta_args.txt user_to_scrape

        with insta_args.txt looking like this:
        -u=my_username
        -p=my_password

        You can add all arguments you want to that file, just remember to have
        one argument per line.

        Customize filename:
        by adding option --template or -T
        Default is: {urlname}
        And there are some options:
        {username}: Instagram user(s) to scrape.
        {shortcode}: post shortcode, but profile_pic and story are none.
        {urlname}: filename from the url.
        {mediatype}: type of media.
        {datetime}: date and time the photo/video was posted,
                     format is: 20180101 01h01m01s
        {date}: date the photo/video was posted,
                 format is: 20180101
        {year}: format is: 2018
        {month}: format is: 01-12
        {day}: format is: 01-31
        {h}: hour, format is: 00-23h
        {m}: minute, format is 00-59m
        {s}: second, format is 00-59s

        """),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        fromfile_prefix_chars='@')

    parser.add_argument('username',
                        help='Instagram user(s) to scrape',
                        nargs='*')
    parser.add_argument('-limit',
                        '-l',
                        help='Number of files to generate (default: %s)' %
                        DEFAULT_DOWNLOAD_LIMIT,
                        type=int)
    parser.add_argument('--destination',
                        '-d',
                        default='./',
                        help='Download destination')
    parser.add_argument('--login-user',
                        '--login_user',
                        '-u',
                        default=None,
                        help='Instagram login user')
    parser.add_argument('--login-pass',
                        '--login_pass',
                        '-p',
                        default=None,
                        help='Instagram login password')
    parser.add_argument(
        '--followings-input',
        '--followings_input',
        action='store_true',
        default=False,
        help='Compile list of profiles followed by login-user to use as input')
    parser.add_argument('--followings-output',
                        '--followings_output',
                        help='Output followings-input to file in destination')
    parser.add_argument(
        '--filename',
        '-f',
        help='Path to a file containing a list of users to scrape')
    parser.add_argument('--quiet',
                        '-q',
                        default=False,
                        action='store_true',
                        help='Be quiet while scraping')
    parser.add_argument('--maximum',
                        '-m',
                        type=int,
                        default=0,
                        help='Maximum number of items to scrape')
    parser.add_argument(
        '--retain-username',
        '--retain_username',
        '-n',
        action='store_true',
        default=False,
        help='Creates username subdirectory when destination flag is set')
    parser.add_argument('--media-metadata',
                        '--media_metadata',
                        action='store_true',
                        default=False,
                        help='Save media metadata to json file')
    parser.add_argument('--profile-metadata',
                        '--profile_metadata',
                        action='store_true',
                        default=False,
                        help='Save profile metadata to json file')
    parser.add_argument('--proxies',
                        default={},
                        help='Enable use of proxies, add a valid JSON with '
                             'http or/and https urls.')
    parser.add_argument(
        '--include-location',
        '--include_location',
        action='store_true',
        default=False,
        help='Include location data when saving media metadata')
    parser.add_argument('--media-types',
                        '--media_types',
                        '-t',
                        nargs='+',
                        default=['image', 'video', 'story'],
                        help='Specify media types to scrape')
    parser.add_argument('--latest',
                        action='store_true',
                        default=False,
                        help='Scrape new media since the last scrape')
    parser.add_argument(
        '--latest-stamps',
        '--latest_stamps',
        default=None,
        help='Scrape new media since timestamps by user in specified file')
    parser.add_argument(
        '--cookiejar',
        '--cookierjar',
        default=None,
        help=
        'File in which to store cookies so that they can be reused between runs.'
    )
    parser.add_argument('--tag',
                        action='store_true',
                        default=False,
                        help='Scrape media using a hashtag')
    parser.add_argument('--filter',
                        default=None,
                        help='Filter by tags in user posts',
                        nargs='*')
    parser.add_argument('--location',
                        action='store_true',
                        default=False,
                        help='Scrape media using a location-id')
    parser.add_argument('--search-location',
                        action='store_true',
                        default=False,
                        help='Search for locations by name')
    parser.add_argument('--comments',
                        action='store_true',
                        default=False,
                        help='Save post comments to json file')
    parser.add_argument('--no-check-certificate',
                        action='store_true',
                        default=False,
                        help='Do not use ssl on transaction')
    parser.add_argument('--interactive',
                        '-i',
                        action='store_true',
                        default=False,
                        help='Enable interactive login challenge solving')
    parser.add_argument(
        '--retry-forever',
        action='store_true',
        default=False,
        help='Retry download attempts endlessly when errors are received')
    parser.add_argument('--verbose',
                        '-v',
                        type=int,
                        default=0,
                        help='Logging verbosity level')
    parser.add_argument('--template',
                        '-T',
                        type=str,
                        default='{urlname}',
                        help='Customize filename template')

    args = parser.parse_args()

    if (args.login_user
            and args.login_pass is None) or (args.login_user is None
                                             and args.login_pass):
        parser.print_help()
        raise ValueError('Must provide login user AND password')

    if not args.username and args.filename is None and not args.followings_input:
        parser.print_help()
        raise ValueError(
            'Must provide username(s) OR a file containing a list of username(s) OR pass --followings-input'
        )
    elif (args.username
          and args.filename) or (args.username and args.followings_input) or (
              args.filename and args.followings_input):
        parser.print_help()
        raise ValueError(
            'Must provide only one of the following: username(s) OR a filename containing username(s) OR --followings-input'
        )

    if args.tag and args.location:
        parser.print_help()
        raise ValueError(
            'Must provide only one of the following: hashtag OR location')

    if args.tag and args.filter:
        parser.print_help()
        raise ValueError('Filters apply to user posts')

    if args.filename:
        args.usernames = InstagramScraper.parse_file_usernames(args.filename)
    else:
        args.usernames = InstagramScraper.parse_delimited_str(','.join(
            args.username))

    if args.media_types and len(
            args.media_types) == 1 and re.compile(r'[,;\s]+').findall(
                args.media_types[0]):
        args.media_types = InstagramScraper.parse_delimited_str(
            args.media_types[0])

    if args.retry_forever:
        global MAX_RETRIES
        MAX_RETRIES = sys.maxsize

    scraper = InstagramScraper(**vars(args))

    if args.login_user and args.login_pass:
        scraper.authenticate_with_login()
    else:
        scraper.authenticate_as_guest()

    if args.followings_input:
        scraper.usernames = list(
            scraper.query_followings_gen(scraper.login_user))
        if args.followings_output:
            with open(scraper.destination + scraper.followings_output,
                      'w') as file:
                for username in scraper.usernames:
                    file.write(username + "\n")
            # If not requesting anything else, exit
            if args.media_types == ['none'] and args.media_metadata is False:
                scraper.logout()
                return

    if args.tag:
        scraper.scrape_hashtag()
    elif args.location:
        scraper.scrape_location()
    elif args.search_location:
        scraper.search_locations()
    else:
        scraper.scrape()

    scraper.save_cookies()
    with open("imgurls.txt", "a", encoding="utf8") as f:
        f.write("{")
    number = args.limit
    if number is None or number == 0:
        number = 1000
    for username in args.usernames:
        org_path = username
        all_img = glob.glob(org_path + "/*.jpg")
        for org_img in all_img:
            outfile = org_img.replace(username + "\\", "")
            path = r'emoji'  # use your path
            all_emoji = glob.glob(path + "/*.png")
            all_count = number
            while all_count > 0:

                img = cv2.imread(org_img)
                img_cnt = random.randint(1, 6)
                height, width, channels = img.shape

                while img_cnt > 0:
                    id = randrange(33)
                    choose_emoji = all_emoji[id]
                    overlay_t = cv2.imread(choose_emoji, -1)
                    img = overlay_transparent(img, overlay_t,
                                              random.randint(0, width - 75),
                                              random.randint(0, height - 75),
                                              (75, 75))
                    img_cnt -= 1

                output = "result/" + str(outfile) + "-" + str(
                    all_count) + ".png"
                cv2.imwrite(output, img)
                all_count -= 1

            org_path = 'result'
            all_images = glob.glob(org_path + "/*.png")
            arr = np.array_split(all_images, 5)
            with open("imgurls.txt", "a", encoding="utf8") as f:
                f.write("{")
            with Pool(processes=5) as pool:
                pool.map(UploadingImage, arr)
            files = glob.glob('result/*')
            for f in files:
                os.remove(f)
            with open("imgurls.txt", 'rb+') as filehandle:
                filehandle.seek(-1, os.SEEK_END)
                filehandle.truncate()
            with open("imgurls.txt", "a", encoding="utf8") as f:
                f.write("}|")
            print("image done:" + org_img)

        with open("imgurls.txt", 'rb+') as filehandle:
            filehandle.seek(-1, os.SEEK_END)
            filehandle.truncate()
        with open("imgurls.txt", "a", encoding="utf8") as f:
            f.write("}")
Code Example #17
    def initInstagramScraper(self, user_name):
        args = {'username': user_name}
        self.scraper = InstagramScraper(**args)
Code Example #18
def main():
    parser = argparse.ArgumentParser(
        description=
        "instagram-scraper scrapes and downloads an instagram user's photos and videos.",
        epilog=textwrap.dedent("""
        You can hide your credentials from the history, by reading your
        username from a local file:

        $ instagram-scraper @insta_args.txt user_to_scrape

        with insta_args.txt looking like this:
        -u=my_username
        -p=my_password

        You can add all arguments you want to that file, just remember to have
        one argument per line.

        """),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        fromfile_prefix_chars='@')

    parser.add_argument('username',
                        help='Instagram user(s) to scrape',
                        nargs='*')
    parser.add_argument('--destination',
                        '-d',
                        default='./',
                        help='Download destination')
    parser.add_argument('--login-user',
                        '--login_user',
                        '-u',
                        default=None,
                        help='Instagram login user')
    parser.add_argument('--login-pass',
                        '--login_pass',
                        '-p',
                        default=None,
                        help='Instagram login password')
    parser.add_argument('--login-only',
                        '--login_only',
                        '-l',
                        default=False,
                        action='store_true',
                        help='Disable anonymous fallback if login fails')
    parser.add_argument(
        '--filename',
        '-f',
        help='Path to a file containing a list of users to scrape')
    parser.add_argument('--quiet',
                        '-q',
                        default=False,
                        action='store_true',
                        help='Be quiet while scraping')
    parser.add_argument('--maximum',
                        '-m',
                        type=int,
                        default=0,
                        help='Maximum number of items to scrape')
    parser.add_argument(
        '--retain-username',
        '--retain_username',
        '-n',
        action='store_true',
        default=False,
        help='Creates username subdirectory when destination flag is set')
    parser.add_argument('--media-metadata',
                        '--media_metadata',
                        action='store_true',
                        default=False,
                        help='Save media metadata to json file')
    parser.add_argument(
        '--include-location',
        '--include_location',
        action='store_true',
        default=False,
        help='Include location data when saving media metadata')
    parser.add_argument('--media-types',
                        '--media_types',
                        '-t',
                        nargs='+',
                        default=['image', 'video', 'story'],
                        help='Specify media types to scrape')
    parser.add_argument('--latest',
                        action='store_true',
                        default=False,
                        help='Scrape new media since the last scrape')
    parser.add_argument(
        '--latest-stamps',
        '--latest_stamps',
        default=None,
        help='Scrape new media since timestamps by user in specified file')
    parser.add_argument('--tag',
                        action='store_true',
                        default=False,
                        help='Scrape media using a hashtag')
    parser.add_argument('--filter',
                        default=None,
                        help='Filter by tags in user posts',
                        nargs='*')
    parser.add_argument('--location',
                        action='store_true',
                        default=False,
                        help='Scrape media using a location-id')
    parser.add_argument('--search-location',
                        action='store_true',
                        default=False,
                        help='Search for locations by name')
    parser.add_argument('--comments',
                        action='store_true',
                        default=False,
                        help='Save post comments to json file')
    parser.add_argument('--interactive',
                        '-i',
                        action='store_true',
                        default=False,
                        help='Enable interactive login challenge solving')
    parser.add_argument(
        '--retry-forever',
        action='store_true',
        default=False,
        help='Retry download attempts endlessly when errors are received')
    parser.add_argument('--verbose',
                        '-v',
                        type=int,
                        default=0,
                        help='Logging verbosity level')

    args = parser.parse_args()

    if (args.login_user
            and args.login_pass is None) or (args.login_user is None
                                             and args.login_pass):
        parser.print_help()
        raise ValueError('Must provide login user AND password')

    if not args.username and args.filename is None:
        parser.print_help()
        raise ValueError(
            'Must provide username(s) OR a file containing a list of username(s)'
        )
    elif args.username and args.filename:
        parser.print_help()
        raise ValueError(
            'Must provide only one of the following: username(s) OR a filename containing username(s)'
        )

    if args.tag and args.location:
        parser.print_help()
        raise ValueError(
            'Must provide only one of the following: hashtag OR location')

    if args.tag and args.filter:
        parser.print_help()
        raise ValueError('Filters apply to user posts')

    if args.filename:
        args.usernames = InstagramScraper.parse_file_usernames(args.filename)
    else:
        args.usernames = InstagramScraper.parse_delimited_str(','.join(
            args.username))

    if args.media_types and len(
            args.media_types) == 1 and re.compile(r'[,;\s]+').findall(
                args.media_types[0]):
        args.media_types = InstagramScraper.parse_delimited_str(
            args.media_types[0])

    if args.retry_forever:
        global MAX_RETRIES
        MAX_RETRIES = sys.maxsize

    scraper = InstagramScraper(**vars(args))

    # if args.tag:
    #     scraper.scrape_hashtag()
    # elif args.location:
    #     scraper.scrape_location()
    # elif args.search_location:
    #     scraper.search_locations()
    # else:
    #     scraper.scrape()
    get_simple_summary(scraper)
Code Example #19
class InstagramTests(unittest.TestCase):
    def setUp(self):
        fixtures_path = os.path.join(os.path.dirname(__file__), 'fixtures')
        self.response_user_metadata = open(
            os.path.join(fixtures_path, 'response_user_metadata.json')).read()
        self.response_first_page = open(
            os.path.join(fixtures_path, 'response_first_page.json')).read()
        self.response_second_page = open(
            os.path.join(fixtures_path, 'response_second_page.json')).read()

        self.response_explore_tags = open(
            os.path.join(fixtures_path, 'response_explore_tags.json')).read()

        self.response_query_hashtag_first_page = open(
            os.path.join(fixtures_path,
                         'response_query_hashtag_first_page.json')).read()

        self.response_query_hashtag_second_page = open(
            os.path.join(fixtures_path,
                         'response_query_hashtag_second_page.json')).read()

        self.response_view_media_video = open(
            os.path.join(fixtures_path,
                         'response_view_media_video.json')).read()

        # This is a max id of the last item in response_first_page.json.
        self.max_id = "1369793132326237681_50955533"

        self.test_dir = tempfile.mkdtemp()

        args = {
            'usernames': ['test'],
            'destination': self.test_dir,
            'login_user': None,
            'login_pass': None,
            'quiet': True,
            'maximum': 0,
            'retain_username': False,
            'media_metadata': False,
            'media_types': ['image', 'video', 'story'],
            'latest': False
        }

        self.scraper = InstagramScraper(**args)

    def tearDown(self):
        shutil.rmtree(self.test_dir)

    def test_scrape(self):
        with requests_mock.Mocker() as m:
            m.get(BASE_URL + self.scraper.usernames[0],
                  text=self.response_user_metadata)
            m.get(MEDIA_URL.format(self.scraper.usernames[0]),
                  text=self.response_first_page)
            m.get(MEDIA_URL.format(self.scraper.usernames[0]) + '?max_id=' +
                  self.max_id,
                  text=self.response_second_page)
            m.get('https://fake-url.com/photo1.jpg', text="image1")
            m.get('https://fake-url.com/photo2.jpg', text="image2")
            m.get('https://fake-url.com/photo3.jpg', text="image3")

            self.scraper.scrape()

            # First page has photo1 and photo2, while second page has photo3. If photo3
            # is opened, generator successfully traversed both pages.
            self.assertEqual(
                open(os.path.join(self.test_dir, 'photo3.jpg')).read(),
                "image3")

    def test_scrape_hashtag(self):
        with requests_mock.Mocker() as m:
            m.get(TAGS_URL.format('test'),
                  text=self.response_explore_tags,
                  cookies={'csrftoken': 'token'})
            m.post(QUERY_URL, [{
                'text': self.response_query_hashtag_first_page,
                'status_code': 200
            }, {
                'text': self.response_query_hashtag_second_page,
                'status_code': 200
            }])
            m.get(VIEW_MEDIA_URL.format('code4'),
                  text=self.response_view_media_video)
            m.get('https://fake-url.com/video.mp4', text="video")

            self.scraper.scrape_hashtag()

            self.assertEqual(
                open(os.path.join(self.test_dir, 'video.mp4')).read(), "video")
Code Example #20
args = {
    'usernames': [userName],
    'destination': userName,
    'login_user': login_user,
    'login_pass': login_pass,
    'quiet': True,
    'maximum': 0,
    'retain_username': False,
    'media_metadata': True,
    'media_types': ['image'],
    'latest': False,
    'profile_metadata': True
}

scraper = InstagramScraper(**args)
scraper.scrape()

with open(userName + "\\" + userName + ".json", encoding="utf8") as f:
    data = json.load(f)

document = Document()

document.add_heading('@' + userName, 0)
document.add_heading(data["GraphProfileInfo"]["info"]["biography"], 1)

cnt = 0
for i in range(len(data["GraphImages"])):
    if data["GraphImages"][i]["__typename"] == "GraphImage" or data[
            "GraphImages"][i]["__typename"] == "GraphSidecar":
        cnt += 1
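The excerpt stops after counting the GraphImage/GraphSidecar posts. A purely illustrative continuation would record the count in the Word document and save it; the output filename is a placeholder.

# Illustrative continuation, not part of the original snippet.
document.add_paragraph(str(cnt) + ' image posts downloaded for @' + userName)
document.save(userName + '.docx')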
Code Example #21
def scrape_photos(sourceUserFolder):
    parser = argparse.ArgumentParser(
        description=
        "instagram-scraper scrapes and downloads an instagram user's photos and videos.",
        epilog=textwrap.dedent("""
        You can hide your credentials from the history, by reading your
        username from a local file:
        $ instagram-scraper @insta_args.txt user_to_scrape
        with insta_args.txt looking like this:
        -u=my_username
        -p=my_password
        You can add all arguments you want to that file, just remember to have
        one argument per line.
        Customize filename:
        by adding option --template or -T
        Default is: {urlname}
        And there are some options:
        {username}: Instagram user(s) to scrape.
        {shortcode}: post shortcode, but profile_pic and story are none.
        {urlname}: filename from the url.
        {mediatype}: type of media.
        {datetime}: date and time the photo/video was posted,
                     format is: 20180101 01h01m01s
        {date}: date the photo/video was posted,
                 format is: 20180101
        {year}: format is: 2018
        {month}: format is: 01-12
        {day}: format is: 01-31
        {h}: hour, format is: 00-23h
        {m}: minute, format is 00-59m
        {s}: second, format is 00-59s
        """),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        fromfile_prefix_chars='@')

    parser.add_argument('username',
                        help='Instagram user(s) to scrape',
                        nargs='*')
    parser.add_argument('--destination',
                        '-d',
                        default=sourceUserFolder,
                        help='Download destination')
    parser.add_argument('--login-user',
                        '--login_user',
                        '-u',
                        default=None,
                        help='Instagram login user')
    parser.add_argument('--login-pass',
                        '--login_pass',
                        '-p',
                        default=None,
                        help='Instagram login password')
    parser.add_argument(
        '--followings-input',
        '--followings_input',
        action='store_true',
        default=False,
        help='Compile list of profiles followed by login-user to use as input')
    parser.add_argument('--followings-output',
                        '--followings_output',
                        help='Output followings-input to file in destination')
    parser.add_argument(
        '--filename',
        '-f',
        help='Path to a file containing a list of users to scrape')
    parser.add_argument('--quiet',
                        '-q',
                        default=False,
                        action='store_true',
                        help='Be quiet while scraping')
    parser.add_argument('--maximum',
                        '-m',
                        type=int,
                        default=0,
                        help='Maximum number of items to scrape')
    parser.add_argument(
        '--retain-username',
        '--retain_username',
        '-n',
        action='store_true',
        default=False,
        help='Creates username subdirectory when destination flag is set')
    parser.add_argument('--media-metadata',
                        '--media_metadata',
                        action='store_true',
                        default=False,
                        help='Save media metadata to json file')
    parser.add_argument('--profile-metadata',
                        '--profile_metadata',
                        action='store_true',
                        default=False,
                        help='Save profile metadata to json file')
    parser.add_argument(
        '--proxies',
        default={},
        help=
        'Enable use of proxies, add a valid JSON with http or/and https urls.')
    parser.add_argument(
        '--include-location',
        '--include_location',
        action='store_true',
        default=False,
        help='Include location data when saving media metadata')
    parser.add_argument('--media-types',
                        '--media_types',
                        '-t',
                        nargs='+',
                        default=['image', 'video', 'story'],
                        help='Specify media types to scrape')
    parser.add_argument('--latest',
                        action='store_true',
                        default=False,
                        help='Scrape new media since the last scrape')
    parser.add_argument(
        '--latest-stamps',
        '--latest_stamps',
        default=None,
        help='Scrape new media since timestamps by user in specified file')
    parser.add_argument(
        '--cookiejar',
        '--cookierjar',
        default=None,
        help=
        'File in which to store cookies so that they can be reused between runs.'
    )
    parser.add_argument('--tag',
                        action='store_true',
                        default=False,
                        help='Scrape media using a hashtag')
    parser.add_argument('--filter',
                        default=None,
                        help='Filter by tags in user posts',
                        nargs='*')
    parser.add_argument(
        '--filter_location',
        default=None,
        nargs="*",
        help=
        "filter query by only accepting media with location filter as the location id"
    )
    parser.add_argument(
        '--filter_location_file',
        default=None,
        type=str,
        help="file containing list of locations to filter query by")
    parser.add_argument('--location',
                        action='store_true',
                        default=False,
                        help='Scrape media using a location-id')
    parser.add_argument('--search-location',
                        action='store_true',
                        default=False,
                        help='Search for locations by name')
    parser.add_argument('--comments',
                        action='store_true',
                        default=False,
                        help='Save post comments to json file')
    parser.add_argument('--no-check-certificate',
                        action='store_true',
                        default=False,
                        help='Do not use ssl on transaction')
    parser.add_argument('--interactive',
                        '-i',
                        action='store_true',
                        default=False,
                        help='Enable interactive login challenge solving')
    parser.add_argument(
        '--retry-forever',
        action='store_true',
        default=False,
        help='Retry download attempts endlessly when errors are received')
    parser.add_argument('--verbose',
                        '-v',
                        type=int,
                        default=0,
                        help='Logging verbosity level')
    parser.add_argument('--template',
                        '-T',
                        type=str,
                        default='{urlname}',
                        help='Customize filename template')
    parser.add_argument(
        '--log_destination',
        '-l',
        type=str,
        default='',
        help='destination folder for the instagram-scraper.log file')

    args = parser.parse_args()

    # Need to set destination folder

    if (args.login_user
            and args.login_pass is None) or (args.login_user is None
                                             and args.login_pass):
        parser.print_help()
        raise ValueError('Must provide login user AND password')

    if not args.username and args.filename is None and not args.followings_input:
        parser.print_help()
        raise ValueError(
            'Must provide username(s) OR a file containing a list of username(s) OR pass --followings-input'
        )
    elif (args.username
          and args.filename) or (args.username and args.followings_input) or (
              args.filename and args.followings_input):
        parser.print_help()
        raise ValueError(
            'Must provide only one of the following: username(s) OR a filename containing username(s) OR --followings-input'
        )

    if args.tag and args.location:
        parser.print_help()
        raise ValueError(
            'Must provide only one of the following: hashtag OR location')

    if args.tag and args.filter:
        parser.print_help()
        raise ValueError('Filters apply to user posts')

    if (args.filter_location
            or args.filter_location_file) and not args.include_location:
        parser.print_help()
        raise ValueError(
            'Location filter needs locations in metadata to filter properly')

    if args.filename:
        args.usernames = InstagramScraper.get_values_from_file(args.filename)
    else:
        args.usernames = InstagramScraper.parse_delimited_str(','.join(
            args.username))

    if args.filter_location_file:
        args.filter_locations = InstagramScraper.get_values_from_file(
            args.filter_location_file)
    elif args.filter_location:
        args.filter_locations = InstagramScraper.parse_delimited_str(','.join(
            args.filter_location))

    if args.media_types and len(
            args.media_types) == 1 and re.compile(r'[,;\s]+').findall(
                args.media_types[0]):
        args.media_types = InstagramScraper.parse_delimited_str(
            args.media_types[0])

    if args.retry_forever:
        global MAX_RETRIES
        MAX_RETRIES = sys.maxsize

    scraper = InstagramScraper(**vars(args))

    if args.login_user and args.login_pass:
        scraper.authenticate_with_login()
    else:
        scraper.authenticate_as_guest()

    if args.followings_input:
        scraper.usernames = list(
            scraper.query_followings_gen(scraper.login_user))
        if args.followings_output:
            with open(scraper.destination + scraper.followings_output,
                      'w') as file:
                for username in scraper.usernames:
                    file.write(username + "\n")
            # If not requesting anything else, exit
            if args.media_types == ['none'] and args.media_metadata is False:
                scraper.logout()
                return

    if args.tag:
        scraper.scrape_hashtag()
    elif args.location:
        scraper.scrape_location()
    elif args.search_location:
        scraper.search_locations()
    else:
        scraper.scrape()

    scraper.save_cookies()
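Note that scrape_photos builds its own argparse parser, so the hashtag/username and other options come from sys.argv rather than from its parameter; sourceUserFolder only sets the default download destination. A hypothetical invocation with placeholder values:

import sys

# Hypothetical usage: scrape up to 20 image posts tagged #coffee into ./downloads
sys.argv = ['instagram-scraper', 'coffee', '--tag', '--media-types', 'image', '-m', '20']
scrape_photos('./downloads')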