Example #1
    def __init__(self,
                 driver_path,
                 cookies: dict,
                 db: Database = None,
                 path: Union[PathGenerator, str] = None,
                 proxies: dict = None,
                 timeout: int = 15,
                 no_window: bool = False,
                 logger=None):

        options = {
            'arguments': ['--headless', '--window-size=1920,1080']
        } if no_window else {
            'arguments': ['--start-maximized']
        }

        # https://chromedriver.chromium.org/downloads
        self.session = requestium.Session(webdriver_path=driver_path,
                                          browser='chrome',
                                          default_timeout=timeout,
                                          webdriver_options=options)

        for key, value in cookies.items():
            self.session.driver.ensure_add_cookie({
                'name': key,
                'value': value,
                'domain': '.instagram.com'
            })
        self.session.transfer_driver_cookies_to_session()
        self.session.proxies = proxies
        self.session.headers = {
            'user-agent':
            self.session.driver.execute_script("return navigator.userAgent;")
        }
        self.session.default_timeout = timeout

        self.db = MongoDB('instagram',
                          primary_key='link') if db is None else db

        if path is None:
            self.path = StoreByUserName('./download')
        elif isinstance(path, str):
            self.path = StoreByUserName(path)
        else:
            self.path = path

        self.pattern = {
            'content':
            re.compile(r'("display_url"|"display_src"|"video_url"):"(.+?)"'),
            'owner':
            re.compile(r'"owner":({.+?})'),
            'username':
            re.compile(r'"username":"******"')
        }

        self.logger = Log.create_logger(
            'InstagramSpider', './instagram.log') if logger is None else logger

        atexit.register(self.quit)
Example #2
    def __init__(self, db: Database = None,
                 path: PathGenerator = None,
                 session: Session = None,
                 auth: Auth = None):
        self.db = MongoDB('weibo', primary_key='id') if db is None else db
        self.path = StoreByUserName('./download') if path is None else path
        self.session = Session(timeout=10, retry=5) \
            if session is None else session

        if auth is None:
            auth = Auth()
        self.token = auth.token.token
        self.client = Client()
Example #3
class WeiboSpider:

    def __init__(self, db: Database = None,
                 path: PathGenerator = None,
                 session: Session = None,
                 auth: Auth = None):
        self.db = MongoDB('weibo', primary_key='id') if db is None else db
        self.path = StoreByUserName('./download') if path is None else path
        self.session = Session(timeout=10, retry=5) \
            if session is None else session

        if auth is None:
            auth = Auth()
        self.token = auth.token.token
        self.client = Client()

    def list(self, page=1):
        items = []
        running = True
        while running:
            data = self.client.favorites.get(access_token=self.token, page=page)
            if len(data.favorites) <= 0:
                break
            for item in data.favorites:
                if item.status.id not in self.db:
                    items.append(item.status)
                else:
                    running = False
                    break
            page += 1
        items.reverse()
        return items

    def download(self, status):
        if 'deleted' not in status:
            user = status.user.name
            for item in status.pic_urls:
                url = item.thumbnail_pic.replace('thumbnail', 'large')
                path = self.path.generate(user_name=user,
                                          media_type=MediaType.image)
                r = self.session.get(url)
                with open(path, 'wb') as f:
                    f.write(r.content)
        self.db.add(status.id)
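
A minimal usage sketch for the class above, assuming the default MongoDB connection and download path (this driver code is not part of the original example):

if __name__ == '__main__':
    # Minimal usage sketch, assuming the default MongoDB connection
    # and download path; not part of the original example.
    spider = WeiboSpider()
    # Download every newly favorited status, oldest first
    for status in spider.list():
        spider.download(status)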
Example #4
    def start_crawl(self, extractor: Extractor, redis: RedisSet,
                    mongo: MongoDB):
        while not redis.empty():
            movie_id = redis.pop()
            self.logger.info('Movie ID: {}'.format(movie_id))
            try:
                info = self._crawl(movie_id, extractor)
                if info is not None:
                    if mongo.count({'id': movie_id}) <= 0:
                        mongo.insert(info)
                    else:
                        self.logger.info(
                            'Duplicate record {}'.format(movie_id))
                else:
                    self.logger.warning('Useless record {}'.format(movie_id))
            except NetworkException as e:
                self.logger.error(e)
                redis.add(movie_id)
            time.sleep(10)
Example #5
from instagramspider import CookieReader, InstagramSpider
from spiderutil.connector import MongoDB
from spiderutil.log import Log

if __name__ == '__main__':
    # 1. Get the cookie from instagram.com after you log in
    cookies = CookieReader.from_local_file('./cookie.txt')
    # 2. Use MongoDB to save links
    db = MongoDB('instagram', primary_key='link')
    db.check_connection()
    # 3. Declare the spider; you need to specify:
    # the location of chromedriver, the cookies, the proxies (if necessary),
    # the database, and the path generator
    spider = InstagramSpider(driver_path='./chromedriver.exe',
                             cookies=cookies,
                             proxies={'https': 'http://127.0.0.1:1080'},
                             db=db)
    # Use a logger to log messages
    logger = Log.create_logger(name='InstagramSpider', path='./instagram.log')
    # 4. Get links from the saved list; it stops when it meets a duplicate link in the database
    links = spider.get_saved_list('<Your Username>')
    logger.info('Total: {}'.format(len(links)))
    # 5. Download the links
    for link in links:
        logger.info(link)
        count = spider.download(link)
        logger.info(count)
Example #6
class InstagramSpider:
    def __init__(self,
                 driver_path,
                 cookies: dict,
                 db: Database = None,
                 path: Union[PathGenerator, str] = None,
                 proxies: dict = None,
                 timeout: int = 15,
                 no_window: bool = False,
                 logger=None):

        options = {
            'arguments': ['--headless', '--window-size=1920,1080']
        } if no_window else {
            'arguments': ['--start-maximized']
        }

        # https://chromedriver.chromium.org/downloads
        self.session = requestium.Session(webdriver_path=driver_path,
                                          browser='chrome',
                                          default_timeout=timeout,
                                          webdriver_options=options)

        for key, value in cookies.items():
            self.session.driver.ensure_add_cookie({
                'name': key,
                'value': value,
                'domain': '.instagram.com'
            })
        self.session.transfer_driver_cookies_to_session()
        self.session.proxies = proxies
        self.session.headers = {
            'user-agent':
            self.session.driver.execute_script("return navigator.userAgent;")
        }
        self.session.default_timeout = timeout

        self.db = MongoDB('instagram',
                          primary_key='link') if db is None else db

        if path is None:
            self.path = StoreByUserName('./download')
        elif isinstance(path, str):
            self.path = StoreByUserName(path)
        else:
            self.path = path

        self.pattern = {
            'content':
            re.compile(r'("display_url"|"display_src"|"video_url"):"(.+?)"'),
            'owner':
            re.compile(r'"owner":({.+?})'),
            'username':
            re.compile(r'"username":"******"')
        }

        self.logger = Log.create_logger(
            'InstagramSpider', './instagram.log') if logger is None else logger

        atexit.register(self.quit)

    @property
    def driver(self):
        return self.session.driver

    def get_saved_list(self, user):
        url = 'https://www.instagram.com/{}/saved/'.format(user)
        links = []
        end = False
        self.session.driver.get(url)

        def extract_list():
            self.session.driver.ensure_element_by_xpath(
                '//*[@id="react-root"]/section/main/div/div[3]/article/div[1]/div/div[last()]'
            )
            article = self.session.driver.find_element_by_tag_name('article')
            photos = article.find_elements_by_tag_name('a')
            for photo in photos:
                link = photo.get_property('href')
                if link in self.db:
                    nonlocal end
                    end = True
                    break
                if link not in links:
                    links.append(link)

        extract_list()
        while not end:
            self.session.driver.execute_script(
                'window.scrollTo(0, document.body.scrollHeight);')
            extract_list()

        links.reverse()
        return links

    def download(self, link: str):
        while True:
            try:
                r = self.session.get(link)
                break
            except Exception as e:
                self.logger.error(e)
        soup = Soup(r.text, 'lxml')
        try:
            page = soup.find('body').text
            res = self.pattern['content'].findall(page)
            contents = []
            for item in res:
                img_link = item[1].replace('\\u0026', '&').replace('\\', '')
                if img_link not in contents:
                    contents.append(img_link)
            owner_str = self.pattern['owner'].findall(page)[-1]
            username = self.pattern['username'].findall(owner_str)[-1]
            for content in contents:
                while True:
                    try:
                        r = self.session.get(content)
                        break
                    except Exception as e:
                        self.logger.error(e)
                media_type = MediaType.video if 'video' in r.headers[
                    'Content-Type'] else MediaType.image
                with open(
                        self.path.generate(user_name=username,
                                           media_type=media_type), 'wb') as f:
                    f.write(r.content)
            self.db.insert({'link': link})
            return len(contents)
        except IndexError as e:
            self.logger.error(e)

    def quit(self):
        self.session.driver.quit()

    def __del__(self):
        self.quit()
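
For reference, a small illustration of what the 'content' pattern used in `download` above captures; the sample string below is made up for demonstration and is not from Instagram:

import re

# Illustration of the 'content' pattern from InstagramSpider.__init__;
# the sample string is made up for demonstration.
pattern = re.compile(r'("display_url"|"display_src"|"video_url"):"(.+?)"')
sample = '"video_url":"https://cdn.example.com/v.mp4","width":640'
print(pattern.findall(sample))
# [('"video_url"', 'https://cdn.example.com/v.mp4')]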
Example #7
from time import sleep

from juhe.spider import JuheSpider
from spiderutil.connector import MongoDB


if __name__ == '__main__':
    spider = JuheSpider()

    mongo = MongoDB('juhe-api')
    for link, data in spider.crawl_api():
        print(data['标题'], link)  # '标题' means 'title'
        mongo.insert(data)
        sleep(5)

    mongo = MongoDB('juhe-data')
    for link, data in spider.crawl_data():
        print(data['标题'], link)
        mongo.insert(data)
        sleep(5)
Example #8
    # Init a spider to get tweet objects
    spider = TwitterSpider(token, proxies=proxies)

    # Init a downloader to download tweet images and videos
    downloader = TwitterDownloader(StoreByUserName('./download'),
                                   proxies=proxies)

    # Init a logger if you want to print logs in the main function
    # (loggers are automatically enabled in the spider and downloader;
    # you can replace them with a customized one)
    logger = Log.create_logger('TwitterSpider', './twitter.log')

    # Init the MongoDB collection to persist data,
    # check the connection and drop data from the former session
    mongo = MongoDB('Twitter')
    mongo.check_connection()
    mongo.drop()

    # Save failed tweets into another collection
    failed = MongoDB('Twitter-Failed')

    # Use MongoDB to save the checkpoint
    since_id = None
    checkpoint = MongoDB('Checkpoint')
    item = checkpoint.find({'name': 'Twitter'})
    if item is None:
        # If there is no checkpoint, you need to add one manually
        checkpoint.insert({'name': 'Twitter', 'id': 0})
    else:
        since_id = item['id']
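
For reference, the checkpoint lookup above can be wrapped into a small standalone helper; this is only a sketch that reuses the `find`/`insert` calls shown in the snippet, and the helper name is made up:

def load_since_id(checkpoint):
    # Hypothetical helper wrapping the checkpoint lookup shown above;
    # it relies only on the find/insert calls used in the snippet.
    item = checkpoint.find({'name': 'Twitter'})
    if item is None:
        # No checkpoint yet: create the initial record and start from scratch
        checkpoint.insert({'name': 'Twitter', 'id': 0})
        return None
    return item['id']

# e.g. since_id = load_since_id(MongoDB('Checkpoint'))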
Example #9
from spiderutil.connector import RedisSet, MongoDB

from cbooo.spider import CboooSpider
from cbooo.extractor import Extractor

if __name__ == '__main__':
    spider = CboooSpider()
    ext = Extractor()
    redis = RedisSet('cbooo')
    redis.check_connection()
    mongo = MongoDB('cbooo')
    mongo.check_connection()

    spider.get_id(redis)
    spider.start_crawl(ext, redis, mongo)
Example #10
    proxies = {
        'http': 'http://127.0.0.1:1080',
        'https': 'http://127.0.0.1:1080'
    }

    # Init a spider to get tweet objects
    spider = TwitterSpider(token, proxies=proxies)

    # Init a downloader to download tweet images and videos
    downloader = TwitterDownloader(StoreByUserName('./download'),
                                   proxies=proxies)

    # Init a logger if you want to print logs in the main function
    logger = Log.create_logger('TwitterSpider', './twitter.log')

    # Init the MongoDB collection to persist data,
    mongo = MongoDB('Twitter')
    # Check the connection and drop data from the former session
    mongo.check_connection()
    mongo.drop()

    # Save failed tweets into another collection
    failed = MongoDB('Twitter-Failed')

    # Use a local file to save the checkpoint
    checkpoint = Checkpoint.load_file('./checkpoint.txt')
    since_id = checkpoint.tweet_id

    # Crawl the timeline and save to MongoDB;
    # `screen_name` is the user's handle on Twitter
    for tweet in spider.crawl_timeline(screen_name='twitter', since_id=since_id):
        # If you don't have MongoDB, you can use `downloader.download` to download it directly
Example #11
from time import sleep

from aliyun.spider import AliyunSpider
from spiderutil.connector import MongoDB

if __name__ == '__main__':
    spider = AliyunSpider()
    mongo = MongoDB('aliyun')
    for data in spider.crawl_list():
        mongo.insert(data)
        sleep(5)
Example #12
class InstagramSpider:
    def __init__(self,
                 driver_path,
                 cookies: dict,
                 db: Database = None,
                 path: Union[PathGenerator, str] = None,
                 proxies: dict = None,
                 timeout: int = 15,
                 no_window: bool = False,
                 logger=None):

        options = {
            'arguments': ['--headless', '--window-size=1920,1080']
        } if no_window else {
            'arguments': ['--start-maximized']
        }

        # https://chromedriver.chromium.org/downloads
        self.session = requestium.Session(webdriver_path=driver_path,
                                          browser='chrome',
                                          default_timeout=timeout,
                                          webdriver_options=options)

        for key, value in cookies.items():
            self.session.driver.ensure_add_cookie({
                'name': key,
                'value': value,
                'domain': '.instagram.com'
            })
        self.session.transfer_driver_cookies_to_session()
        self.session.proxies = proxies
        self.session.headers = {
            'user-agent':
            self.session.driver.execute_script("return navigator.userAgent;")
        }
        self.session.default_timeout = timeout

        self.db = MongoDB('instagram',
                          primary_key='link') if db is None else db

        if path is None:
            self.path = StoreByUserName('./download')
        elif isinstance(path, str):
            self.path = StoreByUserName(path)
        else:
            self.path = path

        self.pattern = {
            'prefix':
            re.compile(
                r'<script type="text/javascript">window.__additionalDataLoaded\('
            ),
            'start':
            re.compile(r'{"items":'),
            'suffix':
            re.compile(r'\);</script><script type="text/javascript">')
        }

        self.logger = Log.create_logger(
            'InstagramSpider', './instagram.log') if logger is None else logger

        atexit.register(self.quit)

    @property
    def driver(self):
        return self.session.driver

    def get_saved_list(self, user):
        url = 'https://www.instagram.com/{}/saved/'.format(user)
        links = []
        end = False
        self.session.driver.get(url)

        def extract_list():
            self.session.driver.ensure_element_by_xpath(
                '//*[@id="react-root"]/section/main/div/div[3]/article/div[1]/div/div[last()]'
            )
            article = self.session.driver.find_element_by_tag_name('article')
            photos = article.find_elements_by_tag_name('a')
            for photo in photos:
                link = photo.get_property('href')
                if link in self.db:
                    nonlocal end
                    end = True
                    break
                if link not in links:
                    links.append(link)

        extract_list()
        while not end:
            self.session.driver.execute_script(
                'window.scrollTo(0, document.body.scrollHeight);')
            extract_list()

        links.reverse()
        return links

    def download(self, link: str):
        while True:
            try:
                r = self.session.get(link)
                break
            except Exception as e:
                self.logger.error(e)
        soup = Soup(r.text, 'lxml')
        try:
            page = str(soup.find('body'))

            # Extract json content from the page
            prefix_match = self.pattern['prefix'].search(page)
            stripped_str = page[prefix_match.end():]
            start_match = self.pattern['start'].search(stripped_str)
            suffix_match = self.pattern['suffix'].search(stripped_str)
            content = stripped_str[start_match.start():suffix_match.start()]

            total_count = 0

            for data in json.loads(content)['items']:

                # Get username
                contents = []
                user_name = data['user']['username']
                media_type = data['media_type']

                if media_type == 8:
                    if 'carousel_media' in data:
                        count = 0
                        for item in data['carousel_media']:
                            count += 1
                            if 'video_versions' in item:
                                # Here we assume that the first one is the original one
                                contents.append(
                                    (item['video_versions'][0]['url'],
                                     MediaType.video))
                            elif 'image_versions2' in item:
                                contents.append(
                                    (item['image_versions2']['candidates'][0]
                                     ['url'], MediaType.image))
                            else:
                                raise ValueError(
                                    'No available content found in carousel media.'
                                )
                        if 'carousel_media_count' not in data or data[
                                'carousel_media_count'] != count:
                            raise ValueError(
                                'The count of media is not equal, expected: {}, actual: {}'
                                .format(data.get('carousel_media_count'), count))
                    else:
                        raise ValueError(
                            'No available content found in carousel media.')
                elif media_type == 2:
                    if 'video_versions' in data:
                        contents.append((data['video_versions'][0]['url'],
                                         MediaType.video))
                    else:
                        raise ValueError(
                            'No available video found in media type 2.')
                elif media_type == 1:
                    if 'image_versions2' in data:
                        contents.append(
                            (data['image_versions2']['candidates'][0]['url'],
                             MediaType.image))
                    else:
                        raise ValueError(
                            'No available image found in media type 1.')
                else:
                    raise ValueError(
                        'Unknown media type: {}'.format(media_type))

                for content in contents:
                    self._download_content(content[0], user_name, content[1])

                total_count += len(contents)

            self.db.insert({'link': link})
            return total_count
        except IndexError as e:
            self.logger.error(e)

    def _download_content(self, url, user_name, media_type):
        while True:
            try:
                r = self.session.get(url)
                break
            except Exception as e:
                self.logger.error(e)
        with open(
                self.path.generate(user_name=user_name, media_type=media_type),
                'wb') as f:
            f.write(r.content)

    def quit(self):
        self.session.driver.quit()

    def __del__(self):
        self.quit()
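
For reference, a small self-contained illustration of the prefix/start/suffix slicing used in `download` above; the page string below is made up for demonstration and is not a real Instagram response:

import json
import re

# Illustration of the JSON-extraction slicing in InstagramSpider.download;
# the page string below is made up for demonstration.
pattern = {
    'prefix': re.compile(
        r'<script type="text/javascript">window.__additionalDataLoaded\('),
    'start': re.compile(r'{"items":'),
    'suffix': re.compile(r'\);</script><script type="text/javascript">'),
}
page = ('<script type="text/javascript">window.__additionalDataLoaded('
        '"/p/xyz/",{"items": [{"media_type": 1}]}'
        ');</script><script type="text/javascript">!function(){}()</script>')
# Cut off everything up to and including the opening call,
# then slice between the start of the JSON and the closing script tag.
stripped = page[pattern['prefix'].search(page).end():]
content = stripped[pattern['start'].search(stripped).start():
                   pattern['suffix'].search(stripped).start()]
print(json.loads(content)['items'])   # [{'media_type': 1}]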
Example #13
from time import sleep

from gbdex.spider import GBDEXSpider

from spiderutil.connector import MongoDB

if __name__ == '__main__':
    spider = GBDEXSpider()

    mongo = MongoDB('gbdex-api')
    for api in spider.crawl_api():
        mongo.insert(api)
        sleep(2)

    mongo = MongoDB('gbdex-data')
    for item in spider.crawl_data_file():
        mongo.insert(item)
        sleep(2)