Example #1
    def worker(payload):
        http, item, url = payload

        if url is None:
            return FetchWorkerResult(url=None,
                                     item=item,
                                     response=None,
                                     error=None,
                                     meta=None)

        kwargs = request_args(url, item) if request_args is not None else {}

        error, response = request(http, url, **kwargs)

        if error:
            return FetchWorkerResult(url=url,
                                     item=item,
                                     response=response,
                                     error=error,
                                     meta=None)

        # Forcing urllib3 to read data in thread
        data = response.data

        # Meta
        meta = extract_response_meta(response,
                                     guess_encoding=guess_encoding,
                                     guess_extension=guess_extension)

        return FetchWorkerResult(url=url,
                                 item=item,
                                 response=response,
                                 error=error,
                                 meta=meta)
Example #2
    def request_page(target):
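        # `request` returns an (error, result) tuple instead of raising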
        error, result = request(http, target, cookie=cookie)

        if error is not None:
            raise error

        return result.data.decode('utf-8')
Example #3
def step(http, url, item_key):
    err, result = request(http, url)

    # Debug
    if err:
        raise err

    # Bad auth
    if result.status == 401:
        raise CrowdTangleInvalidTokenError

    # Bad params
    if result.status >= 400:
        raise CrowdTangleInvalidRequestError

    try:
        data = json.loads(result.data)['result']
    except (ValueError, KeyError):
        raise CrowdTangleInvalidJSONError

    if item_key not in data or len(data[item_key]) == 0:
        raise CrowdTangleExhaustedPagination

    # Extracting next link
    pagination = data['pagination']
    next_page = pagination['nextPage'] if 'nextPage' in pagination else None

    return data[item_key], next_page
Example #4
    def request_page(self, url):
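        # Forward the stored cookie and a curl-like User-Agent with the request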
        error, result = request(self.http,
                                url,
                                cookie=self.cookie,
                                headers={'User-Agent': 'curl/7.68.0'})

        if error is not None:
            raise error

        return result.data.decode('utf-8')
Example #5
def make_request(http, url):
    err, response = request(http, forge_url(url), headers={'Accept-Language': 'en'})

    # A transport-level error means there is no usable response to inspect
    if err:
        return 'http-error', None

    if response.status == 404:
        return 'not-found', None

    return None, response.data
Example #6
    def work(self, job):
        self.state.jobs_queued = self.queue.qsize()

        spider = self.spiders.get(job.spider)

        if spider is None:
            raise UnknownSpiderError('Unknown spider "%s"' % job.spider)

        err, response = request(self.http, job.url)

        if err:
            return CrawlWorkerResult(job=job,
                                     scraped=None,
                                     error=err,
                                     response=response,
                                     meta=None,
                                     content=None,
                                     next_jobs=None)

        meta = spider.extract_meta_from_response(job, response)

        # Decoding response content
        content = spider.process_content(job, response, meta)

        if isinstance(spider, FunctionSpider):
            next_jobs, scraped = spider.process(job, response, content, meta)
        else:

            # Scraping items
            scraped = spider.scrape(job, response, content, meta)

            # Finding next jobs
            next_jobs = spider.next_jobs(job, response, content, meta)

        # Enqueuing next jobs
        if next_jobs is not None:

            # Consuming so that multiple agents may act on this
            next_jobs = list(next_jobs)
            self.enqueue(next_jobs)

        self.state.jobs_done += 1

        return CrawlWorkerResult(job=job,
                                 scraped=scraped,
                                 error=None,
                                 response=response,
                                 meta=meta,
                                 content=content,
                                 next_jobs=next_jobs)
Example #7
def step(http, url, item_key):
    err, result = request(http, url)

    # Debug
    if err:
        raise err

    # Bad auth
    if result.status == 401:
        raise CrowdTangleInvalidTokenError

    elif result.status == 429:
        raise CrowdTangleRateLimitExceeded

    # Bad params
    if result.status >= 400:
        data = json.loads(result.data.decode('utf-8'))
        raise CrowdTangleInvalidRequestError(data['message'],
                                             code=data['code'],
                                             status=result.status)

    try:
        data = json.loads(result.data)['result']
    except (ValueError, KeyError):
        raise CrowdTangleInvalidJSONError

    items = None

    if item_key in data:
        items = data[item_key]

        if len(items) == 0:
            items = None

    # Extracting next link
    pagination = data['pagination']
    next_page = pagination['nextPage'] if 'nextPage' in pagination else None

    return items, next_page
Example #8
def captions_action(namespace, output_file):

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=REPORT_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()

    for line, video_id in enricher.cells(namespace.column, with_rows=True):
        url_caption = ''
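        # First, try to pull caption tracks from the video's info endpoint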
        url_inf = INFO_URL_TEMPLATE % {'id': video_id}
        err_info, info_vid = request(http, url_inf)

        if err_info is not None:
            print_err(err_info)
            continue

        info_vid_dec = unquote(str(info_vid.data))
        caption_tracks = re.findall(get_info, info_vid_dec)
        if caption_tracks:
            dict_captions = json.loads(caption_tracks[0][0] + '}')['captionTracks']

            # Prefer the track matching the requested language, else the first one
            for track in dict_captions:
                if namespace.lang and namespace.lang == track['languageCode']:
                    url_caption = track['baseUrl']
                    break
            if not url_caption and dict_captions:
                url_caption = dict_captions[0]['baseUrl']

        else:
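            # Otherwise, fall back to timedtext links scraped from the video page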
            url_vid = VIDEO_CALL_TEMPLATE % {'id': video_id}
            time.sleep(0.01)
            err, result = request(http, url_vid)

            if err is not None:
                print_err(err)
                continue

            timedtext = re.findall(timed, str(result.data))
            for x in timedtext:
                proper_timed = x.replace("\\\\u0026", "&")
                if proper_timed[-2:] == namespace.lang:
                    url_caption = API_BASE_URL % {'temp': proper_timed}
                    break
            if not url_caption and timedtext and not namespace.lang:
                url_caption = API_BASE_URL % {'temp': timedtext[1].replace("\\\\u0026", "&")}
        if not url_caption:
            print_err('no subtitles for {}'.format(video_id))
            continue

        time.sleep(0.01)
        err, result_caption = request(http, url_caption)

        if err is not None:
            print_err(err)
        elif result_caption.status >= 400:
            print_err(f'error, status : {result_caption.status} for id : {video_id}')
            enricher.writerow(line)
        else:
            soup = BeautifulSoup(result_caption.data, 'lxml')

            caption_text = " ".join(item.get_text() for item in soup.find_all('text'))

            enricher.writerow(line, [caption_text])

        loading_bar.update()
Example #9
    def fetch_facebook_page_stats(url):
        err, response = request(http, url, cookie='locale=en_US')

        if err:
            return 'http-error', None

        if response.status == 404:
            return 'not-found', None

        if response.status >= 400:
            return 'http-error', None

        html = response.data

        if CAPTCHA in html:
            die(['Rate limit reached!', 'Last url: %s' % url])

        if (CURRENT_AVAILABILITY_DISCLAIMER in html
                or AVAILABILITY_DISCLAIMER in html):
            return 'unavailable', None

        if LOGIN_DISCLAIMER in html:
            return 'private-or-unavailable', None

        # TODO: integrate into ural
        bpost_id = url.rsplit('/', 1)[-1].encode()

        # Extracting metadata
        meta_extractor = re.compile(META_EXTRACTOR_TEMPLATE % bpost_id)

        match = meta_extractor.search(html)

        if match is None:
            return 'extraction-failed', None

        data = json5.loads(match.group(1).decode())
        data = nested_get([
            'jsmods', 'pre_display_requires', 0, 3, 1, '__bbox', 'result',
            'data', 'feedback'
        ], data)

        if data is None:
            return 'extraction-failed', None

        # TODO: remove, this is here as a test
        # TODO: try to find a post where comments are disabled
        if get_count(data['seen_by_count']):
            print_err('Found seen_by_count: %i for %s' %
                      (get_count(data['seen_by_count']), url))

        if 'political_figure_data' in data and data['political_figure_data']:
            print_err('Found political_figure_data:')
            print_err(data['political_figure_data'])

        if get_count(data['reaction_count']) != get_count(data['reactors']):
            print_err('Found different reactions/reactors for %s' % url)

        # Extracting data from hidden html
        hidden_html_extractor = re.compile(HTML_EXTRACTOR_TEMPLATE % bpost_id)
        match = hidden_html_extractor.search(html)

        if match is not None:
            hidden_html = match.group(1).decode()
            soup = BeautifulSoup(hidden_html, 'lxml')

            # Sometimes fetching a post behaves weirdly
            if soup.select_one('h5 a') is None:
                return 'extraction-failed', None

            data['scraped'] = {}

            timestamp_elem = soup.select_one('[data-utime]')
            timestamp = int(timestamp_elem.get('data-utime'))

            data['scraped']['account_name'] = soup.select_one(
                'h5 a').get_text().strip()
            data['scraped']['timestamp'] = timestamp
            data['scraped']['time'] = datetime.fromtimestamp(
                timestamp).isoformat()

            # TODO: use a context manager
            try:
                data['scraped']['aria_label'] = timestamp_elem.parent.get(
                    'aria-label')
            except AttributeError:
                pass

            try:
                data['scraped']['text'] = soup.select_one(
                    '[data-testid="post_message"]').get_text()
            except AttributeError:
                pass

            # try:
            #     data['scraped']['link'] = soup.select_one('[data-lynx-uri]').get('href')
            # except:
            #     pass

        return None, data
Example #10
    def test_bad_protocol(self):
        http = create_pool()
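        # A malformed scheme should surface as an InvalidURLError rather than raise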
        err, _ = request(http, 'ttps://lemonde.fr')

        assert type(err) is InvalidURLError
Example #11
    def request(self, url):
        return request(self.http, url, spoof_ua=True)