def test_basics(self):
    for url, result in DEFAULT_TESTS:
        assert is_url(url) == result

    for url, result in NO_PROTOCOL_TESTS:
        assert is_url(url, require_protocol=False) == result

    for url, result in TLD_AWARE_TESTS:
        assert is_url(url, require_protocol=False, tld_aware=True) == result

    for url, result in RELAXED_TESTS:
        assert is_url(url, require_protocol=False, allow_spaces_in_path=True) == result

    for url, result in ONLY_HTTP_HTTPS_TESTS:
        assert is_url(url, only_http_https=True) == result
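# Illustrative sketch (not from the original test suite): the *_TESTS constants
# above are lists of (url, expected_boolean) pairs. The entries below are
# hypothetical examples of how such tables could be populated; exact verdicts
# depend on the installed version of ural.
DEFAULT_TESTS_EXAMPLE = [
    ('http://www.lemonde.fr', True),   # well-formed url with a protocol
    ('lemonde.fr', False)              # no protocol, rejected by default
]

NO_PROTOCOL_TESTS_EXAMPLE = [
    ('lemonde.fr', True)               # accepted once require_protocol=False
]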
def raw_request(http, url, method='GET', headers=None, preload_content=True,
                release_conn=True, timeout=None, body=None):
    """
    Generic request helper using a urllib3 pool to access some resource.
    """

    # Validating URL
    if not ural.is_url(url, require_protocol=True, tld_aware=True, allow_spaces_in_path=True):
        return InvalidURLError(url=url), None

    # Performing request
    request_kwargs = {
        'headers': headers,
        'body': body,
        'preload_content': preload_content,
        'release_conn': release_conn,
        'redirect': False,
        'retries': False
    }

    if timeout is not None:
        request_kwargs['timeout'] = timeout

    try:
        response = http.request(
            method,
            url,
            **request_kwargs
        )
    except Exception as e:
        return e, None

    return None, response
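# Minimal usage sketch (assumption, not part of the original module): raw_request
# returns an (error, response) pair instead of raising, so the caller branches on
# the first element. Any urllib3 pool-like object exposing .request() should work.
import urllib3

http = urllib3.PoolManager()
err, response = raw_request(http, 'https://www.lemonde.fr')

if err is not None:
    print('request failed:', err)
else:
    print(response.status, len(response.data))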
def crowdtangle_summary_action(namespace, output_file):
    if not namespace.start_date:
        die('Missing --start-date!')

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'url')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select.split(',') if namespace.select else None,
        add=CROWDTANGLE_SUMMARY_CSV_HEADERS
    )

    posts_writer = None

    if namespace.posts is not None:
        posts_writer = csv.writer(namespace.posts)
        posts_writer.writerow(CROWDTANGLE_POST_CSV_HEADERS_WITH_LINK)

    loading_bar = tqdm(
        desc='Collecting data',
        dynamic_ncols=True,
        total=namespace.total,
        unit=' urls'
    )

    client = CrowdTangleAPIClient(namespace.token, rate_limit=namespace.rate_limit)

    for row, url in enricher.cells(namespace.column, with_rows=True):
        url = url.strip()

        try:
            stats = client.summary(
                url,
                start_date=namespace.start_date,
                with_top_posts=namespace.posts is not None,
                sort_by=namespace.sort_by,
                format='csv_row',
                platforms=namespace.platforms
            )
        except CrowdTangleInvalidTokenError:
            die([
                'Your API token is invalid.',
                'Check that you indicated a valid one using the `--token` argument.'
            ])
        except Exception as err:
            raise err

        if namespace.posts is not None:
            stats, posts = stats

            if posts is not None:
                for post in posts:
                    posts_writer.writerow([url] + post)

        enricher.writerow(row, stats)

        loading_bar.update()
def create(self, request):
    # Check if the url field is blank
    if request.data.get('url') is None:
        return Response({"message": "URL field cannot be blank"},
                        status=status.HTTP_400_BAD_REQUEST)

    # Process a malformed url and reject values that do not form a valid url
    schemed_url = ensure_protocol(request.data.get('url'), protocol='https')

    if not is_url(schemed_url):
        return Response({"message": "Enter a valid url"},
                        status=status.HTTP_400_BAD_REQUEST)

    # Check that the name input contains only letters, numbers, underscores and hyphens
    pattern = "^[A-Za-z0-9_-]*$"
    name = request.data.get('name')

    if name is not None and not re.match(pattern, name):
        return Response({"message": "Name can contain only letters, numbers, underscores and hyphens"},
                        status=status.HTTP_400_BAD_REQUEST)

    # Map request data to the serializer class to get an object
    serializer = serializers.MemeSerializer(data={
        "name": request.data.get('name'),
        "url": schemed_url,
        "caption": request.data.get("caption")
    })

    # Check the validity of the serializer object, i.e. whether all required fields are present
    if serializer.is_valid():
        # Extract the various parameters sent in the request data
        creator = serializer.data.get('name')
        caption = serializer.data.get('caption')
        url = serializer.data.get('url')

        # Set creationDateTime, creationDate and lastUpdate to the current date and time
        creationDateTime = timezone.now()
        creationDate = date.today()
        updatedDateTime = timezone.now()

        # Create a meme object with the extracted data
        obj = Meme(caption=caption, url=schemed_url, name=creator,
                   creationDateTime=creationDateTime, creationDate=creationDate,
                   lastUpdate=updatedDateTime)

        # Check if the meme object already exists
        query_obj2 = Meme.objects.filter(url=schemed_url).filter(name=creator).filter(caption=caption)

        if len(query_obj2) >= 1:
            return Response({'message': 'This meme already exists'},
                            status=status.HTTP_409_CONFLICT)

        # If the meme object does not exist, create a new meme by saving it to the database
        obj.save()

        # Get the id of the meme object created
        postCounter = obj.id

        # Return the id of the created meme object with a 201 status code
        return Response({'id': str(postCounter)}, status=status.HTTP_201_CREATED)
    else:
        # If any required data was missing or the serializer object could not be created,
        # return the exact serialization error that occurred with a bad request status code
        return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
def facebook_comments_action(namespace):
    # Handling output
    output_file = open_output_file(namespace.output)

    # Handling input
    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'post_url')

    try:
        scraper = FacebookCommentScraper(namespace.cookie)
    except FacebookInvalidCookieError:
        if namespace.cookie in ['firefox', 'chrome']:
            die('Could not extract cookies from %s.' % namespace.cookie)

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to access Facebook post comments.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=FACEBOOK_COMMENT_CSV_HEADERS
    )

    # Loading bar
    loading_bar = tqdm(
        desc='Scraping comments',
        dynamic_ncols=True,
        unit=' comments'
    )

    for i, (row, url) in enumerate(enricher.cells(namespace.column, with_rows=True)):
        if not is_facebook_post_url(url):
            loading_bar.close()
            die('Given url (line %i) is not a Facebook post url: %s' % (i + 1, url))

        batches = scraper(url, per_call=True, detailed=True, format='csv_row')

        for details, batch in batches:
            for comment in batch:
                enricher.writerow(row, comment)

            loading_bar.update(len(batch))
            loading_bar.set_postfix(
                calls=details['calls'],
                replies=details['replies'],
                q=details['queue_size'],
                posts=i + 1
            )

    loading_bar.close()
def url_parse_action(namespace):
    output_file = open_output_file(namespace.output)

    headers = REPORT_HEADERS

    if namespace.facebook:
        headers = FACEBOOK_REPORT_HEADERS
    elif namespace.youtube:
        headers = YOUTUBE_REPORT_HEADERS

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=headers,
        keep=namespace.select
    )

    loading_bar = tqdm(
        desc='Parsing',
        dynamic_ncols=True,
        unit=' rows',
        total=namespace.total
    )

    for row, url in enricher.cells(namespace.column, with_rows=True):
        url = url.strip()

        loading_bar.update()

        if namespace.separator:
            urls = url.split(namespace.separator)
        else:
            urls = [url]

        for url in urls:
            if not is_url(url, allow_spaces_in_path=True, require_protocol=False):
                enricher.writerow(row)
                continue

            if namespace.facebook:
                addendum = extract_facebook_addendum(url)
            elif namespace.youtube:
                addendum = extract_youtube_addendum(url)
            else:
                addendum = extract_standard_addendum(namespace, url)

            if addendum is None:
                enricher.writerow(row)
                continue

            enricher.writerow(row, addendum)

    output_file.close()
def facebook_url_likes_action(namespace):
    output_file = open_output_file(namespace.output)

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'url')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=REPORT_HEADERS
    )

    if namespace.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.' % namespace.column
        ])

    loading_bar = tqdm(
        desc='Retrieving likes',
        dynamic_ncols=True,
        unit=' urls',
        total=namespace.total
    )

    http = create_pool()

    for row, url in enricher.cells(namespace.column, with_rows=True):
        loading_bar.update()

        url = url.strip()
        err, html = make_request(http, url)

        if err is not None:
            loading_bar.close()
            die('An error occurred while fetching like button for this url: %s' % url)

        scraped = scrape(html)

        if scraped is None:
            loading_bar.close()
            die('Could not extract Facebook likes from this url\'s like button: %s' % url)

        enricher.writerow(row, scraped)
def url_parse_action(cli_args):
    headers = REPORT_HEADERS

    if cli_args.facebook:
        headers = FACEBOOK_REPORT_HEADERS
    elif cli_args.youtube:
        headers = YOUTUBE_REPORT_HEADERS

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=headers,
        keep=cli_args.select
    )

    loading_bar = LoadingBar(desc='Parsing', unit='row', total=cli_args.total)

    for row, cell in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        if cli_args.separator:
            urls = cell.split(cli_args.separator)
        else:
            urls = [cell]

        for url in urls:
            url = url.strip()

            if not is_url(url, allow_spaces_in_path=True, require_protocol=False):
                enricher.writerow(row)
                continue

            if cli_args.facebook:
                addendum = extract_facebook_addendum(url)
            elif cli_args.youtube:
                addendum = extract_youtube_addendum(url)
            else:
                addendum = extract_standard_addendum(cli_args, url)

            if addendum is None:
                enricher.writerow(row)
                continue

            enricher.writerow(row, addendum)
def facebook_url_likes_action(cli_args):
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=REPORT_HEADERS,
        total=cli_args.total,
        prebuffer_bytes=DEFAULT_PREBUFFER_BYTES
    )

    if cli_args.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.' % cli_args.column
        ])

    loading_bar = LoadingBar(
        desc='Retrieving likes',
        unit='url',
        total=enricher.total
    )

    for row, url in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        url = url.strip()

        if not url or not is_url(url, require_protocol=False):
            enricher.writerow(row)
            continue

        err, html = make_request(url)

        if err is not None:
            loading_bar.die('An error occurred while fetching like button for this url: %s' % url)

        scraped = scrape(html)

        if scraped is None:
            loading_bar.die('Could not extract Facebook likes from this url\'s like button: %s' % url)

        enricher.writerow(row, scraped)
def url_parse_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=REPORT_HEADERS,
        keep=namespace.select
    )

    loading_bar = tqdm(
        desc='Parsing',
        dynamic_ncols=True,
        unit=' rows',
        total=namespace.total
    )

    for row, url in enricher.cells(namespace.column, with_rows=True):
        url = url.strip()

        loading_bar.update()

        if namespace.separator:
            urls = url.split(namespace.separator)
        else:
            urls = [url]

        for url in urls:
            if not is_url(url, allow_spaces_in_path=True):
                enricher.writerow(row)
                continue

            enricher.writerow(row, [
                normalize_url(
                    url,
                    strip_protocol=namespace.strip_protocol,
                    strip_trailing_slash=True
                ),
                get_domain_name(url),
                get_hostname(url),
                get_normalized_hostname(url)
            ])

    output_file.close()
def partial_update(self, request, pk=None):
    # If the id input is anything other than a positive integer, return a bad request status
    try:
        val = int(pk)
    except ValueError:
        return Response({"message": "Enter positive number"},
                        status=status.HTTP_400_BAD_REQUEST)

    # If the id input is a negative integer, return a bad request status
    if int(pk) < 0:
        return Response({"message": "Enter positive number"},
                        status=status.HTTP_400_BAD_REQUEST)

    # Get the meme object by its id
    queryset = Meme.objects.filter(id=pk)

    # Check if the meme object exists; if it does not, return http status not found
    if len(queryset) == 0:
        return Response(status=status.HTTP_404_NOT_FOUND)

    if request.data.get('name') is not None:
        return Response({"message": "Creator name cannot be changed!!"},
                        status=status.HTTP_400_BAD_REQUEST)

    if request.data.get('url') is None and request.data.get('caption') is None:
        return Response({"message": "Both url and caption cannot be none"},
                        status=status.HTTP_400_BAD_REQUEST)

    url = request.data.get('url')
    caption = request.data.get('caption')

    # If only a caption is supplied, only the caption is updated
    if url is None and caption is not None:
        obj = queryset[0]
        obj.caption = caption
        obj.lastUpdate = timezone.now()
        obj.save()

        # Return a no content response if successfully updated
        return Response(status=status.HTTP_204_NO_CONTENT)

    # If the url entered has no scheme, add one and check whether a valid url is formed
    schemed_url = ensure_protocol(request.data.get('url'), protocol='https')

    if not is_url(schemed_url):
        return Response({"message": "Enter a valid url"},
                        status=status.HTTP_400_BAD_REQUEST)

    # If a meme with that id exists, map the request data to the serializer to extract
    # attributes based on the data supplied, i.e. either url or caption or both
    if url is not None and caption is not None:
        serializer = serializers.MemeUpdateSerializer(
            data={"url": schemed_url, "caption": request.data.get("caption")},
            partial=True)
    else:
        serializer = serializers.MemeUpdateSerializer(
            data={"url": schemed_url},
            partial=True)

    # Check if the serializer object is valid, i.e. all required fields are present
    # and no extra fields are present
    if serializer.is_valid():
        obj = queryset[0]

        # Extract the current caption and url of the meme object
        oldCaption = obj.caption
        oldUrl = obj.url

        # The caption and url sent in the request become the new caption and new url
        newCaption = serializer.data.get('caption')
        newUrl = serializer.data.get('url')

        # If the new caption differs from the existing one, update the caption field
        if newCaption is not None and newCaption != oldCaption:
            obj.caption = newCaption

        # If the new url differs from the existing one, update the url field
        if newUrl is not None and newUrl != oldUrl:
            obj.url = newUrl

        # If any of the fields were updated, set the lastUpdate field to the current date and time
        if newUrl != oldUrl or newCaption != oldCaption:
            obj.lastUpdate = timezone.now()

        # Save the meme object
        obj.save()

        # Return a no content response if successfully updated
        return Response(status=status.HTTP_204_NO_CONTENT)
    else:
        return Response(serializer.errors, status=status.HTTP_404_NOT_FOUND)
def facebook_comments_action(namespace):
    # Handling output
    output_file = open_output_file(namespace.output)

    # Handling input
    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'post_url')

    try:
        scraper = FacebookMobileScraper(namespace.cookie, throttle=namespace.throttle)
    except FacebookInvalidCookieError:
        if namespace.cookie in COOKIE_BROWSERS:
            die([
                'Could not extract relevant cookie from "%s".' % namespace.cookie
            ])

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to access Facebook post comments.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=FACEBOOK_COMMENT_CSV_HEADERS
    )

    # Loading bar
    loading_bar = tqdm(
        desc='Scraping comments',
        dynamic_ncols=True,
        unit=' comments'
    )

    for i, (row, url) in enumerate(enricher.cells(namespace.column, with_rows=True)):
        if not has_facebook_comments(url):
            tqdm.write(
                'Given url (line %i) probably cannot have Facebook comments: %s' % (i + 1, url),
                file=sys.stderr
            )
            continue

        batches = scraper.comments(url, per_call=True, detailed=True)

        for details, batch in batches:
            for comment in batch:
                enricher.writerow(row, comment.as_csv_row())

            loading_bar.update(len(batch))
            loading_bar.set_postfix(
                calls=details['calls'],
                replies=details['replies'],
                q=details['queue_size'],
                posts=i + 1
            )

    loading_bar.close()
def test_basics(self):
    for url, result in DEFAULT_TESTS:
        assert is_url(url) == result

    for url, result in NO_PROTOCOL_TESTS:
        assert is_url(url, require_protocol=False) == result
def fetch_action(namespace):
    # Are we resuming
    resuming = namespace.resume

    if resuming and not namespace.output:
        die(['Cannot --resume without specifying -o/--output.'])

    # Do we need to fetch only a single url?
    if namespace.file is sys.stdin and is_url(namespace.column):
        namespace.file = StringIO('url\n%s' % namespace.column)
        namespace.column = 'url'

        # If we are hitting a single url we enable contents_in_report
        if namespace.contents_in_report is None:
            namespace.contents_in_report = True

    input_headers, pos, reader = custom_reader(namespace.file, namespace.column)

    filename_pos = input_headers.index(namespace.filename) if namespace.filename else None
    indexed_input_headers = {h: p for p, h in enumerate(input_headers)}

    selected_fields = namespace.select.split(',') if namespace.select else None
    selected_pos = [input_headers.index(h) for h in selected_fields] if selected_fields else None

    # HTTP method
    http_method = namespace.method

    # Cookie grabber
    get_cookie = None
    if namespace.grab_cookies:
        get_cookie = grab_cookies(namespace.grab_cookies)

    # Global headers
    global_headers = None
    if namespace.headers:
        global_headers = {}

        for header in namespace.headers:
            k, v = parse_http_header(header)
            global_headers[k] = v

    # Reading output
    output_headers = (list(input_headers) if not selected_pos
                      else [input_headers[i] for i in selected_pos])
    output_headers += OUTPUT_ADDITIONAL_HEADERS

    if namespace.contents_in_report:
        output_headers.append('raw_content')

    flag = 'w'
    if namespace.output is not None and resuming and isfile(namespace.output):
        flag = 'r+'

    output_file = open_output_file(namespace.output, flag=flag)

    output_writer = csv.writer(output_file)

    if not resuming:
        output_writer.writerow(output_headers)
    else:
        # Reading report to know what needs to be done
        _, rpos, resuming_reader = custom_reader(output_file, 'line')

        resuming_reader_loading = tqdm(
            resuming_reader,
            desc='Resuming',
            dynamic_ncols=True,
            unit=' lines'
        )

        already_done = ContiguousRangeSet()

        for line in resuming_reader_loading:
            index = line[rpos]
            already_done.add(int(index))

    # Loading bar
    total = namespace.total

    if total is not None and resuming:
        total -= len(already_done)

    loading_bar = tqdm(
        desc='Fetching pages',
        total=total,
        dynamic_ncols=True,
        unit=' urls'
    )

    def url_key(item):
        line = item[1]
        url = line[pos].strip()

        if not url:
            return

        # Url templating
        if namespace.url_template:
            return namespace.url_template.format(value=url)

        return url

    def request_args(url, item):
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None

        if global_headers:
            headers = global_headers

        return {'method': http_method, 'cookie': cookie, 'headers': headers}

    def write_output(index, line, resolved=None, status=None, error=None,
                     filename=None, encoding=None, data=None):
        if selected_pos:
            line = [line[p] for p in selected_pos]

        line.extend([
            index,
            resolved or '',
            status or '',
            error or '',
            filename or '',
            encoding or ''
        ])

        if namespace.contents_in_report:
            line.append(data or '')

        output_writer.writerow(line)

    errors = 0
    status_codes = Counter()

    target_iterator = enumerate(reader)

    if resuming:
        target_iterator = (
            pair for pair in target_iterator
            if not already_done.stateful_contains(pair[0])
        )

    multithreaded_iterator = multithreaded_fetch(
        target_iterator,
        key=url_key,
        request_args=request_args,
        threads=namespace.threads,
        throttle=namespace.throttle
    )

    for result in multithreaded_iterator:
        line_index, line = result.item

        if not result.url:
            write_output(line_index, line)
            loading_bar.update()
            continue

        response = result.response
        data = response.data if response is not None else None

        content_write_flag = 'wb'

        # Updating stats
        if result.error is not None:
            errors += 1
        else:
            if response.status >= 400:
                status_codes[response.status] += 1

        postfix = {'errors': errors}

        for code, count in status_codes.most_common(1):
            postfix[str(code)] = count

        loading_bar.set_postfix(**postfix)
        loading_bar.update()

        # No error
        if result.error is None:
            filename = None

            # Building filename
            if data:
                if filename_pos is not None or namespace.filename_template:
                    if namespace.filename_template:
                        filename = CUSTOM_FORMATTER.format(
                            namespace.filename_template,
                            value=line[filename_pos] if filename_pos is not None else None,
                            ext=result.meta['ext'],
                            line=LazyLineDict(indexed_input_headers, line)
                        )
                    else:
                        filename = line[filename_pos] + result.meta['ext']
                else:
                    # NOTE: it would be nice to have an id that can be sorted by time
                    filename = str(uuid4()) + result.meta['ext']

            # Standardize encoding?
            encoding = result.meta['encoding']

            if data and namespace.standardize_encoding or namespace.contents_in_report:
                if encoding is None or encoding != 'utf-8' or namespace.contents_in_report:
                    data = data.decode(
                        encoding if encoding is not None else 'utf-8',
                        errors='replace'
                    )
                    encoding = 'utf-8'
                    content_write_flag = 'w'

            # Writing file on disk
            if data and not namespace.contents_in_report:
                if namespace.compress:
                    filename += '.gz'

                resource_path = join(namespace.output_dir, filename)
                resource_dir = dirname(resource_path)
                os.makedirs(resource_dir, exist_ok=True)

                with open(resource_path, content_write_flag) as f:
                    # TODO: what if standardize_encoding + compress?
                    f.write(gzip.compress(data) if namespace.compress else data)

            # Reporting in output
            resolved_url = response.geturl()

            write_output(
                line_index,
                line,
                resolved=resolved_url if resolved_url != result.url else None,
                status=response.status,
                filename=filename,
                encoding=encoding,
                data=data
            )

        # Handling potential errors
        else:
            error_code = report_error(result.error)
            write_output(line_index, line, error=error_code)

    # Closing files
    if namespace.output is not None:
        output_file.close()
def export_google_sheets_as_csv(url, cookie=None, authuser=None, max_authuser_attempts=4):
    if is_url(url):
        parsed = parse_google_drive_url(url)

        if parsed is None or parsed.type != 'spreadsheets':
            raise GoogleSheetsInvalidTargetError
    else:
        parsed = GoogleDriveFile('spreadsheets', url)

    base_export_url = parsed.get_export_url()
    export_url = base_export_url

    if authuser is not None:
        if not isinstance(authuser, int) or authuser < 0:
            raise TypeError('authuser should be an int >= 0')

        export_url = append_authuser(export_url, authuser)
        max_authuser_attempts = 1
    else:
        authuser = 0

    if cookie is not None and cookie in COOKIE_BROWSERS:
        jar = getattr(browser_cookie3, cookie)()
        resolver = CookieResolver(jar)
        cookie = resolver(export_url)

        if cookie is None:
            raise GoogleSheetsMissingCookieError

    attempts = max_authuser_attempts

    while True:
        attempts -= 1

        err, response = request(export_url, cookie=cookie)

        if err:
            raise err

        if response.status == 404:
            raise GoogleSheetsNotFoundError

        if response.status == 401:
            raise GoogleSheetsUnauthorizedError

        if response.status == 403:
            authuser += 1

            if attempts != 0:
                export_url = append_authuser(base_export_url, authuser)
                continue

            raise GoogleSheetsMaxAttemptsExceeded

        if 'csv' not in response.headers.get('Content-Type', '').lower():
            raise GoogleSheetsInvalidContentTypeError

        break

    return response.data.decode('utf-8')
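# Hypothetical usage sketch (not from the original source): the function returns
# the sheet decoded as CSV text, so it can be fed straight to the csv module.
# The spreadsheet url below is a placeholder.
import csv
from io import StringIO

data = export_google_sheets_as_csv('https://docs.google.com/spreadsheets/d/<spreadsheet-id>/edit')
rows = list(csv.DictReader(StringIO(data)))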
def fetch_action(namespace):
    # Are we resuming
    resuming = namespace.resume

    if resuming and not namespace.output:
        die(['Cannot --resume without specifying -o/--output.'])

    # Do we need to fetch only a single url?
    single_url = namespace.file is sys.stdin and is_url(namespace.column)

    if single_url:
        edit_namespace_with_csv_io(namespace, 'url')

        # If we are hitting a single url we enable contents_in_report
        if namespace.contents_in_report is None:
            namespace.contents_in_report = True

    # HTTP method
    http_method = namespace.method

    # Cookie grabber
    get_cookie = None
    if namespace.grab_cookies:
        get_cookie = grab_cookies(namespace.grab_cookies)

    # Global headers
    global_headers = None
    if namespace.headers:
        global_headers = {}

        for header in namespace.headers:
            k, v = parse_http_header(header)
            global_headers[k] = v

    flag = 'w'
    if namespace.output is not None and resuming and isfile(namespace.output):
        flag = 'r+'

    output_file = open_output_file(namespace.output, flag=flag)

    # Resume listener
    listener = None
    resuming_reader_loading = None
    skipped = 0

    if resuming:
        resuming_reader_loading = tqdm(desc='Resuming', dynamic_ncols=True, unit=' lines')

        def listener(event, row):
            nonlocal skipped

            if event == 'resume.output':
                resuming_reader_loading.update()

            if event == 'resume.input':
                skipped += 1
                loading_bar.set_postfix(skipped=skipped)
                loading_bar.update()

    # Enricher
    enricher = casanova.threadsafe_enricher(
        namespace.file,
        output_file,
        resumable=resuming,
        auto_resume=False,
        add=OUTPUT_ADDITIONAL_HEADERS + (['raw_contents'] if namespace.contents_in_report else []),
        keep=namespace.select,
        listener=listener
    )

    if namespace.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.' % namespace.column
        ])

    url_pos = enricher.pos[namespace.column]

    filename_pos = None

    if namespace.filename is not None:
        if namespace.filename not in enricher.pos:
            die([
                'Could not find the "%s" column containing the filenames in the given CSV file.' % namespace.filename
            ])

        filename_pos = enricher.pos[namespace.filename]

    indexed_input_headers = {h: i for i, h in enumerate(enricher.fieldnames)}

    if resuming:
        enricher.resume()
        resuming_reader_loading.close()

    # Loading bar
    total = namespace.total

    loading_bar = tqdm(
        desc='Fetching pages',
        total=total,
        dynamic_ncols=True,
        unit=' urls'
    )

    def url_key(item):
        url = item[1][url_pos].strip()

        if not url:
            return

        # Url templating
        if namespace.url_template:
            return namespace.url_template.format(value=url)

        return url

    def request_args(url, item):
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None

        if global_headers:
            headers = global_headers

        return {'method': http_method, 'cookie': cookie, 'headers': headers}

    def write_output(index, row, resolved=None, status=None, error=None,
                     filename=None, encoding=None, data=None):
        addendum = [
            resolved or '',
            status or '',
            error or '',
            filename or '',
            encoding or ''
        ]

        if namespace.contents_in_report:
            addendum.append(data or '')

        enricher.writerow(index, row, addendum)

    errors = 0
    status_codes = Counter()

    fetch_kwargs = {
        'threads': namespace.threads,
        'throttle': namespace.throttle,
        'domain_parallelism': namespace.domain_parallelism
    }

    if namespace.timeout is not None:
        fetch_kwargs['timeout'] = namespace.timeout

    multithreaded_iterator = multithreaded_fetch(
        enricher,
        key=url_key,
        request_args=request_args,
        **fetch_kwargs
    )

    for result in multithreaded_iterator:
        index, row = result.item

        if not result.url:
            write_output(index, row)
            loading_bar.update()
            continue

        response = result.response
        data = response.data if response is not None else None

        content_write_flag = 'wb'

        # Updating stats
        if result.error is not None:
            errors += 1
        else:
            if response.status >= 400:
                status_codes[response.status] += 1

        postfix = {'errors': errors}

        for code, count in status_codes.most_common(1):
            postfix[str(code)] = count

        loading_bar.set_postfix(**postfix)
        loading_bar.update()

        # No error
        if result.error is None:
            filename = None

            # Building filename
            if data:
                if filename_pos is not None or namespace.filename_template:
                    if namespace.filename_template:
                        filename = CUSTOM_FORMATTER.format(
                            namespace.filename_template,
                            value=row[filename_pos] if filename_pos is not None else None,
                            ext=result.meta['ext'],
                            line=LazyLineDict(indexed_input_headers, row)
                        )
                    else:
                        filename = row[filename_pos] + result.meta['ext']
                else:
                    # NOTE: it would be nice to have an id that can be sorted by time
                    filename = str(uuid4()) + result.meta['ext']

            # Standardize encoding?
            encoding = result.meta['encoding']

            if data and namespace.standardize_encoding or namespace.contents_in_report:
                if encoding is None or encoding != 'utf-8' or namespace.contents_in_report:
                    data = data.decode(
                        encoding if encoding is not None else 'utf-8',
                        errors='replace'
                    )
                    encoding = 'utf-8'
                    content_write_flag = 'w'

            # Writing file on disk
            if data and not namespace.contents_in_report:
                if namespace.compress:
                    filename += '.gz'

                resource_path = join(namespace.output_dir, filename)
                resource_dir = dirname(resource_path)
                os.makedirs(resource_dir, exist_ok=True)

                with open(resource_path, content_write_flag) as f:
                    # TODO: what if standardize_encoding + compress?
                    f.write(gzip.compress(data) if namespace.compress else data)

            # Reporting in output
            resolved_url = response.geturl()

            write_output(
                index,
                row,
                resolved=resolved_url if resolved_url != result.url else None,
                status=response.status,
                filename=filename,
                encoding=encoding,
                data=data
            )

        # Handling potential errors
        else:
            error_code = report_error(result.error)
            write_output(index, row, error=error_code)

    # Closing files
    output_file.close()