def on_body_downloaded(body):
    # Decompress a raw downloaded body per its Content-Encoding, wrap it in a
    # requests.Response, and hand it to the scrapfly client for processing.
    #
    # NOTE(review): this is a closure — `headers`, `status_code`, `reason`,
    # `request`, `spider`, and `self` all come from the enclosing scope, not
    # from this function's parameters. Confirm their types against the caller.
    if 'content-encoding' in headers:
        if headers['content-encoding'] == 'gzip':
            # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
            body = zlib.decompress(body, 16+zlib.MAX_WBITS)
        elif headers['content-encoding'] == 'br':
            # Prefer the CFFI brotli binding, fall back to the C extension;
            # neither installed is a hard error for 'br'-encoded bodies.
            try:
                try:
                    import brotlicffi as brotli
                except ImportError:
                    import brotli
            except ImportError:
                print('You must run pip install scrapfly-sdk[speedups] - brotli is missing - or disable brotli compression')
                raise
            body = brotli.decompress(body)
    # Rebuild a synthetic requests.Response from the already-downloaded parts.
    # `_content` is a private requests attribute — set deliberately so the
    # scrapfly client can treat this as a normal completed response.
    response = requests.Response()
    response.status_code = status_code
    response.reason = reason
    response._content = body
    response.headers.update(headers)
    response.url = request.url
    # Disable upstream-error raising so _handle_response returns instead of
    # throwing on non-2xx upstream statuses.
    request.scrape_config.raise_on_upstream_error = False
    scrapfly_api_response:ScrapeApiResponse = spider.scrapfly_client._handle_response(
        response=response,
        scrape_config=request.scrape_config
    )
    # Track bandwidth in scrapy stats for monitoring/billing purposes.
    self._crawler.stats.inc_value('scrapfly/bandwidth_consumed', count=scrapfly_api_response.context['bandwidth_consumed'])
    return ScrapflyScrapyResponse(request=request, scrape_api_response=scrapfly_api_response)
def create_new_course(title: str, count: int, headers: dict) -> str:
    """
    Create a new iKnow custom course named "<title> <count>".

    :param title: Base course title (joined with the count for uniqueness).
    :param count: Numeric suffix appended to the title.
    :param headers: HTTP headers (session cookies etc.) for the POST.
    :return: The new course id as a string, or '' on any failure
             (network error, non-200 status, or undecodable response body).
    """
    course_title = title + ' ' + str(count)
    url = 'https://iknow.jp/custom/courses'
    course = urllib.parse.quote_plus(course_title)
    # iKnow's form-encoded creation payload; language pair is hard-coded ja->en.
    payload = 'utf8=%E2%9C%93&goal%5Bname%5D={name}&language={lang}&translation_language={l}&goal%5Bicon_image_url%5D=&commit=Create'.format(name=course, lang='ja', l='en')
    try:
        res = requests.post(url, data=payload, headers=headers)
    except requests.RequestException:
        # FIX: was a bare `except:` which also swallowed KeyboardInterrupt
        # and SystemExit; only network/HTTP-level failures belong here.
        print('Failed to post new course ' + course_title)
        return ''
    res.encoding = 'utf-8'
    if res.status_code != requests.codes.ok:
        # Mark as a word we couldn't add - will process later
        print('Unable to make a new course!!')
        print('Provided title: ' + course_title)
        return ''
    try:
        # NOTE(review): the body is decompressed manually with brotli here —
        # presumably the response arrives 'br'-encoded; verify requests isn't
        # already decoding it on this setup.
        res_decoded = brotlicffi.decompress(res.content)
    except brotlicffi.Error:
        print('Could not decompress our response from creating a course!')
        return ''
    # The response content is some jquery, which contains the course id
    response_content = res_decoded.decode('utf-8')
    match = re.search(r'/custom/courses/(\d*)', response_content)
    if not match:
        return ''
    return str(match[1])
def test_streaming_compression_flush(one_compressed_file, chunk_size, mode, quality, lgwin, lgblock):
    """
    The incremental compressor, flushed after every chunk, must still
    round-trip the file contents exactly.
    """
    compressor = brotlicffi.Compressor(
        mode=mode, quality=quality, lgwin=lgwin, lgblock=lgblock
    )
    pieces = []
    with open(one_compressed_file, 'rb') as source:
        for chunk in iter(lambda: source.read(chunk_size), b''):
            pieces.append(compressor.compress(chunk))
            pieces.append(compressor.flush())
    pieces.append(compressor.finish())
    round_tripped = brotlicffi.decompress(b''.join(pieces))
    with open(one_compressed_file, 'rb') as source:
        assert round_tripped == source.read()
def test_roundtrip_compression_with_files(simple_compressed_file):
    """
    Compressing and then decompressing file data yields the original bytes.
    """
    source_path = simple_compressed_file[0]
    with open(source_path, 'rb') as handle:
        original = handle.read()
    round_tripped = brotlicffi.decompress(brotlicffi.compress(original))
    assert round_tripped == original
def test_decompression(simple_compressed_file):
    """
    decompress() recovers the original file from its pre-compressed twin.
    """
    plain_path = simple_compressed_file[0]
    packed_path = simple_compressed_file[1]
    with open(plain_path, 'rb') as handle:
        expected = handle.read()
    with open(packed_path, 'rb') as handle:
        packed = handle.read()
    assert brotlicffi.decompress(packed) == expected
def __init__(self, file, checkChecksums=0, fontNumber=-1):
    # Parse a WOFF2 font file: validate the signature, read the table
    # directory, Brotli-decompress the single compressed data block, and
    # prepare a transform buffer plus an empty TTFont for reconstruction.
    #
    # :param file: Binary file-like object positioned at the start of the font.
    # :param checkChecksums: Kept for interface compatibility — not used here.
    # :param fontNumber: Kept for interface compatibility — not used here.
    # :raises ImportError: if the Brotli extension is not installed.
    # :raises TTLibError: on a bad signature, truncated header, size mismatch
    #     of the decompressed data, or a 'length' field that disagrees with
    #     the actual file size.
    if not haveBrotli:
        log.error(
            'The WOFF2 decoder requires the Brotli Python extension, available at: '
            'https://github.com/google/brotli')
        raise ImportError("No module named brotli")
    self.file = file
    # WOFF2 magic number check ("wOF2").
    signature = Tag(self.file.read(4))
    if signature != b"wOF2":
        raise TTLibError("Not a WOFF2 font (bad signature)")
    # Rewind so sstruct can re-read the full directory header from offset 0.
    self.file.seek(0)
    self.DirectoryEntry = WOFF2DirectoryEntry
    data = self.file.read(woff2DirectorySize)
    if len(data) != woff2DirectorySize:
        raise TTLibError('Not a WOFF2 font (not enough data)')
    # Populates self.numTables, self.totalCompressedSize, self.length, etc.
    sstruct.unpack(woff2DirectoryFormat, data, self)
    self.tables = OrderedDict()
    # Each entry's offset is its running position within the *decompressed*
    # data block (WOFF2 stores all tables in one compressed stream).
    offset = 0
    for i in range(self.numTables):
        entry = self.DirectoryEntry()
        entry.fromFile(self.file)
        tag = Tag(entry.tag)
        self.tables[tag] = entry
        entry.offset = offset
        offset += entry.length
    totalUncompressedSize = offset
    compressedData = self.file.read(self.totalCompressedSize)
    decompressedData = brotli.decompress(compressedData)
    # The decompressed stream must exactly match the sum of entry lengths.
    if len(decompressedData) != totalUncompressedSize:
        raise TTLibError(
            'unexpected size for decompressed font data: expected %d, found %d'
            % (totalUncompressedSize, len(decompressedData)))
    self.transformBuffer = BytesIO(decompressedData)
    # Seek to EOF to verify the header's declared length against reality.
    self.file.seek(0, 2)
    if self.length != self.file.tell():
        raise TTLibError("reported 'length' doesn't match the actual file size")
    self.flavorData = WOFF2FlavorData(self)
    # make empty TTFont to store data while reconstructing tables
    self.ttFont = TTFont(recalcBBoxes=False, recalcTimestamp=False)
def create_new_item(course: str, course_id: str, word: dict, headers: dict) -> str:
    """
    Add a single vocabulary item (cue + reading + definition) to an iKnow course.

    :param course: Human-readable course name (used only in failure records).
    :param course_id: Numeric iKnow course id the item is posted to.
    :param word: Dict with 'word', 'reading', 'definition', 'part_of_speech'.
    :param headers: HTTP headers (session cookies etc.) for the POST.
    :return: The new item's id from iKnow's JSON response, or '' if the word
             was skipped, failed to post, or the response could not be decoded.

    Side effects: appends to the module-level `failed_to_add` list and adds to
    the module-level `added` set on success.
    """
    def record_failure():
        # One place to record a word we could not upload (was duplicated
        # four times in the original).
        failed_to_add.append({
            'course': course,
            'course_id': course_id,
            'word': word['word']
        })

    add_new_item_url = 'https://iknow.jp/custom/courses/{course_id}/items'.format(course_id=course_id)
    # Don't try to add words we've added in the past
    if word['word'] in previously_added or word['word'] in added:
        return ''
    if word['definition'] == BAD_DEF or word['reading'] == BAD_READING:
        # The kindle json couldn't figure these out, let's not add them and move on.
        print('Either bad reading or def for: ' + word['word'])
        record_failure()
        return ''
    cur_word = urllib.parse.quote_plus(word['word'], encoding='utf-8')
    reading = urllib.parse.quote_plus(word['reading'], encoding='utf-8')
    definition = urllib.parse.quote_plus(word['definition'], encoding='utf-8')
    pos_list = word['part_of_speech'].split(',')
    pos = 'NONE'  # Default to none
    # FIX: the original looped with `for pos in pos_list`, shadowing the
    # 'NONE' default — after a full loop with no match, `pos` held the last
    # raw token instead of 'NONE' (the old "doesn't seem to work" TODO).
    # Iterate a separate name so the default survives a no-match loop.
    for candidate in pos_list:
        if candidate.lower() in valid_parts_of_speech:
            # NOTE(review): lookup uses the raw token, matching the original
            # code — confirm pos_map's key casing against its definition.
            pos = pos_map.get(candidate)
            break
    # implied else is either no PoS given, or can't map to anything. Keep as NONE
    '''
    This is the form iKnow sends:
    item[cue][text]=減点
    item[cue][language]=ja
    item[cue][transliteration]=げんてん
    item[cue][part_of_speech]=N
    item[response][text]=subtracting points
    item[response][language]=en
    '''
    cueString = 'item%5Bcue%5D%5Btext%5D={encodedCue}&item%5Bcue%5D%5Blanguage%5D={cueLang}&item%5Bcue%5D%5Btransliteration%5D={encodedCueTransliteration}&item%5Bcue%5D%5Bpart_of_speech%5D={cuePoS}'.format(encodedCue=cur_word, cueLang='ja', encodedCueTransliteration=reading, cuePoS=pos)
    responseString = '&item%5Bresponse%5D%5Btext%5D={responseText}&item%5Bresponse%5D%5Blanguage%5D={responseLang}'.format(responseText=definition, responseLang='en')
    payload = cueString + responseString
    '''
    Example payload:
    item%5Bcue%5D%5Btext%5D=鼻歌&item%5Bcue%5D%5Blanguage%5D=jp&item%5Bcue%5D%5Btransliteration%5D=はなうた&item%5Bcue%5D%5Bpart_of_speech%5D=&item%5Bresponse%5D%5Btext%5D=humming, crooning&item%5Bresponse%5D%5Blanguage%5D=en
    '''
    try:
        res = requests.post(add_new_item_url, data=payload, headers=headers)
    except requests.RequestException:
        # FIX: was a bare `except:` which also swallowed KeyboardInterrupt
        # and SystemExit; only network/HTTP-level failures belong here.
        record_failure()
        print('Failed to post new word ' + word['word'])
        return ''
    # Handler for wierd bug I encountered where res came back as None- maybe just due to forced exit
    # NOTE(review): `not res` is also True for any 4xx/5xx Response (requests'
    # Response.__bool__ is Response.ok), so this branch catches error statuses
    # too, not only None — preserved as-is to avoid changing behavior.
    if not res:
        record_failure()
        print('Failed to post new word ' + word['word'] + ' - no response')
        return ''
    res.encoding = 'utf-8'
    if res.status_code != requests.codes.ok:
        # Mark as a word we couldn't add
        record_failure()
    else:
        added.add(word['word'])
    try:
        res_decoded = brotlicffi.decompress(res.content)
    except brotlicffi.Error as e:
        print(str(e))
        print('Could not decompress for word: ' + word['word'] + '\'s response')
        print(str(res.content))
        # Don't treat this as a failure to add. Just ensure that we don't try
        # to add a sample sentence and return a blank string
        return ''
    json_res = json.loads(res_decoded)
    # Grab the ID for the new flashcard we just added
    word_id = json_res['id']
    return word_id
def _decompress(self, rawData):
    """Inflate *rawData* with Brotli and return the decompressed bytes."""
    decompressed = brotli.decompress(rawData)
    return decompressed
def test_compressed_data_roundtrips(s):
    """compress followed by decompress is the identity on byte strings."""
    round_tripped = brotlicffi.decompress(brotlicffi.compress(s))
    assert round_tripped == s
def test_decompression_fails_properly_on_garbage(bogus, exception_cls):
    """
    Feeding invalid input to decompress raises the documented exception
    rather than returning junk or crashing silently.
    """
    with pytest.raises(exception_cls):
        brotlicffi.decompress(bogus)
def unbrotli(data):
    """Decompresses data for Content-Encoding: br."""
    decoded = brotli.decompress(data)
    return decoded