def request(self):
    """Scrape the newsletter listing page and store newly seen newsletters.

    Logs and re-raises requests.exceptions.HTTPError on a failed fetch.
    """
    try:
        with Database() as session, Browser() as browser_session:
            url = self.url(Category.NEWSLETTER.value)
            response = browser_session.get(url)
            response.raise_for_status()
            soup = bs4.BeautifulSoup(response.content, 'html.parser')
            elements = soup.select('#content-left a')
            LOGGER.info('{} newsletters have been found'.format(len(elements)))
            if not elements:
                LOGGER.info('Done, no more newsletters')
                return
            # iterate oldest-first so records are created in page-reverse order
            for anchor in reversed(elements):
                href = anchor['href']
                title = anchor.text
                model, created = session.get_or_create(
                    Newsletter, url=href, title=title)
                if created:
                    LOGGER.info(f'{href} ...added')
                else:
                    LOGGER.info(f'Newsletter "{href}" ...skipped (duplicate)')
    except requests.exceptions.HTTPError as exc:
        LOGGER.warning('Scraping {} {} ...skipping'.format(
            url, exc.response.status_code))
        raise exc
def unexpected_tag(self, tag, tagName):
    """Log an unexpectedly encountered tag and skip its payload."""
    size = self.read_uint16()
    # cur() is just past the 2-byte length field, hence the -2
    position = self.fileObject.cur() - 2
    message = '[0x%x] tag %s(%s) appears unexpected, length: %d.' % (
        position, tagName, tag.encode('hex'), size)
    LOGGER.log(CustomLoggingLevel.EXTRA_DATA, message)
    # the recorded length includes the two length bytes already consumed
    self.fileObject.read(size - 2)
def read_scandata(self):
    """Read the entropy-coded scan data up to the EOI (0xFFD9) marker.

    Fills self.scanData, records self.scanDataPos / self.scanDataLength,
    and reports any bytes remaining after the marker as extra data.
    """
    curPos = self.fileObject.cur()
    self.scanDataPos = curPos
    LOGGER.log(CustomLoggingLevel.IMAGE_DEBUG, 'Start to read scan data.')
    # read all data to improve process speed
    tmpdata = self.fileObject.read(self.fileObject.size - curPos)
    index = 0
    # tag_eoi() is expected to clear self.scanFlag, which ends this loop
    while self.scanFlag == True:
        if tmpdata[index] == '\xff':
            if tmpdata[index + 1] == '\xd9':
                self.tag_eoi('\xff\xd9')
            else:
                # 0xFF followed by any other byte (e.g. stuffed 0x00)
                # belongs to the compressed stream
                self.scanData.append(tmpdata[index])
                self.scanData.append(tmpdata[index + 1])
                index += 2
        else:
            self.scanData.append(tmpdata[index])
            index += 1
    # NOTE(review): `index` is not advanced when EOI is hit, so the extra
    # data reported below starts at the 0xFF of the marker itself, and a
    # stream with no EOI would run index past the buffer — confirm intended.
    if index < len(tmpdata):
        self.showextradata(tmpdata[index:], curPos + index)
    self.scanDataLength = len(self.scanData)
    LOGGER.log(
        CustomLoggingLevel.IMAGE_INFO,
        'Scan data start at 0x%x, length: 0x%x.' % (curPos, self.scanDataLength))
def request(self):
    """Scrape paginated invoice listings and store new PDF invoices.

    Stops when a duplicate invoice is found, the listing is empty, or
    pagination loops back to the current URL. Logs and re-raises
    requests.exceptions.HTTPError on a failed fetch.
    """
    with Database() as session:
        with Browser() as browser_session:
            url = self.url(Category.INVOICE.value)
            loop = True
            while loop:
                try:
                    response = browser_session.get(url)
                    response.raise_for_status()
                    soup = bs4.BeautifulSoup(response.content, 'html.parser')
                    elements = soup.select(
                        '#dokumenty table.tabulka tr:not(.hlavicka)')
                    if len(elements) == 0:
                        LOGGER.info('Done, no more invoices')
                        return
                    for element in elements:
                        published, _, title, _, _, _, document = \
                            element.findChildren('td')
                        link = document.findChild('a').attrs.get('href')
                        size_in_mb = re.search(r'([0-9\.]+)',
                                               document.text).groups()[0]
                        # BUG FIX: test the document link, not the listing
                        # page URL — the old check classified every invoice
                        # as non-PDF and then hit the unbound `model` below.
                        is_pdf = re.search(r'\.pdf$', link)
                        if is_pdf:
                            model, created = session.get_or_create(
                                Invoice,
                                published=datetime.date.fromisoformat(
                                    published.text),
                                title=title.text,
                                url=link,
                                size_in_mb=size_in_mb)
                            if not created:
                                LOGGER.info(
                                    f'Invoice {model.url} ...skipped (duplicate)'
                                )
                                loop = False
                            else:
                                LOGGER.info(f'{model.url} ...added')
                        else:
                            # BUG FIX: `model` is not defined in this branch;
                            # report the link instead
                            LOGGER.warning(
                                f'Invoice {link} ...skipped (not PDF)')
                    next_url = soup.select_one(
                        '#dokumenty table:first-of-type [align="right"] a:nth-last-child(2)'
                    ).attrs.get('href')
                    next_url = urljoin(self.base_url(response.url), next_url)
                    # FIXME: first page can be w/o the page number
                    if next_url == url:
                        return
                    url = next_url
                except requests.exceptions.HTTPError as exc:
                    LOGGER.warning('Scraping {} {} ...skipping'.format(
                        url, exc.response.status_code))
                    raise exc
def showextradata(self, data, location):
    """Log leftover bytes; long payloads are reported by detected file type."""
    if len(data) <= 128:
        LOGGER.log(CustomLoggingLevel.EXTRA_DATA,
                   '[0x%x] > %s' % (location, data))
    else:
        # too long to dump verbatim — identify the embedded payload instead
        detected = FileObject(data).type()
        LOGGER.log(CustomLoggingLevel.EXTRA_DATA,
                   '[0x%x] %s' % (location, detected))
def tag_dri(self, tag):  # 0xFFDD Define Restart Interval
    """Parse the DRI segment and record the restart interval."""
    payload_len = self.read_uint16() - 2
    pos_label = '[0x%x]' % self.fileObject.cur()
    self.restartInterval = self.read_uint16()
    if payload_len != 2:
        # segment longer than the expected 2-byte interval — dump the rest
        extra = self.fileObject.read(payload_len - 2)
        LOGGER.log(CustomLoggingLevel.EXTRA_DATA,
                   '%s> %s' % (pos_label, extra))
    return self.find_tag('DRI')
def clean_bitstream_remainder(self):
    """Validate and drop the partially-consumed leading bitstream byte.

    Unused low bits should be all zeros or all ones; anything else is
    logged as suspicious extra data.
    """
    mask = myBitStreamMaskR[8 - self.bitStreamStart]
    remainder = self.streamBuffer[0] & mask
    if remainder != 0 and remainder != mask:
        # Typo fix in log message: "Unsual" -> "Unusual"
        LOGGER.log(
            CustomLoggingLevel.EXTRA_DATA,
            '?0x%x? Unusual end of bitstream, is %s. (0x%s)' %
            (self.scanDataIndex, bin(remainder), self.streamBuffer[0]))
    # discard the consumed byte and reset the bit cursor
    self.streamBuffer.remove(self.streamBuffer[0])
    self.bitStreamStart = 0
def tag_app1(self, tag):
    """Parse the APP1 (Exif) segment; skip it when the magic is wrong."""
    backCurPos = self.fileObject.cur()
    length = self.read_uint16()
    magic = self.fileObject.read(6)
    if magic != 'Exif\x00\x00':
        # Typo fix in log message: "Unbale" -> "Unable"
        LOGGER.warning('[0x%x] Unable to process magic %s in APP1.' %
                       (self.fileObject.cur(), magic))
        self.fileObject.read(length - 8)
        return self.find_tag('APP1')
    self.read_tiff(length - 8, 'Exif')
    # read_tiff can leave the cursor anywhere inside the TIFF body —
    # jump explicitly to the end of the APP1 segment
    self.fileObject.change_cur(backCurPos + length)
    return self.find_tag('APP1')
def tag_sof(self, tag):  # 0xFFC1~0xFFC7 0xFFC9~0xFFCF Start Of Frame
    """Parse a non-baseline SOF segment: bit depth and image dimensions."""
    length = self.read_uint16()
    self.encodeType = 'sofx'
    self.bitsPerPixel = self.fileObject.read_uint8()
    self.height = self.read_uint16()
    self.width = self.read_uint16()
    channel_byte = self.fileObject.read(1)
    if channel_byte != '\x03':
        LOGGER.error('[0x%x] Color type must be YCrCb(0x03) in JFIF.' %
                     self.fileObject.cur())
    # 3 components, 3 bytes each (id, sampling factors, table id) — unused
    comp = self.fileObject.read(9)
    return self.find_tag('SOFx')
def find_tag(self, tagName):
    """Return the next 2-byte marker, logging stray bytes that precede it.

    `tagName` identifies the just-finished segment in log messages only.
    """
    if self.fileObject.read(1) != '\xFF':
        curPos = '[0x%x]' % self.fileObject.cur()
        LOGGER.error('%s Can\'t find 0xFF in end of %s.' % (curPos, tagName))
        # collect the garbage bytes until the next 0xFF marker prefix
        data = []
        d = self.fileObject.read(1)
        while d != '\xFF':
            data.append(d)
            d = self.fileObject.read(1)
        LOGGER.log(CustomLoggingLevel.EXTRA_DATA,
                   '%s> %s' % (curPos, ''.join(data)))
    # at this point the 0xFF has been consumed; the next byte is the tag id
    return '\xff' + self.fileObject.read(1)
def tag_app0(self, tag):  # 0xFFE0 APP0
    """Parse the JFIF APP0 segment: version, thumbnail size and pixels."""
    length = self.read_uint16()
    magic = self.fileObject.read(5)
    if magic != 'JFIF\x00':
        # Typo fix in log message: "Unbale" -> "Unable"
        LOGGER.warning('[0x%x] Unable to process magic %s in APP0.' %
                       (self.fileObject.cur(), magic))
    self.version = self.read_uint16()
    # density unit (1 byte) + X/Y density (2 bytes each) are skipped
    self.fileObject.read(5)
    self.thumbnailX = self.fileObject.read_uint8()
    self.thumbnailY = self.fileObject.read_uint8()
    self.thumbnail = self.fileObject.read(length - 16)  # RGB pixel
    return self.find_tag('APP0')
def read_tiff(self, length, tagName):
    """Parse a TIFF header and hand off to IFD parsing.

    Picks little- or big-endian integer readers based on the 2-byte
    byte-order mark ('II' = little-endian).
    """
    tiffStartPos = self.fileObject.cur()
    if self.fileObject.read(2) == 'II':
        readers = (self.fileObject.read_uint16, self.fileObject.read_uint32)
    else:
        readers = (self.read_uint16, self.read_uint32)
    p_read_uint16, p_read_uint32 = readers
    if p_read_uint16() != 0x2a:
        LOGGER.warning('[0x%x] TIFF data format magic check failed.' %
                       tiffStartPos)
    first_ifd_offset = p_read_uint32()
    self.read_tiff_ifd(tiffStartPos, p_read_uint16, p_read_uint32,
                       first_ifd_offset, tagName)
def tag_app(self, tag):  # 0xFFE1~0xFFEE Application-specific
    """Log an application-specific APPn segment and skip its payload.

    APP1/2/13/14 (Exif, ICC, IPTC, Adobe) are the commonly-seen markers;
    the remaining APPn tags are rarely used and flagged as such.
    """
    appID = (ord(tag[0]) << 8) + ord(tag[1]) - 0xFFE0
    length = self.read_uint16() - 2
    data = self.fileObject.read(length)
    # BUG FIX: branches were inverted — the "usually not used" warning
    # belongs to the uncommon tags, not to APP1/2/13/14.
    if appID in [1, 2, 13, 14]:
        LOGGER.log(
            CustomLoggingLevel.OTHER_DATA,
            '[0x%x] Tag APP%d found.' %
            (self.fileObject.cur() - length, appID))
    else:
        LOGGER.log(
            CustomLoggingLevel.OTHER_DATA,
            '[0x%x] Tag APP%d found, this tag usually not used in file.' %
            (self.fileObject.cur() - length, appID))
    return self.find_tag('APP%d' % appID)
def start(self):
    """Drive JPEG parsing: verify SOI, then dispatch tags until scan data."""
    if self.fileObject.read(2) != '\xff\xd8':
        LOGGER.error('JPEG file start mark 0xFFD8 check failed.')
        return
    # start of JPEG file
    tag = self.fileObject.read(2)
    while not self.scanFlag and tag is not None:
        try:
            # handlers return the next tag; unknown tags fall through
            tag = self.tagMap[tag](tag)
        except KeyError:
            tag = self.tag_unknown(tag)
    LOGGER.log(
        CustomLoggingLevel.IMAGE_INFO,
        'JPEG (ver %d.%d): %d*%dpx , channel: %d, fileLength: 0x%x b.' %
        (self.version >> 8, self.version & 0xff, self.width, self.height,
         self.channel, self.fileObject.size))
def read_tiff_ifd(self, tiffStartPos, p_read_uint16, p_read_uint32,
                  dirEntryPos, tagName):
    """Walk a chain of TIFF IFDs and log every directory entry.

    Recurses into the SubExif (0x8769) and Interoperability (0xa005)
    sub-IFDs. Entries with unknown tags or data formats are logged and
    skipped.
    """
    dirCount = 0
    while dirEntryPos != 0:
        entryCount = p_read_uint16(tiffStartPos + dirEntryPos)
        LOGGER.log(
            CustomLoggingLevel.IMAGE_DEBUG,
            '[%s] Tiff data start at 0x%x, directory index: %d, start at: 0x%x, entry count: %d.'
            % (tagName, tiffStartPos, dirCount, dirEntryPos, entryCount))
        for i in range(entryCount):
            try:
                dirTag = p_read_uint16(tiffStartPos + dirEntryPos + 2 + 12 * i)
                dataFormat = p_read_uint16()
                nComponent = p_read_uint32()
                dataLength = nComponent * tiffEnumDataTypeLength[dataFormat]
                if dataLength > 4:
                    # value doesn't fit in the 4-byte entry slot — it is
                    # stored elsewhere at the given offset
                    dataStartPos = p_read_uint32()
                    data = self.fileObject.read(dataLength,
                                                tiffStartPos + dataStartPos)
                else:
                    data = self.fileObject.read(4)
                if dirTag == 0x8769:
                    self.read_tiff_ifd(
                        tiffStartPos, p_read_uint16, p_read_uint32,
                        p_read_uint32(tiffStartPos + dirEntryPos + 10 + 12 * i),
                        'SubExif')
                elif dirTag == 0xa005:
                    self.read_tiff_ifd(
                        tiffStartPos, p_read_uint16, p_read_uint32,
                        p_read_uint32(tiffStartPos + dirEntryPos + 10 + 12 * i),
                        'ExifInteroperability')
                if dataFormat == 2:
                    LOGGER.log(
                        CustomLoggingLevel.IMAGE_INFO,
                        '[%s - %s](string)> %s' %
                        (tagName, exifEnumTag[dirTag],
                         data.replace('\x00', '')))
                else:
                    LOGGER.log(
                        CustomLoggingLevel.IMAGE_INFO,
                        '[%s - %s](%s)> Hex:%s' %
                        (tagName, exifEnumTag[dirTag],
                         tiffEnumDataType[dataFormat], data.encode('hex')))
            # BUG FIX: `except KeyError or IndexError` evaluates to
            # `except KeyError` only — IndexError was never caught.
            except (KeyError, IndexError):
                LOGGER.warning(
                    '[0x%x] Unable to decode dataformat or entrytag in tiff data, tagName: %s, dirTag: 0x%x, dataFormat: 0x%x, directory: %d/%d.'
                    % (self.fileObject.cur(), tagName, dirTag, dataFormat, i,
                       entryCount))
        dirCount += 1
        dirEntryPos = p_read_uint32(tiffStartPos + dirEntryPos + 2 +
                                    12 * entryCount)
def tag_sos(self, tag):  # 0xFFDA Start Of Scan
    """Parse the SOS header and set scanFlag so scan data is read next.

    Does not call find_tag(): the entropy-coded stream follows directly.
    """
    self.scanFlag = True
    length = self.read_uint16() - 2
    if self.fileObject.read(1) != '\x03':
        LOGGER.error('[0x%x] Color type must be YCrCb(0x03) in JFIF.' %
                     self.fileObject.cur())
    # NOTE(review): only 3 bytes are read for 3 components, each treated as
    # a packed DC/AC table byte; the trailing read(3) below consumes the
    # remaining header bytes — confirm this byte accounting is intended.
    comp = self.fileObject.read(3)
    for i in range(3):
        self.scanQuantization[i] = {
            'DC': ord(comp[i]) >> 4,
            'AC': ord(comp[i]) & 0xf
        }
    self.scanSs = self.fileObject.read(1)
    self.scanSe = self.fileObject.read(1)
    self.scanAh = ord(self.fileObject.read(1))
    self.scanAl = self.scanAh & 0xf
    self.scanAh = self.scanAh >> 4
    self.fileObject.read(3)
def tag_dht(self, tag):  # 0xFFC4 Define Huffman Table(s)
    """Parse DHT segments, decoding each contained Huffman table.

    Tables with a zero upper nibble are DC (ids 0-1); a non-zero upper
    nibble marks AC tables, mapped to ids 2-3.
    """
    length = self.read_uint16() - 2
    while length > 0:
        tableIDByte = self.fileObject.read_uint8()
        if tableIDByte >> 4 == 0:
            tableID = tableIDByte & 0xf
        else:
            # BUG FIX (latent precedence): `2 + tableIDByte & 0xf` parsed
            # as `(2 + tableIDByte) & 0xf`; mask the low nibble first.
            tableID = 2 + (tableIDByte & 0xf)
        if tableID < 4:
            length -= self.huffmantree_decode(tableID) + 1
        else:
            LOGGER.log(
                CustomLoggingLevel.EXTRA_DATA,
                '[0x%x] Unknown part of huffman table' %
                (self.fileObject.cur() - 1))
            self.fileObject.read(length)  # skip unknown part
            break
    return self.find_tag('DHT')
def tag_sof0(self, tag):  # 0xFFC0 Start Of Frame
    """Parse the baseline SOF0 segment: dimensions and channel sampling."""
    length = self.read_uint16()
    self.encodeType = 'sof0'
    self.dctTransform = self.fileObject.read_uint8()
    self.bitsPerPixel = 8
    self.height = self.read_uint16()
    self.width = self.read_uint16()
    if self.fileObject.read(1) != '\x03':
        LOGGER.error('[0x%x] Color type must be YCrCb(0x03) in JFIF.' %
                     self.fileObject.cur())
    comp = self.fileObject.read(9)
    # 3 bytes per channel: component id, packed sampling factors, table id
    for offset in (0, 3, 6):
        sampling = ord(comp[offset + 1])
        self.colorQuantization[ord(comp[offset])] = {
            'Horz': sampling >> 4,
            'Vert': sampling & 0xf,
            'TableID': ord(comp[offset + 2])
        }
    return self.find_tag('SOF0')
def __init__(self, max_retries: int = 5):
    """Build a requests Session with browser-like headers and retry logic.

    :param max_retries: total retry attempts per request; 0 disables retries.
    """
    LOGGER.info("Creating browser session")
    self.session = Session()
    LOGGER.info("Injecting headers into the browser")
    browser_headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/88.0",
        "Accept-Language": "sk,en-US;q=0.7,en;q=0.3",
    }
    self.session.headers.update(browser_headers)
    if max_retries:
        # retry transient failures on both schemes
        adapter = HTTPAdapter(max_retries=Retry(total=max_retries))
        for scheme in ("https://", "http://"):
            self.session.mount(scheme, adapter)
    super().__init__()
def cli():
    """Run every scraper plugin in sequence and return an exit code."""
    return_code = 0
    LOGGER.info('Start')
    plugins = (
        MeetingPlugin,
        AnnouncementPlugin,
        NewsletterPlugin,
        BudgetPlugin,
        ProcurementPlugin,
        ReportPlugin,
        OrderPlugin,
        InvoicePlugin,
        ContractPlugin,
        TablePlugin,
        ResolutionPlugin,
        TranscriptPlugin,
        VZNPlugin,
    )
    for plugin_cls in plugins:
        plugin_cls().run()
    LOGGER.info('Done')
    return return_code
def asc_detect(filename, min_length=5):
    """Scan a file and log every run of printable ASCII of min_length or more.

    Runs are logged at ASCII_DATA level, labelled with the position just
    past the run's end.
    """
    LOGGER.log(CustomLoggingLevel.OTHER_DATA, "--- ascii detect start --- ")

    def is_readable(c):
        readable_chars = "abcdefghijklmnopqrstuvwxyz" + \
                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + \
                         "0123456789" + \
                         "`~!@#$%^&*()_+[]\{}|;':\",./<>" + \
                         " " + '\n' + '\t' + '\r'
        return c < 128 and chr(c) in readable_chars

    file_object = FileObject(filename)
    pre = -1  # index of the last unreadable byte seen
    data = ""
    for i in xrange(file_object.size):
        byte = file_object.read_uint8()
        if not is_readable(byte):
            length = i - pre - 1
            pre = i
            if length >= min_length:
                LOGGER.log(CustomLoggingLevel.ASCII_DATA,
                           "[ascii] at pos 0x%x:\n" % i + data)
            data = ""
        else:
            data += chr(byte)
    # BUG FIX: a readable run that reaches EOF was silently dropped —
    # flush it here as well.
    if file_object.size - pre - 1 >= min_length:
        LOGGER.log(CustomLoggingLevel.ASCII_DATA,
                   "[ascii] at pos 0x%x:\n" % file_object.size + data)
    LOGGER.log(CustomLoggingLevel.OTHER_DATA, "--- ascii detect finished --- ")
def get_images(self):
    """Decode every GIF frame into an Image with resolved palette colors.

    Returns a list of Image objects; frames whose decoded pixel count
    mismatches width*height are logged as suspicious.
    """
    result = []
    for image in self.images:
        # a frame may carry its own palette; otherwise use the global one
        color_table = self.globalColorTable
        if "localColorTableFlag" in image and image[
                "localColorTableFlag"] == 1:
            color_table = image["localColorTable"]
        data = self.lzw_decode(image["data"], image["LZWMinimumCodeSize"])
        cur = Image()
        cur.w = image["width"]
        cur.h = image["height"]
        cur.data = [color_table[i] for i in data]
        result.append(cur)
        if len(cur.data) != cur.w * cur.h:
            # BUG FIX: referenced the non-existent `self.result` (would
            # raise AttributeError); use the local list instead.
            LOGGER.log(
                CustomLoggingLevel.OTHER_DATA,
                "image %d has wrong width or height " % len(result))
    return result
def request(self):
    """Scrape the budget listing page and store newly seen budgets.

    A title without a digit gets the preceding <h2> text appended (the
    year lives in the section heading). Logs and re-raises
    requests.exceptions.HTTPError on a failed fetch.
    """
    try:
        with Database() as session, Browser() as browser_session:
            url = self.url(Category.BUDGET.value)
            response = browser_session.get(url)
            response.raise_for_status()
            soup = bs4.BeautifulSoup(response.content, 'html.parser')
            elements = soup.select('#content-left a')
            LOGGER.info('{} budgets have been found'.format(len(elements)))
            if not elements:
                LOGGER.info('Done, no more budgets')
                return
            # iterate oldest-first so records are created in page-reverse order
            for anchor in reversed(elements):
                href = anchor['href']
                title = anchor.text
                if not re.search(r'\d', title):
                    # no year in the link text — borrow it from the heading
                    suffix = anchor.findPreviousSibling('h2').text
                    title = f'{title} {suffix}'
                model, created = session.get_or_create(
                    Budget, url=href, title=title)
                if created:
                    LOGGER.info(f'{href} ...added')
                else:
                    LOGGER.info(f'Budget "{href}" ...skipped (duplicate)')
    except requests.exceptions.HTTPError as exc:
        LOGGER.warning('Scraping {} {} ...skipping'.format(
            url, exc.response.status_code))
        raise exc
def request(self):
    """Scrape the report table and store newly seen reports.

    Logs and re-raises requests.exceptions.HTTPError on a failed fetch.
    """
    try:
        with Database() as session, Browser() as browser_session:
            url = self.url(Category.REPORT.value)
            response = browser_session.get(url)
            response.raise_for_status()
            soup = bs4.BeautifulSoup(response.content, 'html.parser')
            elements = soup.select('#content-left tr')
            LOGGER.info('{} reports have been found'.format(len(elements)))
            if not elements:
                LOGGER.info('Done, no more reports')
                return
            for row in elements:
                date, title = row.findChildren('td')
                # collapse whitespace runs inside the title
                normalized = re.sub(r'[\n\s]+', ' ', title.text)
                model, created = session.get_or_create(
                    Report, date=date.text.strip(), title=normalized.strip())
                if created:
                    LOGGER.info(f'"{model.title[:40]}..." ...added')
                else:
                    LOGGER.info(
                        f'Report "{model.title[:30]}..." ...skipped (duplicate)'
                    )
    except requests.exceptions.HTTPError as exc:
        LOGGER.warning('Scraping {} {} ...skipping'.format(
            url, exc.response.status_code))
        raise exc
plugin_folder = project_path('commands')


class CLI(click.MultiCommand):
    """Click multi-command that discovers subcommands as .py plugin files."""

    def list_commands(self, ctx):
        """Return the sorted names of every plugin module in plugin_folder."""
        names = [
            entry[:-3] for entry in os.listdir(plugin_folder)
            if entry.endswith('.py')
        ]
        return sorted(names)

    def get_command(self, ctx, name):
        """Load plugin_folder/<name>.py and return its `cli` entry point."""
        namespace = {}
        path = os.path.join(plugin_folder, name + '.py')
        with open(path) as handle:
            code = compile(handle.read(), path, 'exec')
        # eval() of an 'exec'-mode code object runs the module body
        eval(code, namespace, namespace)
        return namespace['cli']


cli = CLI()

if __name__ == '__main__':
    try:
        LOGGER.info(' '.join(sys.argv))
        sys.exit(cli(standalone_mode=False))
    except Exception as e:
        LOGGER.exception(e)
def after_request(self):
    """Log completion of the invoice scrape."""
    # Typo fix in log message: "scrapping" -> "scraping"
    LOGGER.info('Finished scraping invoices')
def before_request(self):
    """Log the start of the invoice scrape."""
    # Typo fix in log message: "scrapping" -> "scraping"
    LOGGER.info('Start scraping invoices')
def after_request(self):
    """Log completion of the newsletter scrape."""
    # Typo fix in log message: "scrapping" -> "scraping"
    LOGGER.info('Finished scraping newsletter')
def before_request(self):
    """Log the start of the newsletter scrape."""
    # Typo fix in log message: "scrapping" -> "scraping"
    LOGGER.info('Start scraping newsletter')
def _log_post_request(self, response):
    """Log a response's status and JSON body, then pass the response through."""
    # NOTE(review): attribute access (.status, .json without a call) suggests
    # this is not a requests.Response (which exposes .status_code / .json()) —
    # confirm the actual response type used here.
    LOGGER.info(f'Response status: {response.status}')
    LOGGER.info(f'Response json: {response.json}')
    return response