def request(self):
    try:
        with Database() as session:
            with Browser() as browser_session:
                url = self.url(Category.NEWSLETTER.value)
                response = browser_session.get(url)
                response.raise_for_status()
                soup = bs4.BeautifulSoup(response.content, 'html.parser')
                elements = soup.select('#content-left a')
                LOGGER.info('{} newsletters have been found'.format(
                    len(elements)))
                if len(elements) == 0:
                    LOGGER.info('Done, no more newsletters')
                    return
                for element in reversed(elements):
                    href = element['href']
                    title = element.text
                    model, created = session.get_or_create(Newsletter,
                                                           url=href,
                                                           title=title)
                    if not created:
                        LOGGER.info(
                            f'Newsletter "{href}" ...skipped (duplicate)')
                    else:
                        LOGGER.info(f'{href} ...added')
    except requests.exceptions.HTTPError as exc:
        LOGGER.warning('Scraping {} {} ...skipping'.format(
            url, exc.response.status_code))
        raise exc
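# The scrapers in this module call `session.get_or_create(Model, **fields)` on
# the Database session wrapper, which is defined elsewhere. The function below
# is only an illustrative sketch of that idempotent insert pattern, written as
# a free function against a SQLAlchemy-style session; the real helper is a
# method on the wrapper and may differ in detail.
def get_or_create(session, model, **kwargs):
    """Return (instance, created): fetch a row matching kwargs or insert it."""
    instance = session.query(model).filter_by(**kwargs).first()
    if instance is not None:
        return instance, False
    instance = model(**kwargs)
    session.add(instance)
    session.commit()
    return instance, True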
def request(self):
    with Database() as session:
        with Browser() as browser_session:
            url = self.url(Category.INVOICE.value)
            loop = True
            while loop:
                try:
                    response = browser_session.get(url)
                    response.raise_for_status()
                    soup = bs4.BeautifulSoup(response.content, 'html.parser')
                    elements = soup.select(
                        '#dokumenty table.tabulka tr:not(.hlavicka)')
                    if len(elements) == 0:
                        LOGGER.info('Done, no more invoices')
                        return
                    for element in elements:
                        published, _, title, _, _, _, document = element.findChildren(
                            'td')
                        link = document.findChild('a').attrs.get('href')
                        size_in_mb = re.search(r'([0-9\.]+)',
                                               document.text).groups()[0]
                        # Check the document link itself, not the listing page URL.
                        is_pdf = re.search(r'\.pdf$', link)
                        if is_pdf:
                            model, created = session.get_or_create(
                                Invoice,
                                published=datetime.date.fromisoformat(
                                    published.text),
                                title=title.text,
                                url=link,
                                size_in_mb=size_in_mb)
                            if not created:
                                LOGGER.info(
                                    f'Invoice {model.url} ...skipped (duplicate)')
                                loop = False
                            else:
                                LOGGER.info(f'{model.url} ...added')
                        else:
                            LOGGER.warning(
                                f'Invoice {link} ...skipped (not PDF)')
                    next_url = soup.select_one(
                        '#dokumenty table:first-of-type [align="right"] a:nth-last-child(2)'
                    ).attrs.get('href')
                    next_url = urljoin(self.base_url(response.url), next_url)
                    # FIXME: first page can be w/o the page number
                    if next_url == url:
                        return
                    url = next_url
                except requests.exceptions.HTTPError as exc:
                    LOGGER.warning('Scraping {} {} ...skipping'.format(
                        url, exc.response.status_code))
                    raise exc
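# The invoice scraper resolves the relative "next page" href against the
# current page with urljoin. A minimal illustration (the URL and query
# parameter below are made up, not taken from the real site):
#
#     from urllib.parse import urljoin
#     urljoin('https://example.org/dokumenty/faktury?strana=2', '?strana=3')
#     # -> 'https://example.org/dokumenty/faktury?strana=3'
#
# The loop then stops either when the resolved next_url equals the current
# url or when a previously stored invoice is found (loop is set to False).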
def request(self):
    try:
        with Database() as session:
            with Browser() as browser_session:
                url = self.url(Category.PROCUREMENT.value)
                response = browser_session.get(url)
                response.raise_for_status()
                soup = bs4.BeautifulSoup(response.content, 'html.parser')
                main_element = soup.select_one('#content-left ol')
                procurements = main_element.select('a')
                offers = main_element.find_next_siblings('a')
                LOGGER.info('{} procurements have been found'.format(
                    len(procurements)))
                if len(procurements) == 0:
                    LOGGER.info('Done, no more procurements')
                for element in procurements:
                    href = element['href']
                    title = element.text
                    model, created = session.get_or_create(Procurement,
                                                           url=href,
                                                           title=title)
                    if not created:
                        LOGGER.info(
                            f'Procurement "{href}" ...skipped (duplicate)')
                    else:
                        LOGGER.info(f'{href} ...added')
                LOGGER.info('{} offers have been found'.format(len(offers)))
                if len(offers) == 0:
                    LOGGER.info('Done, no more offers')
                    return
                for element in offers:
                    href = element['href']
                    title = element.text
                    model, created = session.get_or_create(Procurement,
                                                           url=href,
                                                           title=title,
                                                           is_offer=True)
                    if not created:
                        LOGGER.info(
                            f'Offer "{href}" ...skipped (duplicate)')
                    else:
                        LOGGER.info(f'{href} ...added')
    except requests.exceptions.HTTPError as exc:
        LOGGER.warning('Scraping {} {} ...skipping'.format(
            url, exc.response.status_code))
        raise exc
def read_tiff_ifd(self, tiffStartPos, p_read_uint16, p_read_uint32,
                  dirEntryPos, tagName):
    dirCount = 0
    while dirEntryPos != 0:
        entryCount = p_read_uint16(tiffStartPos + dirEntryPos)
        LOGGER.log(
            CustomLoggingLevel.IMAGE_DEBUG,
            '[%s] Tiff data start at 0x%x, directory index: %d, start at: 0x%x, entry count: %d.'
            % (tagName, tiffStartPos, dirCount, dirEntryPos, entryCount))
        for i in range(entryCount):
            try:
                dirTag = p_read_uint16(tiffStartPos + dirEntryPos + 2 + 12 * i)
                dataFormat = p_read_uint16()
                nComponent = p_read_uint32()
                dataLength = nComponent * tiffEnumDataTypeLength[dataFormat]
                if dataLength > 4:
                    # Values longer than 4 bytes are stored elsewhere;
                    # the entry holds an offset to them.
                    dataStartPos = p_read_uint32()
                    data = self.fileObject.read(dataLength,
                                                tiffStartPos + dataStartPos)
                else:
                    # Values of 4 bytes or less are stored inline in the entry.
                    data = self.fileObject.read(4)
                if dirTag == 0x8769:
                    # Exif sub-IFD pointer
                    self.read_tiff_ifd(
                        tiffStartPos, p_read_uint16, p_read_uint32,
                        p_read_uint32(tiffStartPos + dirEntryPos + 10 + 12 * i),
                        'SubExif')
                elif dirTag == 0xa005:
                    # Exif Interoperability IFD pointer
                    self.read_tiff_ifd(
                        tiffStartPos, p_read_uint16, p_read_uint32,
                        p_read_uint32(tiffStartPos + dirEntryPos + 10 + 12 * i),
                        'ExifInteroperability')
                if dataFormat == 2:
                    LOGGER.log(
                        CustomLoggingLevel.IMAGE_INFO,
                        '[%s - %s](string)> %s' %
                        (tagName, exifEnumTag[dirTag],
                         data.replace('\x00', '')))
                else:
                    LOGGER.log(
                        CustomLoggingLevel.IMAGE_INFO,
                        '[%s - %s](%s)> Hex:%s' %
                        (tagName, exifEnumTag[dirTag],
                         tiffEnumDataType[dataFormat], data.encode('hex')))
            except (KeyError, IndexError):
                LOGGER.warning(
                    '[0x%x] Unable to decode dataformat or entrytag in tiff data, '
                    'tagName: %s, dirTag: 0x%x, dataFormat: 0x%x, directory: %d/%d.'
                    % (self.fileObject.cur(), tagName, dirTag, dataFormat, i,
                       entryCount))
        dirCount += 1
        dirEntryPos = p_read_uint32(tiffStartPos + dirEntryPos + 2 +
                                    12 * entryCount)
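# read_tiff_ifd indexes into tiffEnumDataTypeLength / tiffEnumDataType, which
# are defined elsewhere in this module. For reference only, a minimal sketch
# of what such a table contains, based on the standard TIFF 6.0 field types
# (the actual definitions in this project may differ in shape and naming):
_TIFF_TYPE_SKETCH = {
    # type id: (name, size in bytes)
    1: ('BYTE', 1), 2: ('ASCII', 1), 3: ('SHORT', 2), 4: ('LONG', 4),
    5: ('RATIONAL', 8), 6: ('SBYTE', 1), 7: ('UNDEFINED', 1), 8: ('SSHORT', 2),
    9: ('SLONG', 4), 10: ('SRATIONAL', 8), 11: ('FLOAT', 4), 12: ('DOUBLE', 8),
}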
def tag_app1(self, tag):
    backCurPos = self.fileObject.cur()
    length = self.read_uint16()
    magic = self.fileObject.read(6)
    if magic != 'Exif\x00\x00':
        LOGGER.warning('[0x%x] Unable to process magic %s in APP1.' %
                       (self.fileObject.cur(), magic))
        self.fileObject.read(length - 8)
        return self.find_tag('APP1')
    self.read_tiff(length - 8, 'Exif')
    self.fileObject.change_cur(backCurPos + length)
    return self.find_tag('APP1')
def tag_app0(self, tag):
    # 0xFFE0 APP0
    length = self.read_uint16()
    magic = self.fileObject.read(5)
    if magic != 'JFIF\x00':
        LOGGER.warning('[0x%x] Unable to process magic %s in APP0.' %
                       (self.fileObject.cur(), magic))
    self.version = self.read_uint16()
    self.fileObject.read(5)
    self.thumbnailX = self.fileObject.read_uint8()
    self.thumbnailY = self.fileObject.read_uint8()
    self.thumbnail = self.fileObject.read(length - 16)  # RGB pixel
    return self.find_tag('APP0')
def read_tiff(self, length, tagName):
    tiffStartPos = self.fileObject.cur()
    if self.fileObject.read(2) == 'II':
        # 'II' marks little-endian (Intel) byte order, 'MM' big-endian (Motorola).
        p_read_uint16 = self.fileObject.read_uint16
        p_read_uint32 = self.fileObject.read_uint32
    else:
        p_read_uint16 = self.read_uint16
        p_read_uint32 = self.read_uint32
    if p_read_uint16() != 0x2a:
        LOGGER.warning('[0x%x] TIFF data format magic check failed.' %
                       tiffStartPos)
    dirEntryPos = p_read_uint32()
    self.read_tiff_ifd(tiffStartPos, p_read_uint16, p_read_uint32, dirEntryPos,
                       tagName)
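# The TIFF header parsed by read_tiff is 8 bytes: a 2-byte byte-order mark
# ('II' or 'MM'), the magic number 42 (0x2a), and the offset of the first IFD.
# A self-contained sketch of that layout using struct, independent of the
# file-object wrapper used above (for illustration only):
import struct


def parse_tiff_header(raw):
    """Return (endianness prefix, first IFD offset) from an 8-byte TIFF header."""
    endian = '<' if raw[:2] == b'II' else '>'
    magic, ifd_offset = struct.unpack(endian + 'HI', raw[2:8])
    if magic != 0x2a:
        raise ValueError('not a TIFF header')
    return endian, ifd_offset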
def request(self):
    try:
        with Database() as session:
            with Browser() as browser_session:
                url = self.url(Category.BUDGET.value)
                response = browser_session.get(url)
                response.raise_for_status()
                soup = bs4.BeautifulSoup(response.content, 'html.parser')
                elements = soup.select('#content-left a')
                LOGGER.info('{} budgets have been found'.format(
                    len(elements)))
                if len(elements) == 0:
                    LOGGER.info('Done, no more budgets')
                    return
                for element in reversed(elements):
                    href = element['href']
                    title = element.text
                    if not re.search(r'\d', title):
                        # Titles without any digit get the text of the
                        # preceding <h2> heading appended.
                        suffix = element.findPreviousSibling('h2').text
                        title = f'{title} {suffix}'
                    model, created = session.get_or_create(Budget,
                                                           url=href,
                                                           title=title)
                    if not created:
                        LOGGER.info(
                            f'Budget "{href}" ...skipped (duplicate)')
                    else:
                        LOGGER.info(f'{href} ...added')
    except requests.exceptions.HTTPError as exc:
        LOGGER.warning('Scraping {} {} ...skipping'.format(
            url, exc.response.status_code))
        raise exc
def request(self):
    try:
        with Database() as session:
            with Browser() as browser_session:
                url = self.url(Category.REPORT.value)
                response = browser_session.get(url)
                response.raise_for_status()
                soup = bs4.BeautifulSoup(response.content, 'html.parser')
                elements = soup.select('#content-left tr')
                LOGGER.info('{} reports have been found'.format(
                    len(elements)))
                if len(elements) == 0:
                    LOGGER.info('Done, no more reports')
                    return
                for element in elements:
                    date, title = element.findChildren('td')
                    title = re.sub(r'[\n\s]+', ' ', title.text)
                    model, created = session.get_or_create(
                        Report, date=date.text.strip(), title=title.strip())
                    if not created:
                        LOGGER.info(
                            f'Report "{model.title[:30]}..." ...skipped (duplicate)'
                        )
                    else:
                        LOGGER.info(f'"{model.title[:40]}..." ...added')
    except requests.exceptions.HTTPError as exc:
        LOGGER.warning('Scraping {} {} ...skipping'.format(
            url, exc.response.status_code))
        raise exc
def __init__(self, file_object):
    # file_object.addHandler(logging.StreamHandler())
    self.fileObject = file_object
    if file_object.read(3) != 'GIF':
        LOGGER.error("File is not a gif file")
    self.type = "GIF"
    self.version = file_object.read(3)
    if self.version != '87a' and self.version != '89a':
        LOGGER.log(CustomLoggingLevel.OTHER_DATA, "Invalid version")
    else:
        LOGGER.log(CustomLoggingLevel.BASIC_DEBUG,
                   "version is " + self.version)

    # Logical screen descriptor
    self.logicScreenWidth = file_object.read_uint16()
    self.logicScreenHeight = file_object.read_uint16()
    mask = file_object.read_uint8()
    self.pixel = mask & 0b111
    mask >>= 3
    self.sortFlag = mask & 0b1
    mask >>= 1
    self.colorResolution = mask & 0b111
    mask >>= 3
    self.globalColorTableFlag = mask & 0b1
    if self.version == "89a":
        self.backgroundColorIndex = file_object.read_uint8()
        self.pixelAspectRatio = file_object.read_uint8()

    # self.globalColorTable = [[0, 0, 0]] * (2 ** (self.pixel + 1))
    if self.globalColorTableFlag:
        self.globalColorTable = [[0, 0, 0]
                                 for _ in range(2**(self.pixel + 1))]
    else:
        self.globalColorTable = []
    LOGGER.log(CustomLoggingLevel.OTHER_DATA,
               "global table size is %d" % len(self.globalColorTable))
    for i in range(len(self.globalColorTable)):
        for j in range(3):  # 0 red 1 green 2 blue
            self.globalColorTable[i][j] = file_object.read_uint8()

    self.images = []
    image = {}
    while True:
        tag = file_object.read_uint8()
        if tag == 0x3b:
            LOGGER.log(CustomLoggingLevel.OTHER_DATA, "gif end")
            break  # end of gif
        if tag == 0x2c:
            # start of an image descriptor
            # LOGGER.info("image descriptor")
            LOGGER.log(CustomLoggingLevel.IMAGE_DEBUG, "image descriptor")
            image["xOffset"] = file_object.read_uint16()
            image["yOffset"] = file_object.read_uint16()
            image["width"] = file_object.read_uint16()
            image["height"] = file_object.read_uint16()
            if image["xOffset"] + image["width"] > self.logicScreenWidth or \
                    image["yOffset"] + image["height"] > self.logicScreenHeight:
                LOGGER.log(
                    CustomLoggingLevel.OTHER_DATA,
                    "some part out of logic screen at image %d" %
                    (len(self.images) + 1))
            mask = file_object.read_uint8()
            image["pixel"] = mask & 0b111
            mask >>= 3
            image["reserved"] = mask & 0b11
            if image["reserved"] != 0:
                LOGGER.log(
                    CustomLoggingLevel.OTHER_DATA,
                    "[0x%x] reserved data should be 0" % self.fileObject.cur())
            mask >>= 2
            image["sortFlag"] = mask & 0b1
            mask >>= 1
            image["interlaceFlag"] = mask & 0b1
            mask >>= 1
            image["localColorTableFlag"] = mask & 0b1
            if image["localColorTableFlag"]:
                image["localColorTable"] = [
                    [0, 0, 0] for _ in range(2**(image["pixel"] + 1))
                ]
                for i in range(len(image["localColorTable"])):
                    for j in range(3):  # 0 red 1 green 2 blue
                        image["localColorTable"][i][j] = \
                            file_object.read_uint8()
        elif tag == 0x21:
            if self.version != "89a":
                LOGGER.log(CustomLoggingLevel.OTHER_DATA,
                           "not version 89a but has extension segment.")
            sub_tag = file_object.read_uint8()
            if sub_tag == 0xF9:
                # Graphic Control Extension.
                LOGGER.log(CustomLoggingLevel.IMAGE_DEBUG,
                           "Graphic Control Extension")
                block_size = file_object.read_uint8()
                if block_size != 4:
                    LOGGER.log(
                        CustomLoggingLevel.OTHER_DATA,
                        "block size is not 4 in Graphic Control Extension")
                control = {}
                mask = file_object.read_uint8()
                control["transparentFlag"] = mask & 0b1
                mask >>= 1
                control["userInputFlag"] = mask & 0b1
                mask >>= 1
                control["disposalMethod"] = mask & 0b111
                # 0 - No disposal specified. The decoder is
                #     not required to take any action.
                # 1 - Do not dispose. The graphic is to be left
                #     in place.
                # 2 - Restore to background color. The area used by the
                #     graphic must be restored to the background color.
                # 3 - Restore to previous. The decoder is required to
                #     restore the area overwritten by the graphic with
                #     what was there prior to rendering the graphic.
                # 4-7 - To be defined.
                control["delayTime"] = file_object.read_uint16()
                control["TransparentColorIndex"] = file_object.read_uint8()
                terminator = file_object.read_uint8()
                if terminator != 0:
                    LOGGER.log(
                        CustomLoggingLevel.OTHER_DATA,
                        "[0x%x] terminator in block Graphic Control Extension is not 0"
                        % self.fileObject.cur())
                image["control"] = control
            elif sub_tag == 0xFE:
                # Comment Extension.
                LOGGER.log(CustomLoggingLevel.IMAGE_DEBUG,
                           "Comment Extension.")
                if "comment" not in image:
                    image["comment"] = ""
                while True:
                    tmp = file_object.read(1)
                    if tmp == '\0':
                        break
                    image["comment"] += tmp
                LOGGER.log(CustomLoggingLevel.ASCII_DATA, image["comment"])
            elif sub_tag == 0x01:
                # plain text Extension
                LOGGER.log(CustomLoggingLevel.IMAGE_DEBUG,
                           "plain text Extension")
                block_size = file_object.read_uint8()
                if block_size != 12:
                    LOGGER.warning("block size is not 12 in plain text")
                text = {
                    "gridLeftPosition": file_object.read_uint16(),
                    "gridTopPosition": file_object.read_uint16(),
                    "textGridWidth": file_object.read_uint16(),
                    "textGridHeight": file_object.read_uint16(),
                    "characterCellWidth": file_object.read_uint8(),
                    "characterCellHeight": file_object.read_uint8(),
                    "textForegroundColorIndex": file_object.read_uint8(),
                    "textBackgroundColorIndex": file_object.read_uint8(),
                    "data": ""
                }
                while True:
                    tmp = file_object.read(1)
                    if tmp == '\0':
                        break
                    text["data"] += tmp
                if "text" in image:
                    LOGGER.log(CustomLoggingLevel.OTHER_DATA,
                               "text already in image")
                image["text"] = text
                LOGGER.log(CustomLoggingLevel.ASCII_DATA, image["text"])
            elif sub_tag == 0xFF:
                # Application Extension.
                LOGGER.log(CustomLoggingLevel.IMAGE_DEBUG,
                           "Application Extension.")
                block_size = file_object.read_uint8()
                if block_size != 11:
                    LOGGER.log(
                        CustomLoggingLevel.OTHER_DATA,
                        "[0x%x] block size is not 11 in application extension"
                        % self.fileObject.cur())
                application = {
                    "identifier": file_object.read(8),
                    "authenticationCode": file_object.read(3)
                }
                data_size = file_object.read_uint8()
                application["data"] = file_object.read(data_size)
                if "application" in image:
                    LOGGER.log(CustomLoggingLevel.OTHER_DATA,
                               "application Extension already in image")
                image["application"] = application
                terminator = file_object.read_uint8()
                if terminator != 0:
                    LOGGER.log(
                        CustomLoggingLevel.OTHER_DATA,
                        "terminator is not 0 in Application Extension")
            else:
                LOGGER.log(
                    CustomLoggingLevel.IMAGE_DEBUG,
                    "[0x%x] unknown extension" % self.fileObject.cur())
        else:
            # DATA
            # LOGGER.info("DATA")
            LOGGER.log(CustomLoggingLevel.IMAGE_DEBUG, "DATA")
            image["LZWMinimumCodeSize"] = tag
            image["data"] = []
            while True:
                data_size = file_object.read_uint8()
                if data_size == 0:
                    break
                data = file_object.read(data_size)
                image["data"] += data
            self.images.append(image)
            image = {}
def rowdata_ver23(self):
    rowData = []
    if self.compressionMethod != 0:
        # decompress bitmap data according to compression method
        if self.bitmapLength == 0:
            LOGGER.warning(
                'BitmapLength shouldn\'t be 0 in bitmap header! There may be some extra data at the end of the file.'
            )
            tdata = self.fileObject.read(self.fileObject.size -
                                         self.headerLength)
        else:
            tdata = self.fileObject.read(self.bitmapLength)
        # decompress
        data = []
        if self.compressionMethod == 1:
            # RLE8: specialFlag tracks how many bytes of the current run or
            # escape sequence are still expected.
            specialFlag = -1
            for i in range(len(tdata)):
                if specialFlag < 0:
                    if specialFlag == -1:
                        if tdata[i] == '\x00':
                            pass  # end of line
                        elif tdata[i] == '\x01':
                            break  # end of RLE data
                        elif tdata[i] == '\x02':
                            # delta: expand the skipped area to zero bytes
                            data.append('\x00' *
                                        ((ord(tdata[i + 1]) +
                                          self.width * ord(tdata[i + 2])) *
                                         self.bitsPerPixel / 8))
                        else:
                            specialFlag = ord(tdata[i]) + 1
                    specialFlag -= 1
                    if specialFlag == -3:
                        specialFlag = 0
                elif specialFlag == 0:
                    if tdata[i] == '\x00':
                        specialFlag = -1
                elif specialFlag > 1:
                    data.append(tdata[i])
                    specialFlag -= 1
                else:
                    specialFlag -= 1
                    if i < len(tdata) - 1:
                        self.showextradata(tdata[i:len(tdata) - 1],
                                           self.headerLength + i)
            data = ''.join(data)
        elif self.compressionMethod == 2:
            LOGGER.error(
                'Compress method RLE4 of BMP file version 3 is not supported.')
            return
        elif self.compressionMethod == 3:
            LOGGER.error(
                'Compress method using RGB mask of BMP file version 3 is not supported.'
            )
            return
    else:
        data = self.fileObject.read(self.rowDataLength)
        if self.compressionMethod == 0 and self.bitmapLength != 0:
            LOGGER.warning(
                'BitmapLength should be 0 in bitmap header! Image pixel may be processed with wrong compress method!'
            )
    if self.bitsPerPixel in [1, 4, 8, 24, 32]:
        # return row data from stream
        if self.bitsPerPixel == 24:
            self.channel = 3
        else:
            self.channel = 4
        return self.decode_rgb_data(data)
    else:
        LOGGER.error('BMP file bits per pixel is not in (1, 4, 8, 24, 32).')
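# For reference, the documented BI_RLE8 scheme that the branch above decodes:
# a count byte followed by a value byte repeats the value (encoded mode); a
# zero count introduces an escape (0 = end of line, 1 = end of bitmap,
# 2 = delta with two offset bytes, >= 3 = that many literal bytes, padded to a
# word boundary). The sketch below is a plain reimplementation of the format
# as specified, not of the specialFlag state machine above; it expects a byte
# string as read above, expands deltas to zero bytes similarly, and omits
# end-of-line padding to the row boundary for brevity.
def decode_rle8(tdata, width):
    """Decode BI_RLE8 compressed data into a flat string of palette indices."""
    out = []
    i = 0
    while i + 1 < len(tdata):
        count, value = ord(tdata[i]), ord(tdata[i + 1])
        i += 2
        if count > 0:                      # encoded mode: repeat value
            out.append(chr(value) * count)
        elif value == 0:                   # escape: end of line
            continue
        elif value == 1:                   # escape: end of bitmap
            break
        elif value == 2:                   # escape: delta (dx, dy)
            dx, dy = ord(tdata[i]), ord(tdata[i + 1])
            i += 2
            out.append('\x00' * (dx + dy * width))
        else:                              # absolute mode: `value` literal bytes
            out.append(tdata[i:i + value])
            i += value + (value & 1)       # runs are word-aligned
    return ''.join(out)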