Exemple #1
0
    def request(self):
        """Scrape the newsletter listing and store every link found.

        Fetches the newsletter category page, selects the anchors matched by
        ``#content-left a`` and hands each one, in reverse document order, to
        ``session.get_or_create``. The outcome of each link is logged.

        Raises:
            requests.exceptions.HTTPError: Logged as a warning and re-raised
                when the listing page answers with an error status.
        """
        try:
            with Database() as session, Browser() as browser_session:
                url = self.url(Category.NEWSLETTER.value)
                response = browser_session.get(url)
                response.raise_for_status()
                soup = bs4.BeautifulSoup(response.content, 'html.parser')

                anchors = soup.select('#content-left a')
                LOGGER.info('{} newsletters have been found'.format(
                    len(anchors)))

                if not anchors:
                    LOGGER.info('Done, no more newsletters')
                    return

                # Walk the listing bottom-up.
                for anchor in anchors[::-1]:
                    href = anchor['href']
                    title = anchor.text

                    _, created = session.get_or_create(
                        Newsletter, url=href, title=title)
                    if created:
                        LOGGER.info(f'{href} ...added')
                    else:
                        LOGGER.info(
                            f'Newsletter "{href}" ...skipped (duplicate)')
        except requests.exceptions.HTTPError as exc:
            LOGGER.warning('Scraping {} {} ...skipping'.format(
                url, exc.response.status_code))
            raise exc
Exemple #2
0
    def request(self):
        """Scrape the paginated invoice listing and store every PDF invoice.

        Starting at the invoice category URL, the rows matched by
        ``#dokumenty table.tabulka tr:not(.hlavicka)`` are parsed on each page
        and every row whose document link ends in ``.pdf`` is handed to
        ``session.get_or_create``. Paging stops when a page has no rows, when
        the page just processed contained an already stored invoice, when no
        "next" link is present, or when the "next" link points at the page
        just processed.

        Raises:
            requests.exceptions.HTTPError: Logged as a warning and re-raised
                when a listing page answers with an error status.
        """
        with Database() as session, Browser() as browser_session:
            url = self.url(Category.INVOICE.value)
            loop = True
            while loop:
                try:
                    response = browser_session.get(url)
                    response.raise_for_status()
                    soup = bs4.BeautifulSoup(response.content, 'html.parser')

                    elements = soup.select(
                        '#dokumenty table.tabulka tr:not(.hlavicka)')

                    if not elements:
                        LOGGER.info('Done, no more invoices')
                        return

                    for element in elements:
                        # A row is expected to hold exactly seven cells.
                        (published, _, title, _, _, _,
                         document) = element.findChildren('td')
                        link = document.findChild('a').attrs.get('href')
                        size_in_mb = re.search(r'([0-9\.]+)',
                                               document.text).groups()[0]

                        # The extension check must look at the document link,
                        # not at the listing page URL being scraped.
                        if link and re.search(r'\.pdf$', link):
                            model, created = session.get_or_create(
                                Invoice,
                                published=datetime.date.fromisoformat(
                                    published.text),
                                title=title.text,
                                url=link,
                                size_in_mb=size_in_mb)
                            if not created:
                                LOGGER.info(
                                    f'Invoice {model.url} ...skipped (duplicate)'
                                )
                                # Finish this page, then stop paging.
                                loop = False
                            else:
                                LOGGER.info(f'{model.url} ...added')
                        else:
                            # No model exists for a skipped row, so report the
                            # raw link instead.
                            LOGGER.warning(
                                f'Invoice {link} ...skipped (not PDF)')

                    next_anchor = soup.select_one(
                        '#dokumenty table:first-of-type [align="right"] a:nth-last-child(2)'
                    )
                    if next_anchor is None:
                        # Without a pager link there is nothing left to visit.
                        LOGGER.info('Done, no more invoices')
                        return
                    next_url = urljoin(self.base_url(response.url),
                                       next_anchor.attrs.get('href'))

                    # FIXME: first page can be w/o the page number
                    if next_url == url:
                        return
                    url = next_url
                except requests.exceptions.HTTPError as exc:
                    LOGGER.warning('Scraping {} {} ...skipping'.format(
                        url, exc.response.status_code))
                    raise exc
Exemple #3
0
    def request(self):
        """Scrape the procurement page and store procurements and offers.

        Anchors inside the ``#content-left ol`` list are stored as
        procurements; anchors that follow that list as siblings are stored as
        procurements flagged with ``is_offer=True``. The outcome of each link
        is logged.

        Raises:
            requests.exceptions.HTTPError: Logged as a warning and re-raised
                when the page answers with an error status.
        """
        try:
            with Database() as session, Browser() as browser_session:
                url = self.url(Category.PROCUREMENT.value)
                response = browser_session.get(url)
                response.raise_for_status()
                soup = bs4.BeautifulSoup(response.content, 'html.parser')

                listing = soup.select_one('#content-left ol')
                procurement_links = listing.select('a')
                offer_links = listing.find_next_siblings('a')
                LOGGER.info('{} procurements have been found'.format(
                    len(procurement_links)))

                if not procurement_links:
                    # No early return here: offers are still processed below.
                    LOGGER.info('Done, no more procurements')

                for anchor in procurement_links:
                    href = anchor['href']
                    title = anchor.text

                    _, created = session.get_or_create(
                        Procurement, url=href, title=title)
                    if created:
                        LOGGER.info(f'{href} ...added')
                    else:
                        LOGGER.info(
                            f'Procurement "{href}" ...skipped (duplicate)')

                LOGGER.info('{} offers have been found'.format(
                    len(offer_links)))

                if not offer_links:
                    LOGGER.info('Done, no more offers')
                    return

                for anchor in offer_links:
                    href = anchor['href']
                    title = anchor.text

                    _, created = session.get_or_create(
                        Procurement, url=href, title=title, is_offer=True)
                    if created:
                        LOGGER.info(f'{href} ...added')
                    else:
                        LOGGER.info(
                            f'Offer "{href}" ...skipped (duplicate)')
        except requests.exceptions.HTTPError as exc:
            LOGGER.warning('Scraping {} {} ...skipping'.format(
                url, exc.response.status_code))
            raise exc
Exemple #4
0
    def read_tiff_ifd(self, tiffStartPos, p_read_uint16, p_read_uint32,
                      dirEntryPos, tagName):
        """Walk a chain of TIFF directories and log every entry found.

        Each directory starts with a 16-bit entry count followed by 12-byte
        entries; the 32-bit value after the last entry is the offset of the
        next directory, and an offset of 0 ends the chain. Entries tagged
        0x8769 and 0xa005 are followed recursively and logged under the names
        'SubExif' and 'ExifInteroperability'.

        Args:
            tiffStartPos: Position of the TIFF header; all offsets are added
                to it.
            p_read_uint16: Callable returning a 16-bit value. It is called
                either with a position or with no argument (presumably
                continuing at the current position -- TODO confirm).
            p_read_uint32: Same as ``p_read_uint16`` for 32-bit values.
            dirEntryPos: Offset of the first directory, relative to
                ``tiffStartPos``.
            tagName: Label used as a prefix in the log output.
        """
        dirCount = 0
        while dirEntryPos != 0:
            entryCount = p_read_uint16(tiffStartPos + dirEntryPos)
            LOGGER.log(
                CustomLoggingLevel.IMAGE_DEBUG,
                '[%s] Tiff data start at 0x%x, directory index: %d, start at: 0x%x, entry count: %d.'
                % (tagName, tiffStartPos, dirCount, dirEntryPos, entryCount))
            for i in range(entryCount):
                try:
                    # Entry layout: tag (2), format (2), component count (4),
                    # then 4 bytes holding either the value or its offset.
                    dirTag = p_read_uint16(tiffStartPos + dirEntryPos + 2 +
                                           12 * i)
                    dataFormat = p_read_uint16()
                    nComponent = p_read_uint32()
                    dataLength = nComponent * tiffEnumDataTypeLength[dataFormat]
                    if dataLength > 4:
                        # Too large to be stored inline: follow the offset.
                        dataStartPos = p_read_uint32()
                        data = self.fileObject.read(
                            dataLength, tiffStartPos + dataStartPos)
                    else:
                        data = self.fileObject.read(4)

                    if dirTag == 0x8769:
                        self.read_tiff_ifd(
                            tiffStartPos, p_read_uint16, p_read_uint32,
                            p_read_uint32(tiffStartPos + dirEntryPos + 10 +
                                          12 * i), 'SubExif')
                    elif dirTag == 0xa005:
                        self.read_tiff_ifd(
                            tiffStartPos, p_read_uint16, p_read_uint32,
                            p_read_uint32(tiffStartPos + dirEntryPos + 10 +
                                          12 * i), 'ExifInteroperability')

                    if dataFormat == 2:
                        LOGGER.log(
                            CustomLoggingLevel.IMAGE_INFO,
                            '[%s - %s](string)> %s' %
                            (tagName, exifEnumTag[dirTag],
                             data.replace('\x00', '')))
                    else:
                        LOGGER.log(
                            CustomLoggingLevel.IMAGE_INFO,
                            '[%s - %s](%s)> Hex:%s' %
                            (tagName, exifEnumTag[dirTag],
                             tiffEnumDataType[dataFormat], data.encode('hex')))
                # A tuple is required here: ``KeyError or IndexError``
                # evaluates to ``KeyError`` and never catches IndexError.
                except (KeyError, IndexError):
                    LOGGER.warning(
                        '[0x%x] Unable to decode dataformat or entrytag in tiff data, tagName: %s, dirTag: 0x%x, dataFormat: 0x%x, directory: %d/%d.'
                        % (self.fileObject.cur(), tagName, dirTag, dataFormat,
                           i, entryCount))
            dirCount += 1
            dirEntryPos = p_read_uint32(tiffStartPos + dirEntryPos + 2 +
                                        12 * entryCount)
Exemple #5
0
 def tag_app1(self, tag):
     """Handle an APP1 segment and continue with the next tag.

     When the segment starts with the Exif magic, its payload is passed to
     ``read_tiff`` and the cursor is then moved to the end of the segment.
     Otherwise a warning is logged and the remaining ``length - 8`` bytes
     are read and discarded.

     Returns:
         Whatever ``self.find_tag('APP1')`` returns.
     """
     segment_start = self.fileObject.cur()
     length = self.read_uint16()
     magic = self.fileObject.read(6)
     if magic == 'Exif\x00\x00':
         self.read_tiff(length - 8, 'Exif')
         # Jump to the end of the segment regardless of how much was parsed.
         self.fileObject.change_cur(segment_start + length)
     else:
         LOGGER.warning('[0x%x] Unbale to process magic %s in APP1.' %
                        (self.fileObject.cur(), magic))
         self.fileObject.read(length - 8)
     return self.find_tag('APP1')
Exemple #6
0
 def tag_app0(self, tag):
     """Handle an APP0 (0xFFE0) segment and continue with the next tag.

     Stores the version, the thumbnail dimensions and the thumbnail bytes on
     the instance. A magic other than 'JFIF\\x00' is only logged as a
     warning; parsing continues regardless.

     Returns:
         Whatever ``self.find_tag('APP0')`` returns.
     """
     stream = self.fileObject
     length = self.read_uint16()
     magic = stream.read(5)
     if not magic == 'JFIF\x00':
         LOGGER.warning('[0x%x] Unbale to process magic %s in APP0.' %
                        (stream.cur(), magic))
     self.version = self.read_uint16()
     # Five bytes are skipped between the version and the thumbnail size.
     stream.read(5)
     self.thumbnailX = stream.read_uint8()
     self.thumbnailY = stream.read_uint8()
     # The rest of the segment is the thumbnail (RGB pixels).
     self.thumbnail = stream.read(length - 16)
     return self.find_tag('APP0')
Exemple #7
0
 def read_tiff(self, length, tagName):
     """Read a TIFF header and hand its first directory to ``read_tiff_ifd``.

     A leading 'II' selects the integer readers of ``self.fileObject``; any
     other byte-order mark selects the readers defined on ``self``. A magic
     number other than 0x2a is only logged as a warning.

     Args:
         length: Not used by this method.
         tagName: Label forwarded to ``read_tiff_ifd`` for logging.
     """
     tiffStartPos = self.fileObject.cur()
     byte_order = self.fileObject.read(2)
     reader = self.fileObject if byte_order == 'II' else self
     p_read_uint16 = reader.read_uint16
     p_read_uint32 = reader.read_uint32

     magic_ok = p_read_uint16() == 0x2a
     if not magic_ok:
         LOGGER.warning('[0x%x] TIFF data format magic check failed.' %
                        tiffStartPos)
     first_ifd_offset = p_read_uint32()
     self.read_tiff_ifd(tiffStartPos, p_read_uint16, p_read_uint32,
                        first_ifd_offset, tagName)
Exemple #8
0
    def request(self):
        """Scrape the budget listing and store every link found.

        Fetches the budget category page and hands each anchor matched by
        ``#content-left a``, in reverse document order, to
        ``session.get_or_create``. A title without any digit gets the text of
        the preceding ``h2`` sibling appended.

        Raises:
            requests.exceptions.HTTPError: Logged as a warning and re-raised
                when the listing page answers with an error status.
        """
        try:
            with Database() as session, Browser() as browser_session:
                url = self.url(Category.BUDGET.value)
                response = browser_session.get(url)
                response.raise_for_status()
                soup = bs4.BeautifulSoup(response.content, 'html.parser')

                anchors = soup.select('#content-left a')
                LOGGER.info('{} budgets have been found'.format(
                    len(anchors)))

                if not anchors:
                    LOGGER.info('Done, no more budgets')
                    return

                for anchor in anchors[::-1]:
                    href = anchor['href']
                    title = anchor.text

                    if re.search(r'\d', title) is None:
                        # Borrow the text of the preceding heading.
                        heading_text = anchor.findPreviousSibling('h2').text
                        title = f'{title} {heading_text}'

                    _, created = session.get_or_create(
                        Budget, url=href, title=title)
                    if created:
                        LOGGER.info(f'{href} ...added')
                    else:
                        LOGGER.info(
                            f'Budget "{href}" ...skipped (duplicate)')
        except requests.exceptions.HTTPError as exc:
            LOGGER.warning('Scraping {} {} ...skipping'.format(
                url, exc.response.status_code))
            raise exc
Exemple #9
0
    def request(self):
        """Scrape the report table and store every row found.

        Fetches the report category page and, for each ``#content-left tr``
        row, stores the stripped date cell together with the title cell
        (whitespace runs collapsed to single spaces) via
        ``session.get_or_create``.

        Raises:
            requests.exceptions.HTTPError: Logged as a warning and re-raised
                when the page answers with an error status.
        """
        try:
            with Database() as session, Browser() as browser_session:
                url = self.url(Category.REPORT.value)
                response = browser_session.get(url)
                response.raise_for_status()
                soup = bs4.BeautifulSoup(response.content, 'html.parser')

                rows = soup.select('#content-left tr')
                LOGGER.info('{} reports have been found'.format(len(rows)))

                if not rows:
                    LOGGER.info('Done, no more reports')
                    return

                for row in rows:
                    # Each row is expected to hold exactly two cells.
                    date_cell, title_cell = row.findChildren('td')

                    collapsed_title = re.sub(r'[\n\s]+', ' ',
                                             title_cell.text)

                    model, created = session.get_or_create(
                        Report,
                        date=date_cell.text.strip(),
                        title=collapsed_title.strip())
                    if created:
                        LOGGER.info(f'"{model.title[:40]}..." ...added')
                    else:
                        LOGGER.info(
                            f'Report "{model.title[:30]}..." ...skipped (duplicate)'
                        )
        except requests.exceptions.HTTPError as exc:
            LOGGER.warning('Scraping {} {} ...skipping'.format(
                url, exc.response.status_code))
            raise exc
    def __init__(self, file_object):
        """Parse a GIF stream from ``file_object`` and record its structure.

        Reads the signature, the version and the logical screen descriptor,
        then the optional global colour table, and finally every image
        descriptor, extension block and image data block up to the trailer
        byte (0x3b). Each finished image is appended to ``self.images`` as a
        dictionary. Anomalies are reported through ``LOGGER`` instead of being
        raised.

        Args:
            file_object: Reader providing ``read(n)``, ``read_uint8()``,
                ``read_uint16()`` and ``cur()``.
        """
        # file_object.addHandler(logging.StreamHandler())
        self.fileObject = file_object
        if file_object.read(3) != 'GIF':
            LOGGER.error("File is not a gif file")
        self.type = "GIF"
        self.version = file_object.read(3)
        if self.version != '87a' and self.version != '89a':
            LOGGER.log(CustomLoggingLevel.OTHER_DATA, "Invalid version")
        else:
            LOGGER.log(CustomLoggingLevel.BASIC_DEBUG,
                       "version is " + self.version)
        self.logicScreenWidth = file_object.read_uint16()
        self.logicScreenHeight = file_object.read_uint16()
        # Packed byte, low to high: table size (3 bits), sort flag (1),
        # colour resolution (3), global colour table flag (1).
        mask = file_object.read_uint8()
        self.pixel = mask & 0b111
        mask >>= 3
        self.sortFlag = mask & 0b1
        mask >>= 1
        self.colorResolution = mask & 0b111
        mask >>= 3
        self.globalColorTableFlag = mask & 0b1
        # The screen descriptor is seven bytes long in GIF87a as well as in
        # GIF89a (the last byte is reserved/zero in 87a), so both bytes must
        # always be consumed or everything that follows is misaligned.
        self.backgroundColorIndex = file_object.read_uint8()
        self.pixelAspectRatio = file_object.read_uint8()
        # self.globalColorTable = [[0, 0, 0]] * (2 ** (self.pixel + 1))
        if self.globalColorTableFlag:
            self.globalColorTable = [[0, 0, 0]
                                     for _ in range(2**(self.pixel + 1))]
        else:
            self.globalColorTable = []

        LOGGER.log(CustomLoggingLevel.OTHER_DATA,
                   "global table size is %d" % len(self.globalColorTable))

        for i in range(len(self.globalColorTable)):

            for j in range(3):  # 0 red 1 green 2 blue
                self.globalColorTable[i][j] = file_object.read_uint8()
        self.images = []
        # Collects descriptor, extensions and data of the image currently
        # being read; it is reset after each data block.
        image = {}
        while True:
            tag = file_object.read_uint8()

            if tag == 0x3b:
                LOGGER.log(CustomLoggingLevel.OTHER_DATA, "gif end")
                break  # end of gif

            if tag == 0x2c:  # start of a image descriptor
                # LOGGER.info("image descriptor")
                LOGGER.log(CustomLoggingLevel.IMAGE_DEBUG, "image descriptor")
                image["xOffset"] = file_object.read_uint16()
                image["yOffset"] = file_object.read_uint16()

                image["width"] = file_object.read_uint16()
                image["height"] = file_object.read_uint16()

                if image["xOffset"] + image["width"] > self.logicScreenWidth or \
                   image["yOffset"] + image["height"] > self.logicScreenHeight:
                    # Parenthesised: ``%`` binds tighter than ``+``, so the
                    # unparenthesised form tried to add 1 to a string.
                    LOGGER.log(
                        CustomLoggingLevel.OTHER_DATA,
                        "some part out of logic screen at image %d" %
                        (len(self.images) + 1))

                # Packed byte, low to high: table size (3 bits), reserved (2),
                # sort flag (1), interlace flag (1), local table flag (1).
                mask = file_object.read_uint8()
                image["pixel"] = mask & 0b111
                mask >>= 3
                image["reserved"] = mask & 0b11
                if image["reserved"] != 0:
                    LOGGER.log(
                        CustomLoggingLevel.OTHER_DATA,
                        "[0x%x] reserved data should be 0" %
                        self.fileObject.cur())
                mask >>= 2
                image["sortFlag"] = mask & 0b1
                mask >>= 1
                image["interlaceFlag"] = mask & 0b1
                mask >>= 1
                image["localColorTableFlag"] = mask & 0b1
                if image["localColorTableFlag"]:
                    # ``range`` matches the global table above and, unlike
                    # ``xrange``, exists on Python 3 as well.
                    image["localColorTable"] = [
                        [0, 0, 0] for _ in range((2**(image["pixel"] + 1)))
                    ]
                    for i in range(len(image["localColorTable"])):
                        for j in range(3):  # 0 red 1 green 2 blue
                            image["localColorTable"][i][
                                j] = file_object.read_uint8()
            elif tag == 0x21:
                if self.version != "89a":
                    LOGGER.log(CustomLoggingLevel.OTHER_DATA,
                               "not version 89a but has extension segment.")
                sub_tag = file_object.read_uint8()
                if sub_tag == 0xF9:  # Graphic Control Extension.
                    LOGGER.log(CustomLoggingLevel.IMAGE_DEBUG,
                               "Graphic Control Extension")
                    block_size = file_object.read_uint8()
                    if block_size != 4:
                        LOGGER.log(
                            CustomLoggingLevel.OTHER_DATA,
                            "block size is not 4 in Graphic Control Extension")
                    control = {}
                    mask = file_object.read_uint8()
                    control["transparentFlag"] = mask & 0b1
                    mask >>= 1
                    control["userInputFlag"] = mask & 0b1
                    mask >>= 1
                    control["disposalMethod"] = mask & 0b111
                    # 0 -   No disposal specified. The decoder is
                    # not required to take any action.
                    # 1 -   Do not dispose. The graphic is to be left
                    # in place.
                    # 2 -   Restore to background color. The area used by the
                    #           graphic must be restored to the background color.
                    #     3 -   Restore to previous. The decoder is required to
                    #           restore the area overwritten by the graphic with
                    #           what was there prior to rendering the graphic.
                    #  4-7 -    To be defined.
                    control["delayTime"] = file_object.read_uint16()
                    control["TransparentColonrIndex"] = file_object.read_uint8(
                    )
                    terminator = file_object.read_uint8()
                    if terminator != 0:
                        LOGGER.log(
                            CustomLoggingLevel.OTHER_DATA,
                            "[0x%x] terminator in block Graphic Control Extension is not 0"
                            % self.fileObject.cur())
                    image["control"] = control
                elif sub_tag == 0xFE:  # Comment Extension.
                    LOGGER.log(CustomLoggingLevel.IMAGE_DEBUG,
                               "Comment Extension.")
                    if "comment" not in image:
                        image["comment"] = ""
                    # Everything up to the first NUL byte is taken as comment
                    # text.
                    while True:
                        tmp = file_object.read(1)
                        if tmp == '\0':
                            break
                        image["comment"] += tmp
                    LOGGER.log(CustomLoggingLevel.ASCII_DATA, image["comment"])
                elif sub_tag == 0x01:  # plain text Extension
                    LOGGER.log(CustomLoggingLevel.IMAGE_DEBUG,
                               "plain text Extension")
                    block_size = file_object.read_uint8()
                    if block_size != 12:
                        LOGGER.warning("block size is not 12 in plain text")
                    text = {
                        "gridLeftPosition": file_object.read_uint16(),
                        "gridTopPosition": file_object.read_uint16(),
                        "textGridWidth": file_object.read_uint16(),
                        "textGridHeight": file_object.read_uint16(),
                        "characterCellWidth": file_object.read_uint8(),
                        "characterCellHeight": file_object.read_uint8(),
                        "textForegroundColorIndex": file_object.read_uint8(),
                        "textBackgroundColorIndex": file_object.read_uint8(),
                        "data": ""
                    }
                    # Everything up to the first NUL byte is taken as text.
                    while True:
                        tmp = file_object.read(1)
                        if tmp == '\0':
                            break
                        text["data"] += tmp
                    if "text" in image:
                        LOGGER.log(CustomLoggingLevel.OTHER_DATA,
                                   "text already in image")
                    image["text"] = text
                    LOGGER.log(CustomLoggingLevel.ASCII_DATA, image["text"])
                elif sub_tag == 0xFF:  # Application Extension.
                    LOGGER.log(CustomLoggingLevel.IMAGE_DEBUG,
                               "Application Extension.")
                    block_size = file_object.read_uint8()
                    if block_size != 11:
                        LOGGER.log(
                            CustomLoggingLevel.OTHER_DATA,
                            "[0x%x] block size is not 11 in application extension"
                            % self.fileObject.cur())
                    application = {
                        "identifier": file_object.read(8),
                        "authenticationCode": file_object.read(3)
                    }
                    # Only one length-prefixed data block is read before the
                    # terminator is expected.
                    data_size = file_object.read_uint8()
                    application["data"] = file_object.read(data_size)

                    if "application" in image:
                        LOGGER.log(CustomLoggingLevel.OTHER_DATA,
                                   "application Extension already in image")

                    image["application"] = application
                    terminator = file_object.read_uint8()
                    if terminator != 0:
                        LOGGER.log(
                            CustomLoggingLevel.OTHER_DATA,
                            "terminator is not 0 in Application Extension")
                else:
                    LOGGER.log(
                        CustomLoggingLevel.IMAGE_DEBUG,
                        "[0x%x] unknown extension at" % self.fileObject.cur())
            else:  # DATA
                # Any other byte is taken as the LZW minimum code size that
                # opens the image data.
                # LOGGER.info("DATA")
                LOGGER.log(CustomLoggingLevel.IMAGE_DEBUG, "DATA")
                image["LZWMinimumCodeSize"] = tag

                image["data"] = []
                # Length-prefixed sub-blocks follow; a zero length ends them.
                while True:
                    data_size = file_object.read_uint8()

                    if data_size == 0:
                        break
                    data = file_object.read(data_size)
                    image["data"] += data
                self.images.append(image)
                image = {}
Exemple #11
0
    def rowdata_ver23(self):
        """Read the bitmap pixel data and decode it with ``decode_rgb_data``.

        Uncompressed data (``compressionMethod == 0``) is read directly using
        ``rowDataLength``. For compression method 1 the stream is run through
        the state machine below first; methods 2 and 3 are not supported.

        Returns:
            The result of ``self.decode_rgb_data`` for the pixel data, or
            ``None`` when the compression method is unsupported or the bit
            depth is not one of 1, 4, 8, 24 or 32.
        """
        if self.compressionMethod != 0:
            # decompress bitmap data according to compression method
            if self.bitmapLength == 0:
                LOGGER.warning(
                    'BitmapLength shouldn\'t be 0 in bitmap header! There may have some extra data in end of the file.'
                )
                tdata = self.fileObject.read(self.fileObject.size -
                                             self.headerLength)
            else:
                tdata = self.fileObject.read(self.bitmapLength)
            # decompress
            data = []
            if self.compressionMethod == 1:
                # specialFlag is the decoder state:
                #   -1  the current byte is an escape code
                #   < -1 skipping the bytes that follow an escape code
                #   0   waiting for the 0x00 that introduces an escape
                #   > 1 copying literal bytes
                # NOTE(review): a non-zero byte seen in state 0 is ignored, so
                # encoded runs do not appear to be expanded -- confirm against
                # the RLE8 format before relying on the output.
                specialFlag = -1
                i = 0  # keeps the trailing-data check valid for empty input
                for i, byte in enumerate(tdata):
                    if specialFlag < 0:
                        if specialFlag == -1:
                            if byte == '\x00':
                                pass  # end of line
                            elif byte == '\x01':
                                # Compare the current byte, not the whole
                                # buffer, to detect the end marker.
                                break  # end of RLE data
                            elif byte == '\x02':
                                # Delta escape: pad with zero bytes for the
                                # skipped pixels. The byte count must be
                                # computed before the string is repeated.
                                skipped = (ord(tdata[i + 1]) +
                                           self.width * ord(tdata[i + 2]))
                                data.append(
                                    '\x00' *
                                    (skipped * self.bitsPerPixel // 8))
                            else:
                                specialFlag = ord(byte) + 1
                        specialFlag -= 1
                        if specialFlag == -3:
                            specialFlag = 0
                    elif specialFlag == 0:
                        if byte == '\x00':
                            specialFlag = -1
                    elif specialFlag > 1:
                        data.append(byte)
                        specialFlag -= 1
                    else:
                        specialFlag -= 1
                if i < len(tdata) - 1:
                    # Decoding stopped early: report what is left over.
                    self.showextradata(tdata[i:len(tdata) - 1],
                                       self.headerLength + i)
                data = ''.join(data)
            elif self.compressionMethod == 2:
                LOGGER.error(
                    'Compress method RLE4 of BMP file version 3 is not surported.'
                )
                return
            elif self.compressionMethod == 3:
                LOGGER.error(
                    'Compress method using RGB mask of BMP file version 3 is not surported.'
                )
                return
        else:
            data = self.fileObject.read(self.rowDataLength)

        if self.compressionMethod == 0 and self.bitmapLength != 0:
            LOGGER.warning(
                'BitmapLength should be 0 in bitmap header! Image pixel may be processed with wrong compress method!'
            )
        if self.bitsPerPixel in [1, 4, 8, 24, 32]:
            # return row data from stream
            if self.bitsPerPixel == 24:
                self.channel = 3
            else:
                self.channel = 4
            return self.decode_rgb_data(data)
        else:
            LOGGER.error(
                'BMP file bits per pixel is not in (1, 4, 8, 24, 32).')