Exemple #1
0
    def request(self):
        """Fetch the newsletter listing page and persist every linked entry.

        Raises:
            requests.exceptions.HTTPError: re-raised after logging when the
                listing page cannot be fetched.
        """
        try:
            with Database() as session, Browser() as browser_session:
                url = self.url(Category.NEWSLETTER.value)
                response = browser_session.get(url)
                response.raise_for_status()
                soup = bs4.BeautifulSoup(response.content, 'html.parser')

                anchors = soup.select('#content-left a')
                LOGGER.info('{} newsletters have been found'.format(len(anchors)))

                if not anchors:
                    LOGGER.info('Done, no more newsletters')
                    return

                # Oldest entries first, so insertion order follows publication order.
                for anchor in reversed(anchors):
                    link = anchor['href']
                    _, created = session.get_or_create(
                        Newsletter, url=link, title=anchor.text)
                    if created:
                        LOGGER.info(f'{link} ...added')
                    else:
                        LOGGER.info(f'Newsletter "{link}" ...skipped (duplicate)')
        except requests.exceptions.HTTPError as exc:
            LOGGER.warning('Scraping {} {} ...skipping'.format(
                url, exc.response.status_code))
            raise exc
Exemple #2
0
def cli():
    """Run every scraper plugin once, in order, and return the exit code (0)."""
    LOGGER.info('Start')

    # Fixed execution order; each plugin performs its own scraping run.
    plugins = (
        MeetingPlugin,
        AnnouncementPlugin,
        NewsletterPlugin,
        BudgetPlugin,
        ProcurementPlugin,
        ReportPlugin,
        OrderPlugin,
        InvoicePlugin,
        ContractPlugin,
        TablePlugin,
        ResolutionPlugin,
        TranscriptPlugin,
        VZNPlugin,
    )
    for plugin in plugins:
        plugin().run()

    LOGGER.info('Done')
    return 0
Exemple #3
0
    def __init__(self, max_retries: int = 5):
        """Build a requests session with browser-like headers and retries.

        Args:
            max_retries: total retry budget mounted on both schemes;
                0/None disables the retry adapter entirely.
        """
        LOGGER.info("Creating browser session")
        self.session = Session()

        LOGGER.info("Injecting headers into the browser")
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) "
                "Gecko/20100101 Firefox/88.0"
            ),
            "Accept-Language": "sk,en-US;q=0.7,en;q=0.3",
        }
        self.session.headers.update(headers)

        if max_retries:
            adapter = HTTPAdapter(max_retries=Retry(total=max_retries))
            for scheme in ("https://", "http://"):
                self.session.mount(scheme, adapter)

        super().__init__()
Exemple #4
0
    def request(self):
        """Walk the paginated invoice listing and persist every PDF invoice.

        Pagination stops when the listing is empty, a duplicate invoice is
        met (everything older is assumed already stored), or the "next page"
        link points back at the current page.

        Raises:
            requests.exceptions.HTTPError: re-raised after logging when a
                listing page cannot be fetched.
        """
        with Database() as session:
            with Browser() as browser_session:
                url = self.url(Category.INVOICE.value)
                loop = True
                while loop:
                    try:
                        response = browser_session.get(url)
                        response.raise_for_status()
                        soup = bs4.BeautifulSoup(response.content,
                                                 'html.parser')

                        elements = soup.select(
                            '#dokumenty table.tabulka tr:not(.hlavicka)')

                        if len(elements) == 0:
                            LOGGER.info('Done, no more invoices')
                            return

                        for element in elements:
                            published, _, title, _, _, _, document = element.findChildren(
                                'td')
                            link = document.findChild('a').attrs.get('href')
                            size_in_mb = re.search(r'([0-9\.]+)',
                                                   document.text).groups()[0]
                            # BUG FIX: test the document link for the .pdf
                            # suffix, not the listing-page URL.
                            is_pdf = re.search(r'\.pdf$', link)
                            if is_pdf:
                                model, created = session.get_or_create(
                                    Invoice,
                                    published=datetime.date.fromisoformat(
                                        published.text),
                                    title=title.text,
                                    url=link,
                                    size_in_mb=size_in_mb)
                                if not created:
                                    LOGGER.info(
                                        f'Invoice {model.url} ...skipped (duplicate)'
                                    )
                                    # A duplicate means older pages are already
                                    # stored; stop paginating after this page.
                                    loop = False
                                else:
                                    LOGGER.info(f'{model.url} ...added')
                            else:
                                # BUG FIX: `model` is unbound here when the first
                                # row is not a PDF; log the link itself instead.
                                LOGGER.warning(
                                    f'Invoice {link} ...skipped (not PDF)'
                                )
                        next_url = soup.select_one(
                            '#dokumenty table:first-of-type [align="right"] a:nth-last-child(2)'
                        ).attrs.get('href')
                        next_url = urljoin(self.base_url(response.url),
                                           next_url)

                        # FIXME: first page can be w/o the page number
                        if next_url == url:
                            return
                        url = next_url
                    except requests.exceptions.HTTPError as exc:
                        LOGGER.warning('Scraping {} {} ...skipping'.format(
                            url, exc.response.status_code))
                        raise exc
Exemple #5
0
    def request(self):
        """Fetch the budget listing page and persist every linked document.

        Raises:
            requests.exceptions.HTTPError: re-raised after logging when the
                listing page cannot be fetched.
        """
        try:
            with Database() as session, Browser() as browser_session:
                url = self.url(Category.BUDGET.value)
                response = browser_session.get(url)
                response.raise_for_status()
                soup = bs4.BeautifulSoup(response.content, 'html.parser')

                anchors = soup.select('#content-left a')
                LOGGER.info('{} budgets have been found'.format(len(anchors)))

                if not anchors:
                    LOGGER.info('Done, no more budgets')
                    return

                # Oldest entries first, so inserts follow publication order.
                for anchor in reversed(anchors):
                    href = anchor['href']
                    title = anchor.text

                    # Titles lacking any digit get the preceding <h2> heading
                    # appended to disambiguate them.
                    if not re.search(r'\d', title):
                        heading = anchor.findPreviousSibling('h2').text
                        title = f'{title} {heading}'

                    _, created = session.get_or_create(
                        Budget, url=href, title=title)
                    if created:
                        LOGGER.info(f'{href} ...added')
                    else:
                        LOGGER.info(f'Budget "{href}" ...skipped (duplicate)')
        except requests.exceptions.HTTPError as exc:
            LOGGER.warning('Scraping {} {} ...skipping'.format(
                url, exc.response.status_code))
            raise exc
Exemple #6
0
    def request(self):
        """Fetch the report listing table and persist one record per row.

        Raises:
            requests.exceptions.HTTPError: re-raised after logging when the
                listing page cannot be fetched.
        """
        try:
            with Database() as session, Browser() as browser_session:
                url = self.url(Category.REPORT.value)
                response = browser_session.get(url)
                response.raise_for_status()
                soup = bs4.BeautifulSoup(response.content, 'html.parser')

                rows = soup.select('#content-left tr')
                LOGGER.info('{} reports have been found'.format(len(rows)))

                if not rows:
                    LOGGER.info('Done, no more reports')
                    return

                for row in rows:
                    date_cell, title_cell = row.findChildren('td')

                    # Collapse embedded newlines and whitespace runs.
                    title = re.sub(r'[\n\s]+', ' ', title_cell.text)

                    model, created = session.get_or_create(
                        Report,
                        date=date_cell.text.strip(),
                        title=title.strip())
                    if created:
                        LOGGER.info(f'"{model.title[:40]}..." ...added')
                    else:
                        LOGGER.info(
                            f'Report "{model.title[:30]}..." ...skipped (duplicate)'
                        )
        except requests.exceptions.HTTPError as exc:
            LOGGER.warning('Scraping {} {} ...skipping'.format(
                url, exc.response.status_code))
            raise exc
Exemple #7
0
 def after_request(self):
     """Log the end of the contracts scraping run (typo 'scrapping' fixed)."""
     LOGGER.info('Finished scraping contracts')
Exemple #8
0
 def before_request(self):
     """Log the start of the contracts scraping run (typo 'scrapping' fixed)."""
     LOGGER.info('Start scraping contracts')
Exemple #9
0
    def decode_scandata(self):
        """Decode the JPEG scan data into a flat list of RGB pixels.

        Returns:
            list: ``width * height`` entries of ``[r, g, b]``, row-major.

        Only sampling with all chroma factors equal to 1 is supported; other
        layouts are logged as an error but decoding is attempted anyway.
        """
        # Sampling factors per component (component ids: 1=Y, 2=Cr, 3=Cb).
        horzY = self.colorQuantization[1]['Horz']
        horzCr = self.colorQuantization[2]['Horz']
        horzCb = self.colorQuantization[3]['Horz']
        vertY = self.colorQuantization[1]['Vert']
        vertCr = self.colorQuantization[2]['Vert']
        vertCb = self.colorQuantization[3]['Vert']
        scanY = horzY * vertY
        scanCr = horzCr * vertCr
        scanCb = horzCb * vertCb
        LOGGER.log(
            CustomLoggingLevel.IMAGE_DEBUG,
            "scanY: %d, scanCr: %d, scanCb: %d." % (scanY, scanCr, scanCb))
        # DC predictors for the three components, reset for this scan.
        self.baseY = 0
        self.baseCr = 0
        self.baseCb = 0
        rowData = [[0, 0, 0] for _ in range(self.width * self.height)]

        if vertCb != 1 or vertCr != 1 or horzCb != 1 or horzCr != 1:
            LOGGER.error(
                'Error in decode scan data, ONLY support vertCb==vertCr==horzCb==horzCr==1!'
            )
        else:
            LOGGER.log(CustomLoggingLevel.IMAGE_DEBUG,
                       'Start to decode scan data.')

        # Number of MCU blocks per row/column (ceiling division).
        # BUG FIX: use floor division -- in Python 3 `/` yields a float and
        # the range() calls below would raise TypeError.
        hBlock = self.width // (horzY * 8)
        if self.width % (horzY * 8) != 0:
            hBlock += 1
        vBlock = self.height // (vertY * 8)
        if self.height % (vertY * 8) != 0:
            vBlock += 1

        for vb in range(vBlock):
            for hb in range(hBlock):
                dataY, dataCr, dataCb = self.read_mcu_block(
                    scanY, scanCr, scanCb, [], [], [])
                for i in range(vertY):
                    heightIndex = (vb * vertY + i) * 8
                    for j in range(horzY):
                        widthIndex = (hb * horzY + j) * 8
                        for k in range(8):
                            if heightIndex + k < self.height:
                                for l in range(8):
                                    if widthIndex + l < self.width:
                                        # YCbCr -> RGB conversion (JFIF-style
                                        # coefficients, +128 level shift).
                                        y = dataY[i * horzY + j][k * 8 + l]
                                        cr = dataCr[0][k * 8 + l]
                                        cb = dataCb[0][k * 8 + l]
                                        r = self.round(y + 1.402 * cr + 128)
                                        b = self.round(y + 1.772 * cb + 128)
                                        g = self.round(y - 0.34414 * cb -
                                                       0.71414 * cr + 128)
                                        rowData[(heightIndex + k) * self.width
                                                + widthIndex + l] = [r, g, b]
            LOGGER.info('Please wait, decoding ... (%d/%d)' % (vb, vBlock))
        self.clean_bitstream_remainder()
        # Any bytes left over after the decoded MCUs are surfaced as extra data.
        if self.scanDataIndex < self.scanDataLength:
            self.showextradata(''.join(self.scanData[self.scanDataIndex:]),
                               self.scanDataPos + self.scanDataIndex)
        return rowData
Exemple #10
0
    def request(self):
        """Fetch the procurement page; persist procurements, then offers.

        Raises:
            requests.exceptions.HTTPError: re-raised after logging when the
                page cannot be fetched.
        """
        try:
            with Database() as session, Browser() as browser_session:
                url = self.url(Category.PROCUREMENT.value)
                response = browser_session.get(url)
                response.raise_for_status()
                soup = bs4.BeautifulSoup(response.content, 'html.parser')

                main_element = soup.select_one('#content-left ol')
                procurements = main_element.select('a')
                offers = main_element.find_next_siblings('a')
                LOGGER.info('{} procurements have been found'.format(
                    len(procurements)))

                if not procurements:
                    # NOTE(review): unlike sibling scrapers there is no
                    # `return` here, so the offers below still run -- this
                    # preserves the original behavior.
                    LOGGER.info('Done, no more procurements')

                for anchor in procurements:
                    href = anchor['href']
                    _, created = session.get_or_create(
                        Procurement, url=href, title=anchor.text)
                    if created:
                        LOGGER.info(f'{href} ...added')
                    else:
                        LOGGER.info(
                            f'Procurement "{href}" ...skipped (duplicate)')

                LOGGER.info('{} offers have been found'.format(len(offers)))

                if not offers:
                    LOGGER.info('Done, no more offers')
                    return

                for anchor in offers:
                    href = anchor['href']
                    _, created = session.get_or_create(
                        Procurement, url=href, title=anchor.text,
                        is_offer=True)
                    if created:
                        LOGGER.info(f'{href} ...added')
                    else:
                        LOGGER.info(f'Offer "{href}" ...skipped (duplicate)')
        except requests.exceptions.HTTPError as exc:
            LOGGER.warning('Scraping {} {} ...skipping'.format(
                url, exc.response.status_code))
            raise exc
Exemple #11
0
 def before_request(self):
     """Log the start of the invoices scraping run (typo 'scrapping' fixed)."""
     LOGGER.info('Start scraping invoices')
Exemple #12
0
 def before_request(self):
     """Log the start of the meetings scraping run (typo 'scrapping' fixed)."""
     LOGGER.info('Start scraping meetings')
Exemple #13
0
 def _log_post_request(self, response):
     # Log the response's status and JSON body, then pass the response
     # through unchanged.  The .status / .json attributes suggest a Flask
     # Response object (an after-request hook) -- TODO confirm with caller.
     LOGGER.info(f'Response status: {response.status}')
     LOGGER.info(f'Response json: {response.json}')
     return response
Exemple #14
0
 def before_request(self):
     """Log the start of the announcements scraping run (typo 'scrapping' fixed)."""
     LOGGER.info('Start scraping announcements')
Exemple #15
0
 def after_request(self):
     """Log the end of the table-docs scraping run (typo 'scrapping' fixed)."""
     LOGGER.info('Finished scraping table docs')
Exemple #16
0
 def before_request(self):
     """Log the start of the table-docs scraping run (typo 'scrapping' fixed)."""
     LOGGER.info('Start scraping table docs')
Exemple #17
0
 def after_request(self):
     """Log the end of the reports scraping run (typo 'scrapping' fixed)."""
     LOGGER.info('Finished scraping reports')
Exemple #18
0
 def before_request(self):
     """Log the start of the reports scraping run (typo 'scrapping' fixed)."""
     LOGGER.info('Start scraping reports')
Exemple #19
0
 def after_request(self):
     """Log the end of the procurements scraping run (typo 'scrapping' fixed)."""
     LOGGER.info('Finished scraping procurements')
Exemple #20
0
 def after_request(self):
     """Log the end of the meetings scraping run (typo 'scrapping' fixed)."""
     LOGGER.info('Finished scraping meetings')
Exemple #21
0
 def _log_before_request(self, *args, **kwargs):
     # Log the incoming request's endpoint path and query args.  Relies on a
     # `request` object from the enclosing module's imports (presumably
     # Flask's request proxy, given .full_path/.args -- TODO confirm).
     LOGGER.info(f'Request for endpoint: {request.full_path}')
     LOGGER.info(f'With args: {request.args}')
Exemple #22
0
 def after_request(self):
     """Log the end of the announcements scraping run (typo 'scrapping' fixed)."""
     LOGGER.info('Finished scraping announcements')
Exemple #23
0
 def before_request(self):
     """Log the start of the newsletter scraping run (typo 'scrapping' fixed)."""
     LOGGER.info('Start scraping newsletter')
Exemple #24
0
 def after_request(self):
     """Log the end of the invoices scraping run (typo 'scrapping' fixed)."""
     LOGGER.info('Finished scraping invoices')
Exemple #25
0
 def after_request(self):
     """Log the end of the newsletter scraping run (typo 'scrapping' fixed)."""
     LOGGER.info('Finished scraping newsletter')
Exemple #26
0
 def after_request(self):
     """Log the end of the orders scraping run (typo 'scrapping' fixed)."""
     LOGGER.info('Finished scraping orders')
Exemple #27
0
 def after_request(self):
     """Log the end of the resolutions scraping run (typo 'scrapping' fixed)."""
     LOGGER.info('Finished scraping resolutions')
Exemple #28
0
 def before_request(self):
     """Log the start of the procurement scraping run (typo 'scrapping' fixed)."""
     LOGGER.info('Start scraping procurement')
Exemple #29
0
# Directory holding one module per CLI sub-command (<name>.py exposing `cli`).
plugin_folder = project_path('commands')


class CLI(click.MultiCommand):
    """Click multi-command whose sub-commands are discovered at runtime.

    Each ``<name>.py`` module in ``plugin_folder`` that defines a ``cli``
    object becomes a sub-command named ``<name>``.
    """

    def list_commands(self, ctx):
        """Return the sorted names of all available sub-commands."""
        return sorted(
            filename[:-3] for filename in os.listdir(plugin_folder)
            if filename.endswith('.py'))

    def get_command(self, ctx, name):
        """Load ``<name>.py`` from the plugin folder and return its ``cli``.

        Returns ``None`` for unknown names so click can report
        'No such command' instead of crashing with FileNotFoundError
        (missing file) or KeyError (module without a ``cli`` object).
        """
        fn = os.path.join(plugin_folder, name + '.py')
        if not os.path.isfile(fn):
            return None
        ns = {}
        with open(fn) as f:
            code = compile(f.read(), fn, 'exec')
            # eval() on an 'exec'-mode code object executes it; this is the
            # pattern from click's custom-multi-command documentation.
            eval(code, ns, ns)
        return ns.get('cli')


# Module-level entry point: the dynamically-discovering multi-command.
cli = CLI()

if __name__ == '__main__':
    try:
        # Record the exact invocation for traceability.
        LOGGER.info(' '.join(sys.argv))
        exit_code = cli(standalone_mode=False)
        sys.exit(exit_code)
    except Exception as exc:
        LOGGER.exception(exc)
Exemple #30
0
 def before_request(self):
     """Log the start of the resolutions scraping run (typo 'scrapping' fixed)."""
     LOGGER.info('Start scraping resolutions')