def request(self):
    try:
        with Database() as session:
            with Browser() as browser_session:
                url = self.url(Category.NEWSLETTER.value)
                response = browser_session.get(url)
                response.raise_for_status()
                soup = bs4.BeautifulSoup(response.content, 'html.parser')
                elements = soup.select('#content-left a')
                LOGGER.info('{} newsletters have been found'.format(len(elements)))
                if len(elements) == 0:
                    LOGGER.info('Done, no more newsletters')
                    return
                for element in reversed(elements):
                    href = element['href']
                    title = element.text
                    model, created = session.get_or_create(Newsletter, url=href, title=title)
                    if not created:
                        LOGGER.info(f'Newsletter "{href}" ...skipped (duplicate)')
                    else:
                        LOGGER.info(f'{href} ...added')
    except requests.exceptions.HTTPError as exc:
        LOGGER.warning('Scraping {} {} ...skipping'.format(url, exc.response.status_code))
        raise exc
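# The scraper plugins in this module call session.get_or_create(Model, **kwargs)
# and branch on the returned `created` flag to skip duplicates. The Database
# class itself is not shown in this section; the following is a minimal
# SQLAlchemy-style sketch of the assumed contract. The injected `sa_session`
# is hypothetical -- the real Database() takes no arguments and presumably
# builds its own engine and session internally.
from sqlalchemy.orm import Session as SASession


class DatabaseSketch:
    """Context manager exposing get_or_create() (illustrative only)."""

    def __init__(self, sa_session: SASession):
        self.sa_session = sa_session

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.sa_session.close()

    def get_or_create(self, model, **kwargs):
        # Look up an existing row by the given columns; insert one when absent.
        instance = self.sa_session.query(model).filter_by(**kwargs).first()
        if instance is not None:
            return instance, False
        instance = model(**kwargs)
        self.sa_session.add(instance)
        self.sa_session.commit()
        return instance, True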
def cli():
    return_code = 0
    LOGGER.info('Start')
    MeetingPlugin().run()
    AnnouncementPlugin().run()
    NewsletterPlugin().run()
    BudgetPlugin().run()
    ProcurementPlugin().run()
    ReportPlugin().run()
    OrderPlugin().run()
    InvoicePlugin().run()
    ContractPlugin().run()
    TablePlugin().run()
    ResolutionPlugin().run()
    TranscriptPlugin().run()
    VZNPlugin().run()
    LOGGER.info('Done')
    return return_code
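# Every plugin above exposes before_request()/request()/after_request(), so
# run() presumably chains them in that order. A minimal sketch of the assumed
# base class (the name BasePluginSketch is hypothetical; the real base class
# is not part of this section):
class BasePluginSketch:

    def run(self):
        self.before_request()
        self.request()
        self.after_request()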
def __init__(self, max_retries: int = 5):
    LOGGER.info("Creating browser session")
    self.session = Session()
    LOGGER.info("Injecting headers into the browser")
    self.session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/88.0",
        "Accept-Language": "sk,en-US;q=0.7,en;q=0.3",
    })
    if max_retries:
        retry_strategy = Retry(total=max_retries)
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("https://", adapter)
        self.session.mount("http://", adapter)
    super().__init__()
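# What the retry setup above buys: with Retry(total=5) mounted on both schemes,
# connection-level failures are retried up to five times before the call raises;
# plain HTTP error statuses are generally not retried here because no
# status_forcelist is configured. Hypothetical usage (the URL is a placeholder):
#
#     with Browser() as browser_session:
#         response = browser_session.get('https://example.com/dokumenty')
#         response.raise_for_status()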
def request(self):
    with Database() as session:
        with Browser() as browser_session:
            url = self.url(Category.INVOICE.value)
            loop = True
            while loop:
                try:
                    response = browser_session.get(url)
                    response.raise_for_status()
                    soup = bs4.BeautifulSoup(response.content, 'html.parser')
                    elements = soup.select('#dokumenty table.tabulka tr:not(.hlavicka)')
                    if len(elements) == 0:
                        LOGGER.info('Done, no more invoices')
                        return
                    for element in elements:
                        published, _, title, _, _, _, document = element.findChildren('td')
                        link = document.findChild('a').attrs.get('href')
                        size_in_mb = re.search(r'([0-9\.]+)', document.text).groups()[0]
                        # Check the document link itself, not the listing page URL
                        is_pdf = re.search(r'\.pdf$', link)
                        if is_pdf:
                            model, created = session.get_or_create(
                                Invoice,
                                published=datetime.date.fromisoformat(published.text),
                                title=title.text,
                                url=link,
                                size_in_mb=size_in_mb)
                            if not created:
                                LOGGER.info(f'Invoice {model.url} ...skipped (duplicate)')
                                loop = False
                            else:
                                LOGGER.info(f'{model.url} ...added')
                        else:
                            # No model exists in this branch; log the raw link
                            LOGGER.warning(f'Invoice {link} ...skipped (not PDF)')
                    next_url = soup.select_one(
                        '#dokumenty table:first-of-type [align="right"] a:nth-last-child(2)'
                    ).attrs.get('href')
                    next_url = urljoin(self.base_url(response.url), next_url)
                    # FIXME: first page can be w/o the page number
                    if next_url == url:
                        return
                    url = next_url
                except requests.exceptions.HTTPError as exc:
                    LOGGER.warning('Scraping {} {} ...skipping'.format(url, exc.response.status_code))
                    raise exc
def request(self):
    try:
        with Database() as session:
            with Browser() as browser_session:
                url = self.url(Category.BUDGET.value)
                response = browser_session.get(url)
                response.raise_for_status()
                soup = bs4.BeautifulSoup(response.content, 'html.parser')
                elements = soup.select('#content-left a')
                LOGGER.info('{} budgets have been found'.format(len(elements)))
                if len(elements) == 0:
                    LOGGER.info('Done, no more budgets')
                    return
                for element in reversed(elements):
                    href = element['href']
                    title = element.text
                    if not re.search(r'\d', title):
                        # Titles without a digit get the preceding <h2> text appended
                        suffix = element.findPreviousSibling('h2').text
                        title = f'{title} {suffix}'
                    model, created = session.get_or_create(Budget, url=href, title=title)
                    if not created:
                        LOGGER.info(f'Budget "{href}" ...skipped (duplicate)')
                    else:
                        LOGGER.info(f'{href} ...added')
    except requests.exceptions.HTTPError as exc:
        LOGGER.warning('Scraping {} {} ...skipping'.format(url, exc.response.status_code))
        raise exc
def request(self):
    try:
        with Database() as session:
            with Browser() as browser_session:
                url = self.url(Category.REPORT.value)
                response = browser_session.get(url)
                response.raise_for_status()
                soup = bs4.BeautifulSoup(response.content, 'html.parser')
                elements = soup.select('#content-left tr')
                LOGGER.info('{} reports have been found'.format(len(elements)))
                if len(elements) == 0:
                    LOGGER.info('Done, no more reports')
                    return
                for element in elements:
                    date, title = element.findChildren('td')
                    title = re.sub(r'[\n\s]+', ' ', title.text)
                    model, created = session.get_or_create(
                        Report, date=date.text.strip(), title=title.strip())
                    if not created:
                        LOGGER.info(f'Report "{model.title[:30]}..." ...skipped (duplicate)')
                    else:
                        LOGGER.info(f'"{model.title[:40]}..." ...added')
    except requests.exceptions.HTTPError as exc:
        LOGGER.warning('Scraping {} {} ...skipping'.format(url, exc.response.status_code))
        raise exc
def after_request(self):
    LOGGER.info('Finished scraping contracts')


def before_request(self):
    LOGGER.info('Start scraping contracts')
def decode_scandata(self):
    # Initialise decode variables from the per-component quantization info
    horzY = self.colorQuantization[1]['Horz']
    horzCr = self.colorQuantization[2]['Horz']
    horzCb = self.colorQuantization[3]['Horz']
    vertY = self.colorQuantization[1]['Vert']
    vertCr = self.colorQuantization[2]['Vert']
    vertCb = self.colorQuantization[3]['Vert']
    scanY = horzY * vertY
    scanCr = horzCr * vertCr
    scanCb = horzCb * vertCb
    LOGGER.log(CustomLoggingLevel.IMAGE_DEBUG,
               "scanY: %d, scanCr: %d, scanCb: %d." % (scanY, scanCr, scanCb))
    self.baseY = 0
    self.baseCr = 0
    self.baseCb = 0
    rowData = [[0, 0, 0] for i in range(self.width * self.height)]
    if vertCb != 1 or vertCr != 1 or horzCb != 1 or horzCr != 1:
        LOGGER.error(
            'Error in decode scan data, ONLY support vertCb==vertCr==horzCb==horzCr==1!')
    else:
        LOGGER.log(CustomLoggingLevel.IMAGE_DEBUG, 'Start to decode scan data.')
        # Calculate the MCU block counts; integer division so range() gets an int
        hBlock = self.width // (horzY * 8)
        if self.width % (horzY * 8) != 0:
            hBlock += 1
        vBlock = self.height // (vertY * 8)
        if self.height % (vertY * 8) != 0:
            vBlock += 1
        widthIndex = 0
        heightIndex = 0
        unusualDataFlagRight = False
        unusualDataFlagBottom = False
        for vb in range(vBlock):
            for hb in range(hBlock):
                dataY = []
                dataCr = []
                dataCb = []
                [dataY, dataCr, dataCb] = self.read_mcu_block(
                    scanY, scanCr, scanCb, dataY, dataCr, dataCb)
                for i in range(vertY):
                    heightIndex = (vb * vertY + i) * 8
                    for j in range(horzY):
                        widthIndex = (hb * horzY + j) * 8
                        for k in range(8):
                            if heightIndex + k < self.height:
                                for l in range(8):
                                    if widthIndex + l < self.width:
                                        y = dataY[i * horzY + j][k * 8 + l]
                                        cr = dataCr[0][k * 8 + l]
                                        cb = dataCb[0][k * 8 + l]
                                        # YCbCr -> RGB with the +128 level shift
                                        r = self.round(y + 1.402 * cr + 128)
                                        b = self.round(y + 1.772 * cb + 128)
                                        g = self.round(y - 0.34414 * cb - 0.71414 * cr + 128)
                                        rowData[(heightIndex + k) * self.width
                                                + widthIndex + l] = [r, g, b]
            LOGGER.info('Please wait, decoding ... (%d/%d)' % (vb, vBlock))
        self.clean_bitstream_remainder()
    if self.scanDataIndex < self.scanDataLength:
        self.showextradata(''.join(self.scanData[self.scanDataIndex:]),
                           self.scanDataPos + self.scanDataIndex)
    return rowData
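# The inner loop above applies the standard JPEG (JFIF) YCbCr -> RGB transform
# with a +128 level shift after the IDCT; self.round presumably clamps to the
# 0..255 byte range. A self-contained sketch of just that conversion step
# (function name hypothetical):
def ycbcr_to_rgb(y: float, cb: float, cr: float) -> tuple:
    def clamp(value: float) -> int:
        return max(0, min(255, int(round(value))))

    red = clamp(y + 1.402 * cr + 128)
    green = clamp(y - 0.34414 * cb - 0.71414 * cr + 128)
    blue = clamp(y + 1.772 * cb + 128)
    return red, green, blue


# Example: an all-zero sample decodes to mid-grey.
assert ycbcr_to_rgb(0, 0, 0) == (128, 128, 128)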
def request(self):
    try:
        with Database() as session:
            with Browser() as browser_session:
                url = self.url(Category.PROCUREMENT.value)
                response = browser_session.get(url)
                response.raise_for_status()
                soup = bs4.BeautifulSoup(response.content, 'html.parser')
                main_element = soup.select_one('#content-left ol')
                procurements = main_element.select('a')
                offers = main_element.find_next_siblings('a')
                LOGGER.info('{} procurements have been found'.format(len(procurements)))
                if len(procurements) == 0:
                    LOGGER.info('Done, no more procurements')
                for element in procurements:
                    href = element['href']
                    title = element.text
                    model, created = session.get_or_create(Procurement, url=href, title=title)
                    if not created:
                        LOGGER.info(f'Procurement "{href}" ...skipped (duplicate)')
                    else:
                        LOGGER.info(f'{href} ...added')
                LOGGER.info('{} offers have been found'.format(len(offers)))
                if len(offers) == 0:
                    LOGGER.info('Done, no more offers')
                    return
                for element in offers:
                    href = element['href']
                    title = element.text
                    model, created = session.get_or_create(
                        Procurement, url=href, title=title, is_offer=True)
                    if not created:
                        LOGGER.info(f'Offer "{href}" ...skipped (duplicate)')
                    else:
                        LOGGER.info(f'{href} ...added')
    except requests.exceptions.HTTPError as exc:
        LOGGER.warning('Scraping {} {} ...skipping'.format(url, exc.response.status_code))
        raise exc
def before_request(self):
    LOGGER.info('Start scraping invoices')


def before_request(self):
    LOGGER.info('Start scraping meetings')
def _log_post_request(self, response):
    LOGGER.info(f'Response status: {response.status}')
    LOGGER.info(f'Response json: {response.json}')
    return response
def before_request(self):
    LOGGER.info('Start scraping announcements')


def after_request(self):
    LOGGER.info('Finished scraping table docs')


def before_request(self):
    LOGGER.info('Start scraping table docs')


def after_request(self):
    LOGGER.info('Finished scraping reports')


def before_request(self):
    LOGGER.info('Start scraping reports')


def after_request(self):
    LOGGER.info('Finished scraping procurements')


def after_request(self):
    LOGGER.info('Finished scraping meetings')
def _log_before_request(self, *args, **kwargs):
    LOGGER.info(f'Request for endpoint: {request.full_path}')
    LOGGER.info(f'With args: {request.args}')
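# Both logging helpers take `self`, so they presumably live on a class that owns
# a Flask app and registers them as request hooks. A minimal sketch of that
# wiring (class name and app setup hypothetical; the two helpers as defined in
# this section):
from flask import Flask


class ApiSketch:

    def __init__(self):
        self.app = Flask(__name__)
        # Flask calls before_request hooks with no arguments and passes the
        # outgoing response object to after_request hooks, which must return it.
        self.app.before_request(self._log_before_request)
        self.app.after_request(self._log_post_request)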
def after_request(self):
    LOGGER.info('Finished scraping announcements')


def before_request(self):
    LOGGER.info('Start scraping newsletter')


def after_request(self):
    LOGGER.info('Finished scraping invoices')


def after_request(self):
    LOGGER.info('Finished scraping newsletter')


def after_request(self):
    LOGGER.info('Finished scraping orders')


def after_request(self):
    LOGGER.info('Finished scraping resolutions')


def before_request(self):
    LOGGER.info('Start scraping procurement')
plugin_folder = project_path('commands')


class CLI(click.MultiCommand):

    def list_commands(self, ctx):
        rv = []
        for filename in os.listdir(plugin_folder):
            if filename.endswith('.py'):
                rv.append(filename[:-3])
        rv.sort()
        return rv

    def get_command(self, ctx, name):
        ns = {}
        fn = os.path.join(plugin_folder, name + '.py')
        with open(fn) as f:
            code = compile(f.read(), fn, 'exec')
            eval(code, ns, ns)
        return ns['cli']


cli = CLI()

if __name__ == '__main__':
    try:
        LOGGER.info(' '.join(sys.argv))
        sys.exit(cli(standalone_mode=False))
    except Exception as e:
        LOGGER.exception(e)
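# With this layout, every commands/<name>.py that defines a module-level `cli`
# becomes a subcommand: list_commands() enumerates the plugin folder and
# get_command() executes the matching file to pull out its `cli` object (the
# compile/eval pattern follows click's documented custom-multi-command example).
# Hypothetical invocation, assuming the entry point is main.py and a
# commands/scrape.py exists:
#
#     $ python main.py scrape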
def before_request(self):
    LOGGER.info('Start scraping resolutions')