import io
import re

import docx2txt
import fitz  # PyMuPDF
from bs4 import BeautifulSoup, Tag

# root_url, the ScraperPeriod Django model and text_to_date are assumed to be
# defined or imported elsewhere in this module.


def parse_zagreb_document_link(docu_url: str) -> tuple:
    site = requests_retry_session().get(root_url + docu_url).content
    soup = BeautifulSoup(site, 'html.parser')
    docu_text = soup.select('tr td b font')
    docu_link = soup.select('tr td a')[0].attrs['href']

    # Strip all <br/> tags from the soup
    for br in soup.find_all('br'):
        br.extract()

    # Build the document title, skipping the 'Dodatni opis' (additional description) label
    docu_title = ''
    for sub_docu_text in docu_text:
        if sub_docu_text.contents:
            if sub_docu_text.contents[0] != 'Dodatni opis':
                docu_title += sub_docu_text.contents[0] + ' '

    docu_file_type = 'unknown'
    if docu_link.endswith('.docx'):
        docu_raw_data = extract_docxfile_data(root_url + docu_link)
        docu_file_type = 'docx'
    elif docu_link.endswith('.pdf'):
        docu_raw_data = extract_pdffile_data(root_url + docu_link)
        docu_file_type = 'pdf'
    else:
        # For old Word documents and other file types
        docu_raw_data = 'The search engine could not extract data from this file.' \
                        ' Navigate to this URL to download the file: {}'.format(root_url + docu_link)
    return docu_title, docu_raw_data, docu_file_type
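# NOTE: requests_retry_session() is used throughout this module but defined
# elsewhere. A minimal sketch of the standard requests retry-session recipe it
# presumably follows (the retry counts and status codes here are assumptions):
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def requests_retry_session(retries=3, backoff_factor=0.3,
                           status_forcelist=(500, 502, 504), session=None):
    # Mount an HTTPAdapter that retries failed requests with exponential backoff
    session = session or requests.Session()
    retry = Retry(total=retries, read=retries, connect=retries,
                  backoff_factor=backoff_factor, status_forcelist=status_forcelist)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session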
def extract_pdffile_data(url_pdf: str) -> str:
    response = requests_retry_session().get(url_pdf)
    file = io.BytesIO(response.content)
    pdf_reader = fitz.open(stream=file, filetype='pdf')
    pdf_raw_data = ''
    # PyMuPDF >= 1.20 removed the old camelCase API (pageCount, loadPage,
    # getText); iterating the document and calling get_text() is the current
    # equivalent.
    for page in pdf_reader:
        pdf_raw_data += page.get_text() + '\n'
    pdf_reader.close()
    return pdf_raw_data
def parse_subject_details(url: str) -> dict:
    site = requests_retry_session().get(url).content
    soup = BeautifulSoup(site, 'html.parser')
    text = get_visible_text(soup)
    subject_details = {'text': text}
    act_titles = [el.get_text().strip() for el in soup.select('td a')]
    act_urls = [el.attrs['href'].lower() for el in soup.select('td a')]
    acts = []
    for i, act_url in enumerate(act_urls):
        site = requests_retry_session().get(root_url + act_url).content
        soup = BeautifulSoup(site, 'html.parser')
        act_content = get_visible_text(soup)
        act_title = act_titles[i]
        acts.append(
            {
                'act_content': act_content,
                'act_url': act_url,
                'act_title': act_title,
                'act_file_type': 'HTML',
            }
        )
    # Check for Word or PDF attachments
    if "<a href='" in text:
        # Non-greedy regex to extract the link to each document
        docu_urls = re.findall("<a href='(.*?)','Dokument", text)
        for docu_url in docu_urls:
            docu_title, docu_raw_data, docu_file_type = parse_zagreb_document_link(docu_url)
            acts.append(
                {
                    'act_content': docu_raw_data,
                    'act_url': docu_url,
                    'act_title': docu_title,
                    'act_file_type': docu_file_type,
                }
            )
    subject_details['acts'] = acts
    return subject_details
def parse_subjects_list(url: str) -> tuple:
    site = requests_retry_session().get(url).content
    soup = BeautifulSoup(site, 'html.parser')
    table_items = soup.select('.centralTD ul ol li')
    print(len(table_items), ' elements')
    subjects = []
    for table_item in table_items:
        content = table_item.contents[0]
        # Only Tag nodes can carry an href attribute; NavigableStrings cannot
        if isinstance(content, Tag) and content.has_attr('href'):
            link = content.attrs['href'].lower()
            try:
                subject_title = content.select('b')[0].contents[0]
                subjects.append({'subject_title': subject_title, 'subject_url': root_url + link})
            except (AttributeError, IndexError) as e:
                print('Error occurred while extracting subject title from {}: {}'.format(content, e))
    return subjects, len(table_items)
def extract_months(year: str, url: str) -> list:
    # Hidden form fields required by the Lotus Domino backend; Domino names its
    # surrogate fields '%%Surrogate_<fieldname>'.
    payload = {
        '__Click': '$Refresh',
        'SaveOptions': 0,
        'server': 'szglotweb',
        'trazi': '',
        '%%Surrogate_rb_godina': 1,
        'rb_godina': year,
        '%%Surrogate_rb_sjednice': 1,
    }
    site = requests_retry_session().post(url, data=payload).content
    soup = BeautifulSoup(site, 'html.parser')
    months_soup = soup.find('select', {'name': 'rb_sjednice'}).find_all('option')
    month_list = months_soup[0].text.split('\n')
    # Remove any empty items from month_list
    month_list = list(filter(None, month_list))
    return month_list
def extract_dates(year_range: str, url_suffix: str) -> None:
    url = root_url + url_suffix
    months_on_file = list(ScraperPeriod.objects.all().values_list('period_text', flat=True))
    site = requests_retry_session().get(url).content
    soup = BeautifulSoup(site, 'html.parser')
    years_soup = soup.find('select', {'name': 'rb_godina'}).find_all('option')
    year_list = years_soup[0].text.split('\n')
    # Remove any empty items, as in extract_months
    year_list = list(filter(None, year_list))
    for year in year_list:
        month_list = extract_months(year, url)
        for month in month_list:
            if month not in months_on_file:
                start_date, end_date = parse_date_range(month)
                print('Adding {} to {}'.format(month, year_range))
                ScraperPeriod.objects.create(
                    period_text=month,
                    year_range=year_range,
                    start_date=start_date,
                    end_date=end_date,
                )
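# NOTE: parse_date_range() is used above but not defined in this file. A
# minimal, purely hypothetical sketch, assuming each period text contains two
# dd.mm.yyyy dates (the site's real period_text format is not shown here):
from datetime import date


def parse_date_range(period_text: str) -> tuple:
    # Pull the first two dd.mm.yyyy occurrences and treat them as start/end
    found = re.findall(r'(\d{2})\.(\d{2})\.(\d{4})', period_text)
    start_date, end_date = [date(int(y), int(m), int(d)) for d, m, y in found[:2]]
    return start_date, end_date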
def get_visible_text(soup) -> str:
    # Additional check for pages with JavaScript redirects
    if 'location.replace(' in soup.getText():
        # Regex to extract the redirect URL from the 'else' branch of the JavaScript code
        result = re.search(r'else\n {4}location\.replace[(]"(.*)"[)];', soup.getText())
        act_url = result.group(1)
        site = requests_retry_session().get(root_url + act_url).content
        soup = BeautifulSoup(site, 'html.parser')
        text = get_visible_text(soup)
        return text
    # Kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()  # rip it out
    text = soup.getText()
    # Break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # Break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # Drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text
def scrape_new_periods(dummy='dummy'):
    """
    Searches for any new periods on the Grad Split website and adds them to the database.

    Note: The dummy input is required for requests_patcher (mock requests) to work.
    """
    print('Searching for any new date ranges to be scraped from Grad Split database...')
    full_url = root_url + '/gradska-uprava/gradonacelnik/akti-gradonacelnika'
    site = requests_retry_session().get(full_url).content
    soup = BeautifulSoup(site, 'html.parser')
    raw_urls = soup.select('.c-documents-list__item-link')
    # Create a dictionary mapping links to dates
    raw_urls_dict = {
        url['href']:  # dict key of hrefs
        text_to_date(url.div.get_text(strip=True))  # dict value of titles in datetime format
        for url in raw_urls
    }
    raw_urls_set = set(raw_urls_dict)
    existing_urls = set(ScraperPeriod.objects.values_list('url', flat=True))
    new_urls = raw_urls_set.difference(existing_urls)
    # Create new ScraperPeriod objects if required
    for url in new_urls:
        date = raw_urls_dict[url]
        # Fix bug for a date recorded as being in the year 2107
        if date.strftime('%Y') == '2107':
            date = date.replace(year=2017)
        print(f"Found new period: {date.strftime('%d %b %Y')}...")
        ScraperPeriod.objects.create(
            url=url,
            date=date,
            date_text=date.strftime('%d.%b.%Y')
        )
def extract_docxfile_data(url_docx: str) -> str:
    response = requests_retry_session().get(url_docx)
    file = io.BytesIO(response.content)
    # docx2txt accepts a file-like object and returns the document's plain text
    return docx2txt.process(file)
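# NOTE: the test method below is shown without its surrounding class. A minimal
# sketch of the scaffolding it needs (the class name, module import path and
# test URL here are hypothetical -- the real ones are not shown in this file):
import unittest

from scrapers import scrape_utils_html  # hypothetical import path


class GetVisibleTextTests(unittest.TestCase):
    def setUp(self):
        # A page whose JavaScript location.replace() redirect hides its content
        self.test_url_with_redirect = root_url + '/example-page-with-js-redirect'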
    def test_function_extracts_correct_data_from_page_with_hidden_redirect(self):
        site = requests_retry_session().get(self.test_url_with_redirect).content
        soup = BeautifulSoup(site, 'html.parser')
        text = scrape_utils_html.get_visible_text(soup)
        self.assertIn('Gradonačelnik Grada Zagreba', text)