Example 1
def parse_zagreb_document_link(docu_url: str) -> tuple:
    site = requests_retry_session().get(root_url + docu_url).content
    soup = BeautifulSoup(site, 'html.parser')
    docu_text = soup.select('tr td b font')
    docu_link = soup.select('tr td a')[0].attrs['href']

    # Strip all <br/> from soup
    for br in soup.findAll('br'):
        br.extract()

    # Get document title
    docu_title = ''
    for sub_docu_text in docu_text:
        if sub_docu_text.contents:
            if sub_docu_text.contents[0] != 'Dodatni opis':
                docu_title += sub_docu_text.contents[0] + ' '

    docu_file_type = 'unknown'
    if docu_link.endswith('.docx'):
        docu_raw_data = extract_docxfile_data(root_url + docu_link)
        docu_file_type = 'docx'
    elif docu_link.endswith('.pdf'):
        docu_raw_data = extract_pdffile_data(root_url + docu_link)
        docu_file_type = 'pdf'
    else:
        # For old Word documents and other file types
        docu_raw_data = 'The search engine could not extract data from this file.' \
                        ' Navigate to this URL to download the file: {}'.format(root_url + docu_link)
    return docu_title, docu_raw_data, docu_file_type
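All of the examples on this page call a shared requests_retry_session() helper and a module-level root_url constant (the base URL of the scraped site) that are defined elsewhere in the scraper package and are not shown here. As a point of reference, a minimal sketch of what such a helper typically looks like, assuming the standard requests/urllib3 retry pattern (the retry counts and status codes below are illustrative, not the project's actual settings):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def requests_retry_session(retries=3, backoff_factor=0.5, session=None):
    """Return a requests.Session that retries transient connection and 5xx errors."""
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        connect=retries,
        read=retries,
        backoff_factor=backoff_factor,
        status_forcelist=(500, 502, 503, 504),
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session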
Example 2
def extract_pdffile_data(url_pdf: str) -> str:
    response = requests_retry_session().get(url_pdf)
    file = io.BytesIO(response.content)
    pdf_reader = fitz.open(stream=file, filetype='pdf')
    pdf_raw_data = ''
    # Concatenate the text of every page; uses the current PyMuPDF snake_case
    # API (older releases exposed pageCount/loadPage/getText instead)
    for page_number in range(pdf_reader.page_count):
        pdf_raw_data += pdf_reader.load_page(page_number).get_text() + '\n'
    return pdf_raw_data
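Example 2 assumes io (standard library) and fitz (the PyMuPDF package) are imported at module level. A hypothetical call, with an invented document URL:

import io    # needed by extract_pdffile_data for the in-memory buffer
import fitz  # PyMuPDF, provides fitz.open()

# The URL below is invented, purely for illustration
pdf_text = extract_pdffile_data(root_url + '/sjednice/primjer.pdf')
print(pdf_text[:300])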
Example 3
def parse_subject_details(url: str) -> dict:
    site = requests_retry_session().get(url).content
    soup = BeautifulSoup(site, 'html.parser')

    text = get_visible_text(soup)
    subject_details = {'text': text}

    act_titles = [el.get_text().strip() for el in soup.select('td a')]
    act_urls = [el.attrs['href'].lower() for el in soup.select('td a')]

    acts = []
    for i, act_url in enumerate(act_urls):
        site = requests_retry_session().get(root_url + act_url).content
        soup = BeautifulSoup(site, 'html.parser')
        act_content = get_visible_text(soup)
        act_title = act_titles[i]
        acts.append(
            {
                'act_content': act_content,
                'act_url': act_url,
                'act_title': act_title,
                'act_file_type': 'HTML',
            }
        )

    # Check for word or pdf attachments
    if "<a href='" in text:
        # Regex to extract link to document
        docu_urls = re.findall("<a href='(.*)','Dokument", text)
        for docu_url in docu_urls:
            docu_title, docu_raw_data, docu_file_type = parse_zagreb_document_link(docu_url)
            acts.append(
                {
                    'act_content': docu_raw_data,
                    'act_url': docu_url,
                    'act_title': docu_title,
                    'act_file_type': docu_file_type,
                }
            )
    subject_details['acts'] = acts
    return subject_details
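The re.findall pattern in Example 3 pulls the attachment path out of anchor markup that appears verbatim in the page text. For illustration only (the sample string below is invented, not taken from the live site):

import re

sample = "<a href='/sjednice/2020/akt_17.pdf','Dokument 17')"
print(re.findall("<a href='(.*)','Dokument", sample))
# ['/sjednice/2020/akt_17.pdf']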
Example 4
def parse_subjects_list(url: str) -> tuple:
    site = requests_retry_session().get(url).content
    soup = BeautifulSoup(site, 'html.parser')
    table_items = soup.select('.centralTD ul ol li')
    print(len(table_items), ' elements')
    subjects = []
    for table_item in table_items:
        content = table_item.contents[0]
        if hasattr(content, 'href'):
            link = content.attrs['href'].lower()
            try:
                subject_title = content.select('b')[0].contents[0]
                subjects.append({'subject_title': subject_title, 'subject_url': root_url + link})
            except (IndexError, AttributeError) as e:
                # IndexError covers items that have no <b> title element
                print('Error occurred while extracting subject title from {}: {}'.format(content, e))
    return subjects, len(table_items)
Example 5
def extract_months(year: str, url: str) -> list:
    payload = {
        '__Click': '$Refresh',
        'SaveOptions': 0,
        'server': 'szglotweb',
        'trazi': '',
        '%%Surrogate_rb_godina': 1,
        'rb_godina': year,
        '%%Surrogate_rb_sjednice': 1,
    }
    site = requests_retry_session().post(url, data=payload).content
    soup = BeautifulSoup(site, 'html.parser')
    months_soup = soup.find("select", {'name': 'rb_sjednice'}).findAll('option')
    month_list = months_soup[0].text.split('\n')
    # Remove any empty items from month_list
    month_list = list(filter(None, month_list))
    return month_list
Example 6
def extract_dates(year_range: str, url_suffix: str) -> None:
    url = root_url + url_suffix
    months_on_file = list(ScraperPeriod.objects.all().values_list('period_text', flat=True))
    site = requests_retry_session().get(url).content
    soup = BeautifulSoup(site, 'html.parser')
    years_soup = soup.find("select", {'name': 'rb_godina'}).findAll('option')
    year_list = years_soup[0].text.split('\n')
    for year in year_list:
        month_list = extract_months(year, url)
        for month in month_list:
            if month not in months_on_file:
                start_date, end_date = parse_date_range(month)
                print('Adding {} to {}'.format(month, year_range))
                ScraperPeriod.objects.create(
                    period_text=month,
                    year_range=year_range,
                    start_date=start_date,
                    end_date=end_date,
                )
Example 7
def get_visible_text(soup) -> str:
    # Additional check for pages with JavaScript redirects
    if 'location.replace(' in soup.getText():
        # Regex to extract redirect URL from the 'else' branch of Javascript code
        result = re.search('else\n {4}location.replace[(]"(.*)"[)];', soup.getText())
        act_url = result.group(1)
        site = requests_retry_session().get(root_url + act_url).content
        soup = BeautifulSoup(site, 'html.parser')
        text = get_visible_text(soup)
        return text
    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()  # rip it out
    text = soup.getText()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text
Example 8
def scrape_new_periods(dummy='dummy'):
    """
    Searches for any new periods on the Grad Split website and adds them to the database.
    Note: The dummy input is required for requests_patcher (mock requests) to work.
    """
    print('Searching for any new date ranges to be scraped from Grad Split database...')
    full_url = root_url + '/gradska-uprava/gradonacelnik/akti-gradonacelnika'
    site = requests_retry_session().get(full_url).content
    soup = BeautifulSoup(site, 'html.parser')
    raw_urls = soup.select('.c-documents-list__item-link')

    # Create dictionary of links:titles
    raw_urls_dict = {
        url['href']:                                               # dict key of hrefs
        text_to_date(url.div.get_text(strip=True))                 # dict value of titles in datetime format
        for url in raw_urls
    }
    raw_urls_set = {k for k in raw_urls_dict}

    existing_urls = set(ScraperPeriod.objects.values_list('url', flat=True))
    new_urls = raw_urls_set.difference(existing_urls)

    # create new ScraperPeriod objects if required
    if new_urls:
        for url in new_urls:
            date = raw_urls_dict[url]

            # fix bug for date in 2107
            if date.strftime('%Y') == '2107':
                date = date.replace(year=2017)

            print(f"Found new period: {date.strftime('%d %b %Y')}...")
            ScraperPeriod.objects.create(
                url=url,
                date=date,
                date_text=date.strftime('%d.%b.%Y')
            )
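Examples 5, 6 and 8 write to a ScraperPeriod Django model whose definition is not shown on this page. Judging only from the fields referenced in these snippets, a minimal sketch of such a model might look like the following; the field types and null/blank settings are assumptions, and in the real project the Zagreb and Split scrapers may well use separate models:

from django.db import models


class ScraperPeriod(models.Model):
    # Fields used by the Zagreb scraper (Examples 5 and 6)
    period_text = models.CharField(max_length=255, blank=True)
    year_range = models.CharField(max_length=50, blank=True)
    start_date = models.DateField(null=True, blank=True)
    end_date = models.DateField(null=True, blank=True)

    # Fields used by the Split scraper (Example 8)
    url = models.CharField(max_length=500, blank=True)
    date = models.DateField(null=True, blank=True)
    date_text = models.CharField(max_length=50, blank=True)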
Example 9
def extract_docxfile_data(url_docx: str) -> str:
    response = requests_retry_session().get(url_docx)
    file = io.BytesIO(response.content)
    return docx2txt.process(file)
def test_function_extracts_correct_data_from_page_with_hidden_redirect(self):
    site = requests_retry_session().get(self.test_url_with_redirect).content
    soup = BeautifulSoup(site, 'html.parser')
    text = scrape_utils_html.get_visible_text(soup)
    self.assertIn('Gradonačelnik Grada Zagreba', text)