Exemple #1
0
def download_xml(link, verify_link=config.get_download_verify_link()):
    """Fetch ``link`` with an HTTP GET and return the raw response body.

    Args:
        link: URL to download.
        verify_link: Passed to ``requests.get`` as ``verify``. The default is
            read from ``config`` once, when this function is defined.

    Returns:
        ``page.content`` when the response status code is 200, otherwise
        ``None``. ``None`` is also returned when the connection fails.
    """
    try:
        page = requests.get(link, verify=verify_link)
    except requests.exceptions.ConnectionError as e:
        # Include the cause so a failed download can be diagnosed from the log.
        logging.warning('Unable to open connection. (%s)', e)
        return None
    return page.content if page.status_code == 200 else None
Exemple #2
0
def download_content(link,
                     verify_link=config.get_download_verify_link(),
                     post_parm=None,
                     headers=None,
                     encoding='utf-8'):
    """Download ``link`` and return the response body as text.

    A GET request is sent when ``post_parm`` is ``None``; otherwise a POST
    request carrying ``post_parm`` as form data is sent.

    Args:
        link: URL to download.
        verify_link: Passed to ``requests`` as ``verify``. The default is read
            from ``config`` once, when this function is defined.
        post_parm: Data for the POST body, or ``None`` to use GET.
        headers: Optional request headers. For POST requests the form
            ``Content-Type`` header is added on top of these (overriding a
            caller-supplied ``Content-Type`` key). The caller's mapping is
            never modified.
        encoding: Encoding assigned to the response before reading its text.

    Returns:
        ``page.text`` when the status code is 200, otherwise ``None``.
        ``None`` is also returned when the connection fails.
    """
    try:
        if post_parm is None:
            logging.debug('Downloading without post parameters.')
            page = requests.get(link, verify=verify_link, headers=headers)
        else:
            logging.debug('Downloading with post parameters.')
            headers_static = {
                "Content-Type":
                "application/x-www-form-urlencoded; charset=UTF-8"
            }
            if headers is not None:
                # Work on a copy: updating the caller's mapping in place would
                # leak the Content-Type header into their later requests.
                post_headers = headers.copy()
                post_headers.update(headers_static)
            else:
                post_headers = headers_static
            page = requests.post(link,
                                 verify=verify_link,
                                 data=post_parm,
                                 headers=post_headers)
    except requests.exceptions.ConnectionError as e:
        logging.warning('Unable to open connection. (%s)', e)
        return None
    # Applies to both request kinds; must be set before ``page.text`` is read.
    page.encoding = encoding
    return page.text if page.status_code == 200 else None
Exemple #3
0
def save_downloaded_soup(link,
                         file,
                         filetype,
                         post_data=None,
                         verify=config.get_download_verify_link(),
                         headers=None):
    """Return the content of ``link``, using ``file`` as an on-disk cache.

    Order of attempts:

    1. If cached data is enabled in ``config`` and ``file`` exists, the file
       is read with ``readfile`` and returned.
    2. Otherwise, if ``link`` is given, it is downloaded with
       ``download_content``; on success the content is written to ``file``.
       If the download fails, an existing ``file`` is used as a fallback.
    3. If ``link`` is ``None``, only an existing ``file`` is used.

    Args:
        link: URL to download, or ``None`` to rely on ``file`` only.
        file: Path of the cache file.
        filetype: A ``FileType`` member selecting how the content is parsed
            and written.
        post_data: Forwarded to ``download_content`` as the POST data.
        verify: Forwarded to ``download_content`` as ``verify_link``. The
            default is read from ``config`` once, at definition time.
        headers: Forwarded to ``download_content``.

    Returns:
        The parsed or raw content, or ``None`` when neither a download nor a
        previously saved file is available.
    """
    # Bind up front: the "nothing available" branches below only log, and
    # without this the final ``return soup`` raised UnboundLocalError.
    soup = None
    if config.get_download_use_cached_data() is True and os.path.isfile(file):
        soup = readfile(file, filetype)
    else:
        if link is not None:
            soup = download_content(link, verify, post_data, headers)
            if soup is not None:
                logging.info('We got content, write to file.')
                # exist_ok avoids the check-then-create race of a separate
                # os.path.exists() test.
                os.makedirs(config.get_directory_cache_url(), exist_ok=True)
                with open(file, mode='w', encoding='utf-8') as code:
                    if filetype == FileType.html:
                        soup = BeautifulSoup(soup, 'html.parser')
                        code.write(str(soup.prettify()))
                    elif filetype == FileType.xml:
                        # NOTE(review): download_content returns text, so
                        # from_encoding probably has no effect here - confirm.
                        soup = BeautifulSoup(soup,
                                             'lxml',
                                             from_encoding='utf-8')
                        logging.debug('original encoding: %s',
                                      soup.original_encoding)
                        code.write(str(soup.prettify()))
                    elif filetype == FileType.csv or filetype == FileType.json:
                        code.write(str(soup))
                    else:
                        # The raw downloaded text is still returned.
                        logging.error('Unexpected type to write: %s', filetype)
            else:
                if os.path.exists(file):
                    logging.info(
                        'The %s link returned error code other than 200 but there is an already downloaded file. Try to open it.',
                        link)
                    soup = readfile(file, filetype)
                else:
                    logging.warning(
                        'Skipping dataset: %s. There is not downloadable URL, nor already downbloaded file.',
                        link)
        else:
            if os.path.exists(file):
                soup = readfile(file, filetype)
                if filetype == FileType.html:
                    soup = BeautifulSoup(soup, 'html.parser')
                elif filetype == FileType.xml:
                    soup = BeautifulSoup(soup, 'lxml')
                logging.info(
                    'Using file only: %s. There is not downloadable URL only just the file. Do not forget to update file manually!',
                    file)
            else:
                logging.warning(
                    'Cannot use download and file: %s. There is not downloadable URL, nor already downbloaded file.',
                    file)
    return soup
Exemple #4
0
def save_downloaded_pd(link, file, verify=config.get_download_verify_link()):
    """Load a dataset as a DataFrame, caching it in ``file``.

    When cached data is enabled in ``config`` and ``file`` exists, the file is
    read with ``pd.read_csv``. Otherwise ``link`` is read directly by pandas
    as tab-separated UTF-16 text and the result is saved to ``file``.

    Args:
        link: Location handed to ``pd.read_csv`` for the fresh download.
        file: Path of the cache file.
        verify: Not used by this function; kept so existing callers that pass
            it keep working.

    Returns:
        The loaded ``DataFrame``.

    Raises:
        Whatever ``pd.read_csv`` raises when ``link`` cannot be read or
        parsed; read_csv never returns ``None``, so there is no "skip" path.
    """
    # ``== True`` kept on purpose: the getter's return type is not visible
    # here, and ``is True`` would behave differently for non-bool values.
    use_cache = config.get_download_use_cached_data() == True  # noqa: E712
    if use_cache and os.path.isfile(file):
        return pd.read_csv(file)

    df = pd.read_csv(link, encoding='UTF-16', sep='\t')
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(config.get_directory_cache_url(), exist_ok=True)
    # NOTE(review): to_csv also writes the index column, so a later cached
    # read presumably yields an extra unnamed column - confirm whether
    # callers depend on that before changing it.
    df.to_csv(file)
    return df
Exemple #5
0
def save_downloaded_xml(link, file, verify=config.get_download_verify_link()):
    """Return the raw bytes of ``link``, caching them in ``file``.

    When cached data is enabled in ``config`` and ``file`` exists, the file's
    bytes are returned without any download. Otherwise ``link`` is fetched
    with ``download_xml`` and, on success, written to ``file``.

    Args:
        link: URL to download.
        file: Path of the cache file (read and written in binary mode).
        verify: Forwarded to ``download_xml`` as ``verify_link``. The default
            is read from ``config`` once, at definition time.

    Returns:
        The content as returned by ``download_xml`` or read from ``file``, or
        ``None`` when the download did not succeed.
    """
    # ``== True`` kept on purpose: the getter's return type is not visible
    # here, and ``is True`` would behave differently for non-bool values.
    use_cache = config.get_download_use_cached_data() == True  # noqa: E712
    if use_cache and os.path.isfile(file):
        with open(file, 'rb') as content_file:
            return content_file.read()

    page = download_xml(link, verify)
    if page is None:
        logging.warning('Skipping dataset.')
        return None

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(config.get_directory_cache_url(), exist_ok=True)
    with open(file, mode='wb') as code:
        code.write(page)
    return page
Exemple #6
0
def save_downloaded_soup(link,
                         file,
                         post_data=None,
                         verify=config.get_download_verify_link()):
    """Return the page at ``link`` as a BeautifulSoup tree, cached in ``file``.

    When cached data is enabled in ``config`` and ``file`` exists, the file is
    parsed with ``html.parser``. Otherwise the page is fetched with
    ``download_soup`` and, on success, its string form is written to ``file``.

    Args:
        link: URL to download.
        file: Path of the cache file.
        post_data: Forwarded to ``download_soup`` as the POST data.
        verify: Forwarded to ``download_soup`` as ``verify_link``. The default
            is read from ``config`` once, at definition time.

    Returns:
        The parsed document, or ``None`` when the download did not succeed.
    """
    # ``== True`` kept on purpose: the getter's return type is not visible
    # here, and ``is True`` would behave differently for non-bool values.
    use_cache = config.get_download_use_cached_data() == True  # noqa: E712
    if use_cache and os.path.isfile(file):
        # The cache is written as UTF-8 below, so it must be read as UTF-8;
        # relying on the platform default breaks where that default differs.
        with open(file, 'r', encoding="utf8") as content_file:
            return BeautifulSoup(content_file.read(), 'html.parser')

    soup = download_soup(link, verify, post_data)
    if soup is None:
        logging.warning('Skipping dataset: %s.', link)
        return None

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(config.get_directory_cache_url(), exist_ok=True)
    with open(file, mode="w", encoding="utf8") as code:
        code.write(str(soup))
    return soup
Exemple #7
0
def save_downloaded_pd(link,
                       file,
                       verify=config.get_download_verify_link(),
                       headers=None):
    """Load a tab-separated dataset as a DataFrame, caching it in ``file``.

    Order of attempts:

    1. If cached data is enabled in ``config`` and ``file`` exists, the file
       is read and returned.
    2. Otherwise, if ``link`` is given, it is downloaded with
       ``download_content`` (decoded as UTF-16), written to ``file`` as UTF-8
       and read back. If the download fails, an existing ``file`` is used as
       a fallback.
    3. If ``link`` is ``None``, only an existing ``file`` is used.

    Args:
        link: URL to download, or ``None`` to rely on ``file`` only.
        file: Path of the cache file.
        verify: Forwarded to ``download_content`` as ``verify_link``. The
            default is read from ``config`` once, at definition time.
        headers: Optional request headers forwarded to ``download_content``.

    Returns:
        The loaded ``DataFrame``, or ``None`` when neither a download nor a
        previously saved file is available.
    """

    def read_cached():
        # The cache holds the downloaded text re-encoded as UTF-8 and still
        # tab-separated, so every read must use the same parameters.
        return pd.read_csv(file, encoding='UTF-8', sep='\t', skiprows=0)

    if config.get_download_use_cached_data() is True and os.path.isfile(file):
        # Previously read with the default comma separator, unlike every
        # other read of this same file.
        return read_cached()

    # Bind up front: the "nothing available" branches below only log, and
    # without this the final ``return df`` raised UnboundLocalError.
    df = None
    if link is not None:
        # ``headers`` used to be accepted but silently dropped here.
        cvs = download_content(link, verify, None, headers, 'utf-16')
        if cvs is not None:
            logging.info('We got content, write to file.')
            # exist_ok avoids the check-then-create race of a separate
            # os.path.exists() test.
            os.makedirs(config.get_directory_cache_url(), exist_ok=True)
            with open(file, mode='w', encoding='utf-8') as code:
                code.write(cvs)
            df = read_cached()
        elif os.path.exists(file):
            logging.info(
                'The %s link returned error code other than 200 but there is an already downloaded file. Try to open it.',
                link)
            df = read_cached()
        else:
            logging.warning(
                'Skipping dataset: %s. There is not downloadable URL, nor already downbloaded file.',
                link)
    elif os.path.exists(file):
        df = read_cached()
        logging.info(
            'Using file only: %s. There is not downloadable URL only just the file. Do not forget to update file manually!',
            file)
    else:
        logging.warning(
            'Cannot use download and file: %s. There is not downloadable URL, nor already downbloaded file.',
            file)
    return df
Exemple #8
0
def download_soup(link,
                  verify_link=config.get_download_verify_link(),
                  post_parm=None):
    """Download ``link`` and parse the response body with ``html.parser``.

    A GET request is sent when ``post_parm`` is ``None``; otherwise a POST
    request carrying ``post_parm`` as form data is sent.

    Args:
        link: URL to download.
        verify_link: Passed to ``requests`` as ``verify``. The default is read
            from ``config`` once, when this function is defined.
        post_parm: Data for the POST body, or ``None`` to use GET.

    Returns:
        A ``BeautifulSoup`` tree built from ``page.content`` when the status
        code is 200, otherwise ``None``. ``None`` is also returned when the
        connection fails.
    """
    try:
        if post_parm is None:
            logging.debug('Downloading without post parameters.')
            page = requests.get(link, verify=verify_link)
        else:
            logging.debug('Downloading with post parameters.')
            headers = {
                "Content-Type":
                "application/x-www-form-urlencoded; charset=UTF-8"
            }
            page = requests.post(link,
                                 verify=verify_link,
                                 data=post_parm,
                                 headers=headers)
    except requests.exceptions.ConnectionError as e:
        # Include the cause so a failed download can be diagnosed from the log.
        logging.warning('Unable to open connection. (%s)', e)
        return None
    if page.status_code != 200:
        return None
    return BeautifulSoup(page.content, 'html.parser')