Example #1
 def __init__(self, File_save):
     self._config = config()['web_sites']['OSHA']
     self._queries = self._config['queries']
     self._url = self._config['url']
     self.file_save = File_save
     self._now = datetime.datetime.now().strftime('%m_%d_%Y')
     self._dir_path = os.path.dirname(os.path.realpath(__file__))
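Every example in this listing leans on a small config() helper. A minimal sketch of what it might look like, assuming the settings live in a YAML file and are cached after the first read (the file name, the caching, and yaml.safe_load are assumptions, not part of the examples):

import yaml

__config = None


def config(config_file='config.yaml'):
    # Read the YAML settings once, then reuse the cached dictionary on later calls.
    global __config
    if not __config:
        with open(config_file, mode='r') as f:
            __config = yaml.safe_load(f)
    return __config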
Example #2
 def __init__(self, Chemicals, File_save):
     self._config = config()['web_sites']['IFA']
     self._queries = self._config['queries']
     self._url = self._config['url']
     self._existing, self.chemicals = checking_existing_chemicals_in_outfile(File_save, Chemicals)
     self.file_save = File_save
     self._now = datetime.datetime.now().strftime('%m_%d_%Y')
Example #3
 def __init__(self, investment_site_uid):
     self.site = investment_site_uid
     self._config = config()['investment_sites'][investment_site_uid]
     self._credentials = credentials()['investment_sites'][investment_site_uid]
     self._browser = None
     self._home = "{}".format(self._config['url'])
Example #4
 def __init__(self, Chemicals, File_save):
     self._config = config()['web_sites']['NIST']
     self._queries = self._config['queries']
     self._url = self._config['url']
     self._existing, self.chemicals = checking_existing_chemicals_in_outfile(
         File_save, Chemicals)
     self.file_save = File_save
     self._now = datetime.datetime.now().strftime('%m_%d_%Y')
     self._dir_path = os.path.dirname(os.path.realpath(__file__))
Example #5
 def __init__(self, headless):
     options = Options()
     if headless:
         options.add_argument('--headless')
         options.add_argument('--no-sandbox')
         options.add_argument('--disable-dev-shm-usage')
     self._browser = webdriver.Chrome(config()['driver']['path'],
                                      chrome_options=options)
     self._browser.implicitly_wait(10)
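A short usage sketch for a wrapper built around this constructor; the class name Browser is an assumption (the snippet does not show it), and the chrome_options= keyword is the Selenium 3 spelling, later versions take options= instead:

browser = Browser(headless=True)              # Chrome starts with no visible window
browser._browser.get('https://example.com')   # the wrapped driver is a regular WebDriver
print(browser._browser.title)
browser._browser.quit()                       # release the Chrome process when done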
Example #6
 def __init__(self, year, Files):
     self.year = year
     self._dir_path = os.path.dirname(os.path.realpath(__file__)) # Working Directory
     self._config = config()['web_sites']['TRI']
     self._queries = self._config['queries']
     self._url = self._config['url'] # Uniform Resource Locator (URL) of TRI Database
     self._TRI_File_Columns_Dictionary = {} # TRI File Formats
     for File in Files:
         self._TRI_File_Columns_Dictionary[File] = []
Example #7
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logger.info(f'Beginning scraper for {host}')
    homepage = news.HomePage(news_site_uid, host)
    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)
        if article:
            logger.info('Article fetched!!')
            articles.append(article)
    _save_articles(news_site_uid, articles)
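_save_articles is called above but not shown; a plausible sketch that dumps the fetched articles into a dated CSV file (the file name pattern and the attribute-based columns are assumptions):

import csv
import datetime


def _save_articles(news_site_uid, articles):
    # One row per article; the columns come from the article object's public attributes.
    now = datetime.datetime.now().strftime('%Y_%m_%d')
    out_file_name = f'{news_site_uid}_{now}_articles.csv'
    csv_headers = [prop for prop in dir(articles[0]) if not prop.startswith('_')]
    with open(out_file_name, mode='w+', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(csv_headers)
        for article in articles:
            writer.writerow([str(getattr(article, header)) for header in csv_headers])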
Example #8
    def __init__(self, Year):

        # The specification can be found at:
        # https://rcrainfopreprod.epa.gov/rcrainfo-help/application/publicHelp/index.htm
        # Date: 3/17/2020

        # List of tables:
        ### BR_REPORTING_2001
        ### BR_REPORTING_2003
        ### BR_REPORTING_2005
        ### BR_REPORTING_2007
        ### BR_REPORTING_2009
        ### BR_REPORTING_2011
        ### BR_REPORTING_2013
        ### BR_REPORTING_2015
        ### BR_REPORTING_2017

        self._dir_path = os.path.dirname(
            os.path.realpath(__file__))  # Working Directory
        self.Year = Year
        self._config = config()['web_sites']['RCRAInfo']
        self._queries = self._config['queries']
        self._url = self._config[
            'url']  # Uniform Resource Locator (URL) of RCRAInfo Database
Example #9
 def __init__(self):
     uri = "mongodb+srv://{}:{}@{}/test?retryWrites=true".format(
         user, password, host)
     client = pymongo.MongoClient(uri)
     self._db = client[config()['mongodb']['db']['name']]
Example #10
import pymongo

from extract.common import credentials
from extract.common import config

host = config()['mongodb']['host']
user = credentials()['mongodb']['user']
password = credentials()['mongodb']['password']


class SaveProjects(object):
    def __init__(self):
        uri = "mongodb+srv://{}:{}@{}/test?retryWrites=true".format(
            user, password, host)
        client = pymongo.MongoClient(uri)
        self._db = client[config()['mongodb']['db']['name']]

    def save(self, projects):
        self._db.project.insert_many(projects)
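A short usage sketch for SaveProjects; the document fields below are made up for illustration, since insert_many only needs an iterable of dictionaries:

projects = [
    {'name': 'project-a', 'url': 'https://example.com/a'},
    {'name': 'project-b', 'url': 'https://example.com/b'},
]
SaveProjects().save(projects)  # each dictionary becomes one MongoDB document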
Example #11
import yaml
import logging
logging.basicConfig(level=logging.INFO)
import subprocess

from extract.common import config


logger = logging.getLogger(__name__)

# Extract the list of news site names from the config file
address = 'extract/config.yaml'
news_sites_uids = list(config(address)['news_sites'].keys())

def main():
	_create()
	_extract()
	_transform()
	_load()

# Build the dictionary with the name, url, and queries of each news site, export it to a .yaml file, and move it to the extract folder
def _create():
	logger.info('Starting create process')
	subprocess.run(['python', 'main.py'], cwd='./create')
	subprocess.run(['mv', 'config.yaml', '../extract/config.yaml'], cwd='./create')
	

# Extract all the information from the news sites, export it by name, and move it to the transform folder
def _extract():
	global news_sites_uids
	logger.info('Starting extract process for {}'.format(news_sites_uids))
Example #12
 def __init__(self, news_site_uid, url):
     self._config = config()['news_sites'][news_site_uid]
     self._queries = self._config['queries']
     self._html = None
     self._url = url
     self._visit(self._url)
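The _visit call is referenced by this constructor but not included in the snippet; a plausible sketch using requests and BeautifulSoup (both libraries, and the parser choice, are assumptions):

 def _visit(self, url):
     # Assumes `import requests` and `import bs4` at module level.
     response = requests.get(url)
     response.raise_for_status()  # raise on 4xx/5xx instead of parsing an error page
     self._html = bs4.BeautifulSoup(response.text, 'html.parser')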
Example #13
 def __init__(self):
     self._config = config()['web_sites']['FRS']
     self._dir_path = os.path.dirname(os.path.realpath(__file__)) # Working Directory
Example #14
def _fetch_article(news_site_uid, host, link):
    article = None
    try:
        article = news.ArticlePage(news_site_uid, _build_link(host, link))
    except (HTTPError, MaxRetryError, DecodeError, ContentDecodingError,
            TimeoutError, NewConnectionError, ConnectionError):
        logger.warning('Error while fetching the article', exc_info=False)
    if article and not article.body:
        logger.warning('Invalid article. There is no body')
        return None
    return article


def _build_link(host, link):
    if is_well_formed_link.match(link):
        return link
    elif is_root_path.match(link):
        return f'{host}{link}'
    else:
        return '{host}/{uri}'.format(host=host, uri=link)
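_build_link depends on two compiled patterns that this snippet does not define; a plausible pair of definitions (the exact expressions are an assumption):

import re

# A complete URL, e.g. https://example.com/some-article
is_well_formed_link = re.compile(r'^https?://.+/.+$')
# A site-relative path, e.g. /some-article
is_root_path = re.compile(r'^/.+$')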


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    news_site_choice = list(config()['news_sites'].keys())
    parser.add_argument('news_site',
                        help='The news site that you want to scrape',
                        type=str,
                        choices=news_site_choice)
    args = parser.parse_args()
    _news_scraper(args.news_site)