Example #1
    def get_compare_dict(self):
        json_url = 'https://storage.googleapis.com/storage/v1/b/us-ed-scraping/o/compare-statistics.json?alt=media'
        json_s3_file = os.path.join(os.getenv('ED_OUTPUT_PATH'), 'tools',
                                    'stats', 's3_compare-statistics.json')
        json_local_file = os.path.join(os.getenv('ED_OUTPUT_PATH'),
                                       'statistics.json')

        try:
            req = requests.get(json_url)
            req.raise_for_status()
            with open(json_s3_file, 'wb') as json_file:
                json_file.write(req.content)
        except (requests.RequestException, OSError):
            # download failed; fall back to whatever is already on disk
            pass

        try:
            # parse the downloaded copy of the comparison statistics
            with open(json_s3_file) as json_file:
                result = json.load(json_file)
        except (FileNotFoundError, JSONDecodeError):
            # fall back to the locally generated statistics file
            try:
                with open(json_local_file) as json_file:
                    result = json.load(json_file)
            except FileNotFoundError:
                logger.error('Comparison statistics JSON not found!')
                raise

        return result
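The download-then-fallback pattern above can also be read in isolation. Below is a minimal, self-contained sketch of the same idea, assuming only the standard library plus requests; the function name fetch_json_with_fallback and its parameters are illustrative and not part of the project.

import json
import requests

def fetch_json_with_fallback(url, cache_path, local_path):
    # try to refresh the cached copy from the remote URL
    try:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        with open(cache_path, 'wb') as fh:
            fh.write(resp.content)
    except (requests.RequestException, OSError):
        pass  # keep whatever is already cached on disk

    # prefer the freshly cached remote copy, then the locally generated file
    for path in (cache_path, local_path):
        try:
            with open(path) as fh:
                return json.load(fh)
        except (FileNotFoundError, json.JSONDecodeError):
            continue
    raise FileNotFoundError('Comparison statistics JSON not found!')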
Example #2
    def __init__(self):

        logger.debug("Creating statistics...")

        # remove any previously generated excel sheet
        if os.path.exists(self.METRICS_OUTPUT_XLSX):
            os.remove(self.METRICS_OUTPUT_XLSX)

        # read the Datopian CSV into a dataframe
        try:
            self.datopian_out_df = pd.read_csv(
                os.path.join(os.getenv('ED_OUTPUT_PATH'), 'out_df.csv'),
                header=0)
        except Exception:
            logger.error(
                'Could not load the Datopian CSV, please generate it first.')

        # read the AIR CSV into a dataframe
        try:
            air_csv_url = 'https://storage.googleapis.com/storage/v1/b/us-ed-scraping/o/AIR.csv?alt=media'
            req = requests.get(air_csv_url)
            req.raise_for_status()
            air_df_path = pathlib.Path(os.getenv('ED_OUTPUT_PATH'), 'tools',
                                       'stats', 'data', 'air_df.csv')
            # make the required path/directories
            air_df_path.resolve().parent.mkdir(parents=True, exist_ok=True)
            # write the downloaded file to disk
            with open(air_df_path, 'wb') as air_df_file:
                air_df_file.write(req.content)

            self.air_out_df = pd.read_csv(air_df_path, header=0)
        except Exception:
            logger.error('Could not load the AIR CSV.')
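For reference, the same download-and-read flow can be expressed as a small standalone helper. This is only a sketch under the assumption that requests, pandas and pathlib are available as in the example above; the helper name download_csv_to_dataframe is hypothetical.

import pathlib
import pandas as pd
import requests

def download_csv_to_dataframe(url, dest_path):
    dest = pathlib.Path(dest_path).resolve()
    dest.parent.mkdir(parents=True, exist_ok=True)  # make the required directories
    resp = requests.get(url, timeout=60)
    resp.raise_for_status()
    dest.write_bytes(resp.content)  # persist the downloaded CSV to disk
    return pd.read_csv(dest, header=0)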
Example #3
def transform(name=None, input_file=None):
    """
    function is responsible for transforming raw datasets into Collections
    """

    if not name: # user has not provided a scraper name to get collections with
        logger.error('Scraper/Office name not provided. Cannot generate collections')
        sys.exit(1)
    try:
        # load the Graph representing the deduplicated scraped datasets
        GraphWrapper.load_graph(file_dir_path=Path(OUTPUT_DIR, 'graphs', name),
                                file_stem_name=f'{name}.deduplicate')
    except Exception:
        # fall back to the Graph of the scraped (non-deduplicated) datasets
        GraphWrapper.load_graph(file_dir_path=Path(OUTPUT_DIR, 'graphs', name),
                                file_stem_name=name)

    # get the loaded graph
    graph = GraphWrapper.get_graph()

    # identify collections within the graph
    identify_collections_within_graph(graph)
    # link dataset vertices to their appropriate collection(s) within the graph
    link_datasets_to_collections_in_graph(graph)
    # write the identified collections to the raw dataset files
    add_collections_to_raw_datasets(graph=graph,
                                    output_dir=OUTPUT_DIR)

    # write the graph to files
    # this method is explicitly thread/process safe, so no need for a lock
    GraphWrapper.write_graph(
        file_dir_path=Path(os.getenv('ED_OUTPUT_PATH'), 'graphs', f'{name}'),
        file_stem_name=f'{name}.collections')
    # create the page legend file for this graph
    GraphWrapper.create_graph_page_legend(
        file_dir_path=Path(os.getenv('ED_OUTPUT_PATH'), 'graphs', f'{name}'),
        file_stem_name=f'{name}.collections')

    
    # create the collections.json file
    collections_list = []  # holds the list of collections acquired from the graph

    with graph.graph_lock:
        for collection in graph.vs.select(is_collection_eq=True, name_ne='base_vertex'):
            collections_list.append({'collection_id': collection['collection_id'],
                                     'collection_title': collection['title'],
                                     'collection_url': collection['name']})

    # get a list of non-duplicate collections
    collections_list = get_distinct_collections_from(collections_list,
                                                     min_occurence_counter=1)
    # get the path where the collections will be saved on local disk
    file_output_path = f'{CURRENT_TRANSFORMER_OUTPUT_DIR}/{(name or "all")}.collections.json'
    # write the collections obtained from the 'name' scraper output to file
    h.write_file(file_output_path, collections_list)
    # upload the collections file to the S3 bucket (if configured)
    h.upload_to_s3_if_configured(file_output_path,
                                 f'{(name or "all")}.collections.json')
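The get_distinct_collections_from helper is not shown in this example. A plausible stand-in, illustrating only the idea of dropping repeated entries by collection_id, might look like the sketch below; the name distinct_collections and the min_occurrence parameter are assumptions, not project code.

from collections import Counter

def distinct_collections(collections_list, min_occurrence=1):
    # count how often each collection_id appears in the raw list
    counts = Counter(item['collection_id'] for item in collections_list)
    seen, distinct = set(), []
    for item in collections_list:
        cid = item['collection_id']
        # keep the first occurrence of ids seen at least min_occurrence times
        if counts[cid] >= min_occurrence and cid not in seen:
            seen.add(cid)
            distinct.append(item)
    return distinct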
Example #4
def read_json_file():

    try:
        with open(file_path) as json_file:
            data = json.load(json_file)
    except (OSError, json.JSONDecodeError):
        logger.error('Cannot read statistics.json file!')
        return None

    return data
Example #5
def parse(res):
    """ function parses content to create a dataset model
    or return None if no resource in content"""

    # ensure that the response text gotten is a string
    if not isinstance(getattr(res, 'text', None), str):
        return None

    try:
        soup_parser = bs4.BeautifulSoup(res.text, 'html5lib')
    except Exception:
        return None

    # check if the page links to any resource file
    if soup_parser.body.find(name='a',
                             href=base_parser.resource_checker,
                             recursive=True) is None:
        # no resource on this page, so return None
        return None

    # if code gets here, at least one resource was found

    # check if the parser is working on OSERS web page
    if soup_parser.body.find(name='div', id='maincontent',
                             recursive=True) is not None:
        # parse the page with the parser and return result
        return parsers.parser1.parse(res)

    # check if the parser is working on OCTAE web page (variant 2)
    if soup_parser.body.select_one('.headersLevel1') is not None:
        # parse the page with the parser and return result
        return parsers.parser2.parse(res)

    # page does not match any known structure
    logger.error('Page does not fit any known structure:')
    logger.error(res)

    return None
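The two structure checks above can be exercised on sample markup. In the driver below the HTML snippet and the resource_checker regex are assumptions made for the demonstration; the real base_parser.resource_checker is not shown in this example.

import re
import bs4

# hypothetical stand-in for base_parser.resource_checker
resource_checker = re.compile(r'\.(pdf|xls|xlsx|csv|zip|doc|docx)$', re.I)

html = ('<html><body><div id="maincontent">'
        '<a href="report.pdf">Annual report</a></div></body></html>')
soup = bs4.BeautifulSoup(html, 'html5lib')

has_resource = soup.body.find(name='a', href=resource_checker) is not None
is_parser1_layout = soup.body.find(name='div', id='maincontent') is not None
print(has_resource, is_parser1_layout)  # True True -> parser1 would handle this page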
Example #6
def transform(name, input_file=None):
    if input_file is None:
        file_list = traverse_output(name)
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except Exception:
            logger.warning(
                f'Cannot read from list of output files at {input_file}, falling back to all collected data!'
            )
            file_list = traverse_output(name)

    logger.debug(f'{len(file_list)} files to transform.')

    catalog = Catalog()
    catalog.catalog_id = "datopian_data_json_" + (name or 'all')

    # keep track of stats for the items transformed
    datasets_number = 0
    resources_number = 0
    sources_number = 0
    collections_number = 0

    # loop through the list of filepaths to be transformed
    for file_path in file_list:

        data = read_file(file_path)
        if not data:
            continue

        dataset = _transform_scraped_dataset(data, name)

        if not dataset:  # no dataset was returned (i.e. dataset probably marked for removal)
            continue

        catalog.datasets.append(dataset)

        datasets_number += 1
        resources_number += len(dataset.distribution)

    # TODO WORK FROM BELOW HERE
    # get the list of Sources for this catalog
    catalog_sources = list()
    try:
        # read the list of preprocessed (but still 'raw') Sources from file
        catalog_sources = read_file(
            f"{h.get_output_path('sources')}/{(name or 'all')}.sources.json")
        # transform the list of preprocessed Sources to a list of Source objects acceptable for the catalog object
        catalog_sources = _transform_preprocessed_sources(catalog_sources)
    except Exception:
        logger.warning(
            f'"sources transformer" output file ({(name or "all")}.sources.json) not found. This datajson output will have no "source" field'
        )

    # add the list of Source objects to the catalog
    catalog.sources = catalog_sources or []
    # update the number of transformed Sources
    sources_number = len(catalog_sources or [])

    # get the list of Collections for this catalog
    catalog_collections = list()
    try:
        # read the list of preprocessed (but still 'raw') Collections from file
        catalog_collections = read_file(
            f"{h.get_output_path('collections')}/{(name or 'all')}.collections.json"
        )
        # transform the list of preprocessed Collections to a list of Collection objects acceptable for the catalog object
        catalog_collections = _transform_preprocessed_collections(
            catalog_collections)
    except Exception:
        logger.warning(
            f'"collections transformer" output file ({(name or "all")}.collections.json) not found. This datajson output will have no "collection" field'
        )

    # add the list of Collection objects to the catalog
    catalog.collections = catalog_collections or []
    # update the number of transformed Collections
    collections_number = len(catalog_collections or [])

    # validate the catalog object
    if not catalog.validate_catalog(pls_fix=True):
        logger.error(f"catalog validation Failed! Ending transform process")
        return

    logger.debug('{} Sources transformed.'.format(sources_number))
    logger.debug('{} Collections transformed.'.format(collections_number))
    logger.debug('{} datasets transformed.'.format(datasets_number))
    logger.debug('{} resources transformed.'.format(resources_number))

    output_path = h.get_output_path('datajson')
    file_path = os.path.join(output_path, f'{(name or "all")}.data.json')
    with open(file_path, 'w') as output:
        output.write(catalog.dump())
        logger.debug(f'Output file: {file_path}')

    h.upload_to_s3_if_configured(file_path, f'{(name or "all")}.data.json')
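The loop above depends on a read_file() helper whose implementation is not included in this example. A minimal stand-in, assuming the raw dataset files are JSON, could look like this:

import json

def read_file(file_path):
    """Return the parsed JSON content of file_path, or None if it cannot be read."""
    try:
        with open(file_path) as fp:
            return json.load(fp)
    except (OSError, json.JSONDecodeError):
        return None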
Example #7
import os
import sys
import json

from edscrapers.cli import logger

OUTPUT_PATH = os.getenv('ED_OUTPUT_PATH')
try:
    file_path = os.path.join(OUTPUT_PATH, 'statistics.json')
except TypeError:
    logger.error('ED_OUTPUT_PATH env var not set!')
    sys.exit(1)

def read_json_file():

    try:
        with open(file_path) as json_file:
            data = json.load(json_file)
    except (OSError, json.JSONDecodeError):
        logger.error('Cannot read statistics.json file!')
        return None

    return data

def get_stats():
    return read_json_file()

def get_total_datasets_number():
    data = read_json_file()
    return data['total']['datopian']['datasets']

def get_total_datasets_data():