Example #1
def add_collections_to_raw_datasets(graph=GraphWrapper.graph,
                                    output_dir=OUTPUT_DIR):
    """ function writes the collections which have been identified in `graph`
    to their associated raw dataset json files.
    This function updates the raw dataset json files by
    adding a `collection` field to the json structure """

    with graph.graph_lock:
        # select the dataset vertices from the graph
        try:
            dataset_vertex_seq = graph.vs.select(is_dataset_eq=True,
                                                 name_ne='base_vertex',
                                                 in_collection_ne=None)
        except Exception:  # the graph has no matching dataset vertices/attributes
            dataset_vertex_seq = []
                                 
        for dataset in dataset_vertex_seq:
            # read the raw dataset
            data = h.read_file(Path(output_dir, dataset['name']))
            if not data:
                continue
            # update the raw dataset
            data['collection'] = dataset['in_collection']
            # write the updated raw dataset back to file
            h.write_file(Path(output_dir, dataset['name']), data)
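A minimal usage sketch for the function above, assuming the module-level GraphWrapper, OUTPUT_DIR and Path names used in these examples are in scope and that the loaded graph already carries the in_collection attributes; the 'edgov' scraper name and the '.collections' graph stem are illustrative assumptions, not part of the original code.

# usage sketch (illustrative: 'edgov' scraper name, '.collections' graph stem)
GraphWrapper.load_graph(file_dir_path=Path(OUTPUT_DIR, 'graphs', 'edgov'),
                        file_stem_name='edgov.collections')
add_collections_to_raw_datasets(graph=GraphWrapper.get_graph(),
                                output_dir=OUTPUT_DIR)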
Example #2
def transform(name=None, input_file=None):
    """
    function is responsible for transforming raw datasets into Collections
    """

    if not name: # user has not provided a scraper name to get collections with
        logger.error('Scraper/Office name not provided. Cannot generate collections')
        sys.exit(1)
    try:
        # load the Graph representing the deduplicated scraped datasets
        GraphWrapper.load_graph(file_dir_path=Path(OUTPUT_DIR, 'graphs', name),
                                file_stem_name=f'{name}.deduplicate')
    except Exception:
        # fall back to the Graph representing the scraped datasets
        GraphWrapper.load_graph(file_dir_path=Path(OUTPUT_DIR, 'graphs', name),
                                file_stem_name=name)
        
    # get the loaded graph
    graph = GraphWrapper.get_graph()

    # identify collections within the graph
    identify_collections_within_graph(graph)
    # link dataset vertices to their appropriate collection(s) within the graph
    link_datasets_to_collections_in_graph(graph)
    # write the identified collections to the raw dataset files
    add_collections_to_raw_datasets(graph=graph,
                                    output_dir=OUTPUT_DIR)

    # write the graph to files
    # this method is explicitly thread/process safe, so no need for lock
    GraphWrapper.write_graph(file_dir_path=Path(os.getenv('ED_OUTPUT_PATH'),
                                                "graphs", f"{name}"),
                             file_stem_name=f'{name}.collections')
    # create the page legend file for this graph
    GraphWrapper.create_graph_page_legend(file_dir_path=Path(os.getenv('ED_OUTPUT_PATH'),
                                                             "graphs", f"{name}"),
                                          file_stem_name=f'{name}.collections')

    
    # create the collections.json file                                      
    collections_list = [] # holds the list of collections acquired from graph

    with graph.graph_lock:
        for collection in graph.vs.select(is_collection_eq=True, name_ne='base_vertex'):
            collections_list.append({'collection_id': collection['collection_id'],
                                     'collection_title': collection['title'],
                                      'collection_url': collection['name']})
    
    # get a list of non-duplicate collections
    collections_list = get_distinct_collections_from(collections_list,
                                                     min_occurence_counter=1)
    # get the path where the collections will be saved on local disk
    file_output_path = f'{CURRENT_TRANSFORMER_OUTPUT_DIR}/{(name or "all")}.collections.json'
    # write the collections gotten from the 'name' scraper output to file
    h.write_file(file_output_path, collections_list)
    # upload the collections file for the 'name' scraper output to the S3 bucket
    h.upload_to_s3_if_configured(file_output_path,
                                 f'{(name or "all")}.collections.json')
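The de-duplication helper get_distinct_collections_from is not shown in these examples; below is a hypothetical, self-contained sketch of what such a helper might do, keeping one entry per collection_id and honouring the min_occurence_counter threshold. The real implementation may differ.

from collections import Counter

def get_distinct_collections_from_sketch(collections_list, min_occurence_counter=1):
    """Hypothetical sketch: keep one entry per collection_id, dropping ids that
    occur fewer than `min_occurence_counter` times in the input list."""
    id_counts = Counter(c['collection_id'] for c in collections_list)
    seen_ids = set()
    distinct_collections = []
    for collection in collections_list:
        collection_id = collection['collection_id']
        if id_counts[collection_id] < min_occurence_counter or collection_id in seen_ids:
            continue  # below the occurrence threshold, or already kept
        seen_ids.add(collection_id)
        distinct_collections.append(collection)
    return distinct_collections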
Example #3
def add_sources_to_collections_json(name,
                                    graph=GraphWrapper.graph,
                                    output_dir=OUTPUT_DIR):
    """ function writes the sources which have been identified in `graph`
    to their associated collections.json and raw dataset json files. 
    This function updates the json files by
    adding a `source` field to the `collection` key within the json structure """

    # get the collection datajson
    collections_json = h.read_file(
        Path(output_dir, 'transformers/collections',
             f'{name}.collections.json'))

    with graph.graph_lock:
        # select all collection vertices within the graph
        collection_vertex_seq = graph.vs.select(is_collection_eq=True,
                                                name_ne='base_vertex')
        for collection in collection_vertex_seq:
            # get the collections within the collections.json that match this collection vertex
            collection_json_list = [
                collection_obj for collection_obj in collections_json
                if collection_obj['collection_id'] == collection['collection_id']
            ]
            # if no collection returned from the datajson, skip this collection vertex
            if len(collection_json_list) == 0:
                continue
            # assign the source info from the collection vertex to the collection json
            collection_json_list[0]['source'] = collection['in_source']
        # write the updated collection datajson back to file
        h.write_file(
            Path(output_dir, 'transformers/collections',
                 f'{name}.collections.json'), collections_json)

        # update the source info for each raw dataset i.e. each dataset json file
        # select the dataset vertices from the graph
        try:
            dataset_vertex_seq = graph.vs.select(is_dataset_eq=True,
                                                 name_ne='base_vertex',
                                                 in_collection_ne=None)
        except Exception:  # the graph has no matching dataset vertices/attributes
            dataset_vertex_seq = []

        for dataset_vertex in dataset_vertex_seq:
            # read the raw dataset
            data = h.read_file(Path(output_dir, dataset_vertex['name']))
            if not data:
                continue
            # update the raw dataset collection field & source sub-field
            data['collection'] = dataset_vertex['in_collection']
            # write the updated raw dataset back to file
            h.write_file(Path(output_dir, dataset_vertex['name']), data)
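For orientation, a hypothetical shape of a raw dataset json after the collections and sources transformers above have run, reconstructed from the docstrings; all field values below are illustrative, not taken from real output.

example_raw_dataset = {                              # hypothetical example only
    'name': 'edgov/some_dataset.json',               # illustrative dataset file name
    'title': 'Some dataset title',
    'resources': [],
    'collection': {                                  # added by the collections transformer
        'collection_id': 'abc123',                   # illustrative id
        'collection_title': 'Some collection',
        'source': {'source_id': 'xyz789'},           # `source` field added by this transformer
    },
}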
Example #4
def transform(name=None, input_file=None):
    """
    function is responsible for cleaning and enriching raw datasets
    (privacy/removal marking, title cleanup, level of data, groups)
    """

    if input_file is None:
        file_list = h.traverse_output(name)
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except Exception:
            logger.warning(
                f'Cannot read from list of output files at {input_file}, falling back to all collected data!'
            )
            file_list = h.traverse_output(name)

    # loop through filepath in file list
    for file_path in file_list:
        # read the json data in each filepath
        data = h.read_file(file_path)
        if not data:  # if data is None
            continue
        # mark as private datasets that have certain keywords in their data
        data = _mark_private(data,
                             search_words=[
                                 'conference', 'awards', 'user guide',
                                 'applications'
                             ])

        # mark for removal datasets that have certain keywords
        data = _remove_dataset(
            data, search_words=['photo', 'foto', 'photos', 'fotos'])

        # REMOVE UNWANTED STRING FROM THE VALUE OF A DATASET'S KEY
        # 1. remove 'table [0-9].' from beginning of dataset title
        data = _strip_unwanted_string(data,
                                      r'^table [0-9a-z]+(-?[a-z])?\.',
                                      dict_key='title')

        # set the 'level of data' for the dataset
        data = _set_dataset_level_of_data(data)

        # assign the dataset to groups
        # according to https://www2.ed.gov/rschstat/catalog/index.html
        data = _set_dataset_groups(data)

        # remove the old format for collections / sources
        data = _remove_old_sources_collections(data)

        # write modified dataset back to file
        h.write_file(file_path, data)
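The _strip_unwanted_string helper is not shown in these examples; below is a hypothetical, self-contained sketch of a regex-based implementation consistent with the call above. The case-insensitive matching is an assumption; the real helper may behave differently.

import re

def _strip_unwanted_string_sketch(data, pattern, dict_key='title'):
    """Hypothetical sketch: strip a leading `pattern` match from data[dict_key]."""
    value = data.get(dict_key)
    if isinstance(value, str):
        # assumption: titles may be capitalized, so match case-insensitively
        data[dict_key] = re.sub(pattern, '', value, flags=re.IGNORECASE).strip()
    return data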
Example #5
def transform(name=None, input_file=None):
    """
    function is responsible for transofrming raw datasets into Sources
    """

    if input_file is None:  # no input file specified
        file_list = h.traverse_output(
            name)  # run through all the files in 'name' directory
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except Exception:
            logger.warning(
                f'Cannot read from list of output files at {input_file}, falling back to all collected data!'
            )
            file_list = h.traverse_output(name)

    # holds the list of sources acquired from the 'name' scraper directory
    sources_list = []
    # loop through filepath in file list
    for file_path in file_list:
        # read the json data in each filepath
        data = h.read_file(file_path)
        if not data:  # if data is None
            continue

        # retrieve source from dataset
        source = extract_source_from(dataset=data, use_key='collection')
        if not source:  # source could not be retrieved
            continue
        # add source to list
        sources_list.append(source)

    # get a list of non-duplicate Sources
    sources_list = get_distinct_sources_from(sources_list,
                                             min_occurence_counter=2)
    # get the path where the Sources will be saved on local disk
    file_output_path = f'{CURRENT_TRANSFORMER_OUTPUT_DIR}/{(name or "all")}.sources.json'
    # write the Sources gotten from the 'name' scraper output to file
    h.write_file(file_output_path, sources_list)
    # upload the Sources file for the 'name' scraper output to the S3 bucket
    h.upload_to_s3_if_configured(file_output_path,
                                 f'{(name or "all")}.sources.json')
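extract_source_from is likewise not shown; below is a hypothetical sketch based on the Example #3 docstring, which attaches a `source` field under the dataset's `collection` key. The exact structure of the real data may differ.

def extract_source_from_sketch(dataset, use_key='collection'):
    """Hypothetical sketch: return the `source` info attached to the dataset's
    `collection` field by the collections/sources transformers, or None."""
    collection = dataset.get(use_key)
    if isinstance(collection, dict):
        return collection.get('source') or None
    return None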
Example #6
def transform(name=None, input_file=None):
    """
    function is responsible for cleaning and filtering raw datasets
    (resource filtering, scraper-specific fixes, privacy/removal marking)
    """

    if input_file is None:
        file_list = h.traverse_output(name)
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except Exception:
            logger.warning(f'Cannot read from list of output files at {input_file}, falling back to all collected data!')
            file_list = h.traverse_output(name)
    
    # loop through filepath in file list
    for file_path in file_list:
        # read the json data in each filepath
        data = h.read_file(file_path)
        if not data:  # if data is None
            continue

        # mark for removal datasets that only have txt resources
        if _dataset_only_has_txt_resources(data):
            clean_data = {}
            clean_data['_remove_dataset'] = True # mark dataset for removal
            data['_clean_data'] = clean_data # update dataset

        # mark for removal datasets with no resources or no relevant resources
        if not data['resources'] or not _filter_resources_list(data['resources']):
            clean_data = {}
            clean_data['_remove_dataset'] = True # mark dataset for removal
            data['_clean_data'] = clean_data # update dataset

        # Special hacks for ed.gov data
        if name == 'edgov':
            clean_data = {}
            clean_data['_remove_dataset'] = False # unmark dataset for removal

            # # Get the publisher name
            # try:
            #     publisher_name = data['publisher'].get('name')
            # except:
            #     publisher_name = data['publisher']

            # Check for "bad" URLs and remove them
            bad_subdomains = ['dashboard', 'rems']
            if any(f'{bs}.ed.gov' in data['source_url'] for bs in bad_subdomains):
                clean_data['_remove_dataset'] = True # mark dataset for removal

            data['_clean_data'] = clean_data # update dataset

        # OESE hack. Remove datasets outside oese.ed.gov domain
        publisher = data.get('publisher')
        publisher_name = ""

        if isinstance(publisher, dict):
            publisher_name = publisher.get('name')
        elif isinstance(publisher, str):
            publisher_name = publisher

        if publisher_name in ['oese',
                              'Office of Elementary and Secondary Education',
                              'Office of Elementary and Secondary Education (OESE)']:
            if _dataset_outside_oese_domain(data):
                clean_data = {}
                clean_data['_remove_dataset'] = True # mark dataset for removal
                data['_clean_data'] = clean_data # update dataset

        # Remove duplicate identifiers generated by duplicate URLs in IES/NCES
        if publisher_name in ['ies',
                              'Institute of Education Sciences (IES)',
                              'National Center for Education Statistics (NCES)',
                              'nces']:
            if data.get('source_url', '').endswith('current=yes'):
                clean_data = data.get('_clean_data', {})
                clean_data['_remove_dataset'] = True # mark dataset for removal
                data['_clean_data'] = clean_data # update dataset

        # Filter resources
        data = _filter_dataset_resources(data)

        # mark as private datasets that have certain keywords in their data
        data = _mark_private(data, search_words=['conference', 'awards',
                                                 'user guide', 'applications'])

        # mark for removal datasets that have certain keywords
        data = _remove_dataset(data, search_words=['photo', 'foto', 'photos', 'fotos'])

        # REMOVE UNWANTED STRING FROM THE VALUE OF A DATASET'S KEY
        # 1. remove 'table [0-9].' from beginning of dataset title
        data = _strip_unwanted_string(data, r'^table [0-9a-z]+(-?[a-z])?\.',
                                      dict_key='title')

        # set the 'level of data' for the dataset
        data = _set_dataset_level_of_data(data)

        # assign the dataset to groups
        # according to https://www2.ed.gov/rschstat/catalog/index.html
        data = _set_dataset_groups(data)
      
        # remove the old format for collections / sources
        data = _remove_old_sources_collections(data)
        
        # write modified dataset back to file
        h.write_file(file_path, data)
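_dataset_only_has_txt_resources is not shown in these examples; below is a hypothetical, self-contained sketch of the "only txt resources" check used in Example #6. How a resource's format is actually recorded is an assumption here.

def _dataset_only_has_txt_resources_sketch(data):
    """Hypothetical sketch: True when every resource looks like a plain .txt file."""
    resources = data.get('resources') or []
    if not resources:
        return False
    return all(
        str(resource.get('format', '')).strip().lower() == 'txt'
        or str(resource.get('url', '')).lower().endswith('.txt')
        for resource in resources
    )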