def add_collections_to_raw_datasets(graph=GraphWrapper.graph, output_dir=OUTPUT_DIR):
    """ writes the collections which have been identified in `graph` to their
    associated raw dataset json files.

    This function updates the raw dataset json files by adding a
    `collection` field to the json structure """

    with graph.graph_lock:
        # select the dataset vertices (that belong to a collection) from the graph
        try:
            dataset_vertex_seq = graph.vs.select(is_dataset_eq=True,
                                                 name_ne='base_vertex',
                                                 in_collection_ne=None)
        except Exception:
            dataset_vertex_seq = []

        for dataset in dataset_vertex_seq:
            # read the raw dataset
            data = h.read_file(Path(output_dir, dataset['name']))
            if not data:
                continue
            # update the raw dataset
            data['collection'] = dataset['in_collection']
            # write the updated raw dataset back to file
            h.write_file(Path(output_dir, dataset['name']), data)
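
# For reference (not executed code): the effect of add_collections_to_raw_datasets()
# on a single raw dataset json file. The exact shape of the `in_collection` vertex
# attribute is whatever the collection-linking step stored on the vertex; the value
# shown below is only a placeholder.
#
#   before: {"title": "...", "resources": [...]}
#   after:  {"title": "...", "resources": [...], "collection": <in_collection value>}
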
def transform(name=None, input_file=None): """ function is responsible for transforming raw datasets into Collections """ if not name: # user has not provided a scraper name to get collections with logger.error('Scraper/Office name not provided. Cannot generate collections') sys.exit(1) try: # load the Graph representing the deduplicated scraped datasets GraphWrapper.load_graph(file_dir_path=Path(OUTPUT_DIR, 'graphs', name), file_stem_name=f'{name}.deduplicate') except: # load the Graph representing the scraped datasets GraphWrapper.load_graph(file_dir_path=Path(OUTPUT_DIR, 'graphs', name), file_stem_name=name) # get the loaded graph graph = GraphWrapper.get_graph() # identify collections within the graph identify_collections_within_graph(graph) # link dataset vertices to their appropriate collection(s) within the graph link_datasets_to_collections_in_graph(graph) # write the identified collections to the raw dataset files add_collections_to_raw_datasets(graph=graph, output_dir=OUTPUT_DIR) # write the graph to files # this method is explicitly thread/proccess safe, so no need for lock GraphWrapper.write_graph(file_dir_path=Path(os.getenv('ED_OUTPUT_PATH'), "graphs", f"{name}"), file_stem_name=f'{name}.collections') # create the page legend file for this graph GraphWrapper.create_graph_page_legend(file_dir_path=Path(os.getenv('ED_OUTPUT_PATH'), "graphs", f"{name}"), file_stem_name=f'{name}.collections') # create the collections.json file collections_list = [] # holds the list of collections acquired from graph with graph.graph_lock: for collection in graph.vs.select(is_collection_eq=True, name_ne='base_vertex'): collections_list.append({'collection_id': collection['collection_id'], 'collection_title': collection['title'], 'collection_url': collection['name']}) # get a list of non-duplicate collections collections_list = get_distinct_collections_from(collections_list, min_occurence_counter=1) # get the path were the gotten Collections will be saved to on local disk file_output_path = f'{CURRENT_TRANSFORMER_OUTPUT_DIR}/{(name or "all")}.collections.json' # write to file the collections gotten from 'name' scraped output h.write_file(file_output_path, collections_list) # write file the collections gotten from 'name' scraped out to S3 bucket h.upload_to_s3_if_configured(file_output_path, f'{(name or "all")}.collections.json')
def add_sources_to_collections_json(name, graph=GraphWrapper.graph, output_dir=OUTPUT_DIR):
    """ writes the sources which have been identified in `graph` to their
    associated collections.json and raw dataset json files.

    This function updates the json files by adding a `source` field to the
    `collection` key within the json structure """

    # get the collections datajson
    collections_json = h.read_file(
        Path(output_dir, 'transformers/collections', f'{name}.collections.json'))

    with graph.graph_lock:
        # select all collection vertices within the graph
        collection_vertex_seq = graph.vs.select(is_collection_eq=True,
                                                name_ne='base_vertex')

        for collection in collection_vertex_seq:
            # get the list of collections within the collections.json that match this collection vertex
            collection_json_list = list(
                filter(lambda collection_obj,
                       compare_collection_id=collection['collection_id']:
                       collection_obj['collection_id'] == compare_collection_id,
                       collections_json))
            # if no collection was returned from the datajson, skip this collection vertex
            if len(collection_json_list) == 0:
                continue
            # assign the source info from the collection vertex to the collection json
            collection_json_list[0]['source'] = collection['in_source']

        # write the updated collections datajson back to file
        h.write_file(
            Path(output_dir, 'transformers/collections', f'{name}.collections.json'),
            collections_json)

        # update the source info for each raw dataset i.e. each dataset json file
        # select the dataset vertices from the graph
        try:
            dataset_vertex_seq = graph.vs.select(is_dataset_eq=True,
                                                 name_ne='base_vertex',
                                                 in_collection_ne=None)
        except Exception:
            dataset_vertex_seq = []

        for dataset_vertex in dataset_vertex_seq:
            # read the raw dataset
            data = h.read_file(Path(output_dir, dataset_vertex['name']))
            if not data:
                continue
            # update the raw dataset collection field & source sub-field
            data['collection'] = dataset_vertex['in_collection']
            # write the updated raw dataset back to file
            h.write_file(Path(output_dir, dataset_vertex['name']), data)
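
# Note on the filter() above: binding collection['collection_id'] through the lambda's
# default argument (compare_collection_id=...) makes each comparison use that vertex's
# own id rather than a late-bound variable. An equivalent, arguably clearer form
# (a sketch only, not a change to the code above) would be:
#
#     collection_json_list = [obj for obj in collections_json
#                             if obj['collection_id'] == collection['collection_id']]
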
def transform(name=None, input_file=None):

    if input_file is None:
        file_list = h.traverse_output(name)
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except Exception:
            logger.warning(
                f'Cannot read from list of output files at {input_file}, falling back to all collected data!')
            file_list = h.traverse_output(name)

    # loop through each filepath in the file list
    for file_path in file_list:
        # read the json data in each filepath
        data = h.read_file(file_path)
        if not data:  # if data is None
            continue

        # mark as private datasets that have certain keywords in their data
        data = _mark_private(data, search_words=[
            'conference', 'awards', 'user guide', 'applications'])

        # mark for removal datasets that have certain keywords
        data = _remove_dataset(data, search_words=['photo', 'foto', 'photos', 'fotos'])

        # REMOVE UNWANTED STRING FROM THE VALUE OF A DATASET'S KEY
        # 1. remove 'table [0-9].' from beginning of dataset title
        data = _strip_unwanted_string(data, r'^table [0-9a-z]+(-?[a-z])?\.',
                                      dict_key='title')

        # set the 'level of data' for the dataset
        data = _set_dataset_level_of_data(data)

        # assign the dataset to groups
        # according to https://www2.ed.gov/rschstat/catalog/index.html
        data = _set_dataset_groups(data)

        # remove the old format for collections / sources
        data = _remove_old_sources_collections(data)

        # write the modified dataset back to file
        h.write_file(file_path, data)
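
# Illustration (not executed code) of the title-stripping step above. It assumes
# _strip_unwanted_string() applies the given regex to data[dict_key] with re.sub()
# and case-insensitive matching; the real helper may behave slightly differently.
#
#     import re
#     re.sub(r'^table [0-9a-z]+(-?[a-z])?\.', '',
#            'Table 2-a. Enrollment in public schools', flags=re.IGNORECASE)
#     # -> ' Enrollment in public schools'  (leading whitespace handling is up to the helper)
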
def transform(name=None, input_file=None): """ function is responsible for transofrming raw datasets into Sources """ if input_file is None: # no input file specified file_list = h.traverse_output( name) # run through all the files in 'name' directory else: try: with open(input_file, 'r') as fp: file_list = [line.rstrip() for line in fp] except: logger.warning( f'Cannot read from list of output files at {input_file}, falling back to all collected data!' ) file_list = h.traverse_output(name) sources_list = [ ] # holds the list of sources acquired from 'name' scraper directory # loop through filepath in file list for file_path in file_list: # read the json data in each filepath data = h.read_file(file_path) if not data: # if data is None continue # retrieve source from dataset source = extract_source_from(dataset=data, use_key='collection') if not source: # source could not be retrieved continue # add source to list sources_list.append(source) # get a list of non-duplicate Sources sources_list = get_distinct_sources_from(sources_list, min_occurence_counter=2) # get the path were the gotten Sources will be saved to on local disk file_output_path = f'{CURRENT_TRANSFORMER_OUTPUT_DIR}/{(name or "all")}.sources.json' # write to file the Sources gotten from 'name' scraped output h.write_file(file_output_path, sources_list) # write file the Sources gotten from 'name' scraped out to S3 bucket h.upload_to_s3_if_configured(file_output_path, f'{(name or "all")}.sources.json')
def transform(name=None, input_file=None):

    if input_file is None:
        file_list = h.traverse_output(name)
    else:
        try:
            with open(input_file, 'r') as fp:
                file_list = [line.rstrip() for line in fp]
        except Exception:
            logger.warning(
                f'Cannot read from list of output files at {input_file}, falling back to all collected data!')
            file_list = h.traverse_output(name)

    # loop through each filepath in the file list
    for file_path in file_list:
        # read the json data in each filepath
        data = h.read_file(file_path)
        if not data:  # if data is None
            continue

        # skip datasets that have only txt resources
        if _dataset_only_has_txt_resources(data):
            clean_data = {}
            clean_data['_remove_dataset'] = True  # mark dataset for removal
            data['_clean_data'] = clean_data  # update dataset

        # remove datasets with no resources or no relevant resources
        if not len(_filter_resources_list(data['resources'])) or not len(data['resources']):
            clean_data = {}
            clean_data['_remove_dataset'] = True  # mark dataset for removal
            data['_clean_data'] = clean_data  # update dataset

        # Special hacks for ed.gov data
        if name == 'edgov':
            clean_data = {}
            clean_data['_remove_dataset'] = False  # unmark dataset for removal

            # # Get the publisher name
            # try:
            #     publisher_name = data['publisher'].get('name')
            # except:
            #     publisher_name = data['publisher']

            # Check for "bad" URLs and remove them
            bad_subdomains = ['dashboard', 'rems']
            if any(f'{bs}.ed.gov' in data['source_url'] for bs in bad_subdomains):
                clean_data['_remove_dataset'] = True  # mark dataset for removal

            data['_clean_data'] = clean_data  # update dataset

        # OESE hack: remove datasets outside the oese.ed.gov domain
        publisher = data.get('publisher')
        publisher_name = ""
        if isinstance(publisher, dict):
            publisher_name = publisher.get('name')
        elif isinstance(publisher, str):
            publisher_name = publisher

        if publisher_name in ['oese',
                              'Office of Elementary and Secondary Education',
                              'Office of Elementary and Secondary Education (OESE)']:
            if _dataset_outside_oese_domain(data):
                clean_data = {}
                clean_data['_remove_dataset'] = True  # mark dataset for removal
                data['_clean_data'] = clean_data  # update dataset

        # Remove duplicate identifiers generated by duplicate URLs in IES/NCES
        if publisher_name in ['ies', 'Institute of Education Sciences (IES)',
                              'National Center for Education Statistics (NCES)', 'nces']:
            if data.get('source_url', '').endswith('current=yes'):
                clean_data = data.get('_clean_data', {})  # reuse any existing flags
                clean_data['_remove_dataset'] = True  # mark dataset for removal
                data['_clean_data'] = clean_data  # update dataset

        # filter resources
        data = _filter_dataset_resources(data)

        # mark as private datasets that have certain keywords in their data
        data = _mark_private(data, search_words=['conference', 'awards', 'user guide', 'applications'])

        # mark for removal datasets that have certain keywords
        data = _remove_dataset(data, search_words=['photo', 'foto', 'photos', 'fotos'])

        # REMOVE UNWANTED STRING FROM THE VALUE OF A DATASET'S KEY
        # 1. remove 'table [0-9].' from beginning of dataset title
        data = _strip_unwanted_string(data, r'^table [0-9a-z]+(-?[a-z])?\.', dict_key='title')

        # set the 'level of data' for the dataset
        data = _set_dataset_level_of_data(data)

        # assign the dataset to groups
        # according to https://www2.ed.gov/rschstat/catalog/index.html
        data = _set_dataset_groups(data)

        # remove the old format for collections / sources
        data = _remove_old_sources_collections(data)

        # write the modified dataset back to file
        h.write_file(file_path, data)
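
# Downstream steps are expected to read the '_clean_data' flags set by transform()
# above. The helper below is a hypothetical illustration of that contract (it is not
# part of this transformer, and the project's real consumer may differ).
def _example_is_marked_for_removal(dataset: dict) -> bool:
    """ hypothetical sketch: report whether transform() marked `dataset` for removal """
    clean_data = dataset.get('_clean_data', {})
    return bool(clean_data.get('_remove_dataset', False))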