def purge_EIN_duplicates(df, client, collection, dupe_collection):
    """Remove rows whose EIN already exists in `collection`.

    Rows flagged as duplicates are copied into `dupe_collection` and
    dropped from the returned DataFrame.

    Args:
        df: DataFrame containing an 'EIN' column.
        client: MongoDB client instance.
        collection: name of the collection checked for existing EINs.
        dupe_collection: name of the collection that receives duplicates.

    Returns:
        DataFrame with the duplicate-EIN rows removed and the index reset.
    """
    found_duplicates = []
    for i in range(len(df)):
        EIN = int(df.loc[i, 'EIN'])
        if prevent_IRS_EIN_duplicates(EIN, client, collection):
            found_duplicates.append(i)
    duplicate_df = df.loc[found_duplicates].reset_index(drop=True)
    # Only touch the dupe collection when there is actually something to
    # insert — the original called insert_services unconditionally, unlike
    # the `if len(duplicate_df) > 0` guard used everywhere else in this file.
    if len(duplicate_df) > 0:
        print('inserting tmpIRS dupes into the dupe collection')
        insert_services(duplicate_df.to_dict('records'), client,
                        dupe_collection)
    df = df.drop(found_duplicates).reset_index(drop=True)
    return df
def main(client, check_collection, dump_collection, dupe_collection):
    """Ingest HUD PIT/HIC data, dedupe it, and deposit it in MongoDB.

    Exits early when the site has not published a dataset newer than the
    date stored in the 'data-sources' tracking collection.

    Args:
        client: MongoDB client instance.
        check_collection: collection scanned for near-duplicate services.
        dump_collection: destination collection for new records.
        dupe_collection: collection that receives detected duplicates.
    """
    scraped_update_date = scrape_updated_date()
    print(str(scraped_update_date))
    try:
        stored_update_date = client['data-sources'].find_one(
            {"name": "hud_pit_hic_data"})['last_updated']
        stored_update_date = datetime.strptime(str(stored_update_date),
                                               '%Y-%m-%d %H:%M:%S').date()
        print(stored_update_date)
        print(check_site_for_new_date(stored_update_date))
        if not check_site_for_new_date(stored_update_date):
            print('No new update detected. Exiting script...')
            return
    except (KeyError, TypeError, ValueError):
        # TypeError: find_one returned None (no tracking doc yet; the
        # original only caught KeyError, so a missing doc crashed here).
        # ValueError: stored date not in the expected format.
        # Either way, treat as "never scraped" and continue.
        print('Key Error')
    df = grab_data()
    print('purging duplicates from existing HUD collection')
    if client[dump_collection].estimated_document_count() > 0:
        # HUD purge currently disabled; the original had a no-op `df = df`.
        # purge_HUD_duplicates(df, client, dump_collection, dupe_collection)
        pass
    if client[check_collection].estimated_document_count() == 0:
        # No need to check for duplicates in an empty collection
        insert_services(df.to_dict('records'), client, dump_collection)
    else:
        print('refreshing ngrams')
        # refresh_ngrams(client, check_collection)
        found_duplicates = []
        print('checking for duplicates in the services collection')
        for i in tqdm(range(len(df))):
            dc = locate_potential_duplicate(df.loc[i, 'Project Name'],
                                            df.loc[i, 'zip'], client,
                                            check_collection)
            if dc is not False:
                if check_similarity(df.loc[i, 'Project Name'], dc):
                    found_duplicates.append(i)
        duplicate_df = df.loc[found_duplicates].reset_index(drop=True)
        print(
            f'inserting {duplicate_df.shape[0]} services dupes into the dupe collection'
        )
        if len(duplicate_df) > 0:
            insert_services(duplicate_df.to_dict('records'), client,
                            dupe_collection)
        df = df.drop(found_duplicates).reset_index(drop=True)
        print(f'final df shape: {df.shape}')
        if len(df) > 0:
            insert_services(df.to_dict('records'), client, dump_collection)
    print('updating scraped update date in data-sources collection')
    try:
        # BUG FIX: previously wrote to client['data_sources'] (underscore)
        # with name "irs_exempt_organizations" — a different collection and
        # document than the 'data-sources' / "hud_pit_hic_data" record read
        # above, so the stored date was never actually refreshed.
        # NOTE(review): the '%m/%d/%Y' format written here does not match the
        # '%Y-%m-%d %H:%M:%S' format parsed above — confirm intended storage
        # format; ValueError is tolerated on read as a fallback.
        client['data-sources'].update_one(
            {"name": "hud_pit_hic_data"}, {
                '$set': {
                    'last_updated':
                    datetime.strftime(scraped_update_date, '%m/%d/%Y')
                }
            })
    except errors.OperationFailure as e:
        print(e)
def main_scraper(client: MongoClient, scraper_config: ScraperConfig):
    """Base function for ingesting raw data, preparing it and depositing it in MongoDB

    Args:
        client (MongoClient): connection to the MongoDB instance
        scraper_config (ScraperConfig): instance of the ScraperConfig class
    """
    df = grab_data(scraper_config)
    if client[scraper_config.dump_collection].estimated_document_count() > 0:
        print('purging duplicates from existing CareerOneStop collection')
        df = scraper_config.purge_collection_duplicates(df)
    if client[scraper_config.check_collection].estimated_document_count() == 0:
        # Empty check collection: every scraped row is new by definition.
        insert_services(df.to_dict('records'), client,
                        scraper_config.dump_collection)
    else:
        print('refreshing ngrams')
        # refresh_ngrams(client, check_collection)
        dupe_rows = []
        print('checking for duplicates in the services collection')
        for row_idx in tqdm(range(len(df))):
            candidate = locate_potential_duplicate(
                df.loc[row_idx, 'name'], df.loc[row_idx, 'zip'], client,
                scraper_config.check_collection)
            # A row is a duplicate only if a candidate was located AND it is
            # sufficiently similar by name.
            if candidate is not False and check_similarity(
                    df.loc[row_idx, 'name'], candidate):
                dupe_rows.append(row_idx)
        duplicate_df = df.loc[dupe_rows].reset_index(drop=True)
        print(
            f'inserting {duplicate_df.shape[0]} services dupes into the dupe collection'
        )
        if len(duplicate_df) > 0:
            insert_services(duplicate_df.to_dict('records'), client,
                            scraper_config.dupe_collection)
        df = df.drop(dupe_rows).reset_index(drop=True)
        print(f'final df shape: {df.shape}')
        if len(df) > 0:
            insert_services(df.to_dict('records'), client,
                            scraper_config.dump_collection)
    print('updating scraped update date in data-sources collection')
    try:
        client['data-sources'].update_one(
            {"name": scraper_config.data_source_collection_name}, {
                '$set': {
                    'last_scraped':
                    datetime.strftime(datetime.now(), '%m/%d/%Y')
                }
            },
            upsert=True)
    except errors.OperationFailure as e:
        print(e)
def main(config, client, check_collection, dump_collection, dupe_collection):
    """Ingest IRS exempt-organization data, dedupe it, and store it in MongoDB.

    Exits early when the site has not published a dataset newer than the
    date stored in the 'data-sources' tracking collection.

    Args:
        config: parsed configuration; must contain an 'NTEE_codes' mapping.
        client: MongoDB client instance.
        check_collection: collection scanned for near-duplicate services.
        dump_collection: destination collection for new records.
        dupe_collection: collection that receives detected duplicates.
    """
    scraped_update_date = scrape_updated_date()
    try:
        stored_update_date = client['data-sources'].find_one(
            {"name": "irs_exempt_organizations"})['last_updated']
        stored_update_date = datetime.strptime(str(stored_update_date),
                                               '%Y-%m-%d %H:%M:%S').date()
        # BUG FIX: the condition was inverted relative to the HUD scraper's
        # main() (`if check_site_for_new_date(...)`), which made this script
        # exit exactly when a new update WAS available and re-scrape when
        # nothing was new.
        if not check_site_for_new_date(stored_update_date):
            print('No new update detected. Exiting script...')
            return
    except (KeyError, TypeError, ValueError):
        # TypeError: find_one returned None (no tracking doc yet).
        # ValueError: stored date not in the expected format.
        # Either way, treat as "never scraped" and continue.
        pass
    print('updating scraped update date in data-sources collection')
    # BUG FIX: previously wrote to client['data_sources'] (underscore), a
    # different collection than the 'data-sources' read above, so the stored
    # date was never actually refreshed.
    # NOTE(review): str(scraped_update_date) does not match the
    # '%Y-%m-%d %H:%M:%S' format parsed above — confirm intended storage
    # format; ValueError is tolerated on read as a fallback.
    client['data-sources'].update_one(
        {"name": "irs_exempt_organizations"},
        {'$set': {
            'last_updated': str(scraped_update_date)
        }})
    code_dict = config['NTEE_codes']
    df = grab_data(config, code_dict)
    print('purging EIN duplicates')
    if client[dump_collection].estimated_document_count() > 0:
        df = purge_EIN_duplicates(df, client, dump_collection,
                                  dupe_collection)
    if client[check_collection].estimated_document_count() == 0:
        # No need to check for duplicates in an empty collection
        insert_services(df.to_dict('records'), client, dump_collection)
    else:
        print('refreshing ngrams')
        refresh_ngrams(client, check_collection)
        found_duplicates = []
        print('checking for duplicates in the services collection')
        for i in tqdm(range(len(df))):
            dc = locate_potential_duplicate(df.loc[i, 'name'],
                                            df.loc[i, 'zip'], client,
                                            check_collection)
            if dc is not False:
                if check_similarity(df.loc[i, 'name'], dc):
                    found_duplicates.append(i)
        duplicate_df = df.loc[found_duplicates].reset_index(drop=True)
        print(
            f'inserting {duplicate_df.shape[0]} services dupes into the dupe collection'
        )
        if len(duplicate_df) > 0:
            insert_services(duplicate_df.to_dict('records'), client,
                            dupe_collection)
        df = df.drop(found_duplicates).reset_index(drop=True)
        print(f'final df shape: {df.shape}')
        if len(df) > 0:
            insert_services(df.to_dict('records'), client, dump_collection)