Code Example #1
def purge_EIN_duplicates(df, client, collection, dupe_collection):
    """Move rows whose EIN already exists in `collection` into `dupe_collection`."""
    found_duplicates = []
    for i in range(len(df)):
        EIN = int(df.loc[i, 'EIN'])
        if prevent_IRS_EIN_duplicates(EIN, client, collection):
            found_duplicates.append(i)
    duplicate_df = df.loc[found_duplicates].reset_index(drop=True)
    print('inserting tmpIRS dupes into the dupe collection')
    insert_services(duplicate_df.to_dict('records'), client, dupe_collection)
    df = df.drop(found_duplicates).reset_index(drop=True)
    return df
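
The helpers this example leans on (prevent_IRS_EIN_duplicates, insert_services) are defined elsewhere in the project. Below is a minimal usage sketch with plausible stand-ins for both; every database, collection, and sample value is a placeholder, not the project's real configuration. Note that client is indexed directly by collection name throughout these examples, so it behaves like a pymongo Database handle despite the naming.

import pandas as pd
from pymongo import MongoClient

def prevent_IRS_EIN_duplicates(EIN, client, collection):
    # Plausible stand-in: True when this EIN is already stored.
    return client[collection].count_documents({'EIN': EIN}) > 0

def insert_services(records, client, collection):
    # Plausible stand-in for the project's bulk insert helper.
    if records:
        client[collection].insert_many(records)

db = MongoClient('localhost', 27017)['services_db']
df = pd.DataFrame({'EIN': [123456789, 987654321], 'name': ['Org A', 'Org B']})
df = purge_EIN_duplicates(df, db, 'services', 'service_dupes')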
Code Example #2
def main(client, check_collection, dump_collection, dupe_collection):
    scraped_update_date = scrape_updated_date()
    print(str(scraped_update_date))
    try:
        stored_update_date = client['data-sources'].find_one(
            {"name": "hud_pit_hic_data"})['last_updated']
        stored_update_date = datetime.strptime(str(stored_update_date),
                                               '%Y-%m-%d %H:%M:%S').date()
        print(stored_update_date)
        site_has_new_date = check_site_for_new_date(stored_update_date)
        print(site_has_new_date)
        if not site_has_new_date:
            print('No new update detected. Exiting script...')
            return
    except KeyError:
        # No stored record for this source yet; fall through and do a full ingest
        print('Key Error')
    df = grab_data()
    print('purging duplicates from existing HUD collection')
    if client[dump_collection].estimated_document_count() > 0:
        # The purge step is currently disabled; df passes through unchanged
        # until purge_HUD_duplicates is re-enabled.
        df = df  # purge_HUD_duplicates(df, client, dump_collection, dupe_collection)
    if client[check_collection].estimated_document_count() == 0:
        # No need to check for duplicates in an empty collection
        insert_services(df.to_dict('records'), client, dump_collection)
    else:
        print('refreshing ngrams')
        # refresh_ngrams(client, check_collection)
        found_duplicates = []
        print('checking for duplicates in the services collection')
        for i in tqdm(range(len(df))):
            dc = locate_potential_duplicate(df.loc[i, 'Project Name'],
                                            df.loc[i, 'zip'], client,
                                            check_collection)
            if dc is not False:
                if check_similarity(df.loc[i, 'Project Name'], dc):
                    found_duplicates.append(i)
        duplicate_df = df.loc[found_duplicates].reset_index(drop=True)
        print(
            f'inserting {duplicate_df.shape[0]} services dupes into the dupe collection'
        )
        if len(duplicate_df) > 0:
            insert_services(duplicate_df.to_dict('records'), client,
                            dupe_collection)
        df = df.drop(found_duplicates).reset_index(drop=True)
        print(f'final df shape: {df.shape}')
        if len(df) > 0:
            insert_services(df.to_dict('records'), client, dump_collection)
        print('updating scraped update date in data-sources collection')
        try:
            # Record the new date under the same data-sources entry and format
            # that the check at the top of this function reads back.
            client['data-sources'].update_one(
                {"name": "hud_pit_hic_data"}, {
                    '$set': {
                        'last_updated': str(scraped_update_date)
                    }
                })
        except errors.OperationFailure as e:
            print(e)
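
Example #2 hinges on check_site_for_new_date, which is not shown. Judging from how it is called, a plausible shape is sketched below, assuming scrape_updated_date returns a datetime.date; treat it purely as a reading aid, not the project's actual helper.

from datetime import date

def check_site_for_new_date(stored_update_date: date) -> bool:
    # Assumed semantics: True when the site advertises an update date newer
    # than the one stored in the data-sources collection.
    return scrape_updated_date() > stored_update_date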
Code Example #3
def main_scraper(client: MongoClient, scraper_config: ScraperConfig):
    """Base function for ingesting raw data, preparing it and depositing it in MongoDB

    Args:
        client (MongoClient): connection to the MongoDB instance
        scraper_config (ScraperConfig): instance of the ScraperConfig class
    """
    df = grab_data(scraper_config)
    if client[scraper_config.dump_collection].estimated_document_count() > 0:
        print('purging duplicates from the existing dump collection')
        df = scraper_config.purge_collection_duplicates(df)
    if client[scraper_config.check_collection].estimated_document_count() == 0:
        # No need to check for duplicates in an empty collection
        insert_services(df.to_dict('records'), client,
                        scraper_config.dump_collection)
    else:
        print('refreshing ngrams')
        # refresh_ngrams(client, check_collection)
        found_duplicates = []
        print('checking for duplicates in the services collection')
        for i in tqdm(range(len(df))):
            dc = locate_potential_duplicate(df.loc[i, 'name'],
                                            df.loc[i, 'zip'], client,
                                            scraper_config.check_collection)
            if dc is not False:
                if check_similarity(df.loc[i, 'name'], dc):
                    found_duplicates.append(i)
        duplicate_df = df.loc[found_duplicates].reset_index(drop=True)
        print(
            f'inserting {duplicate_df.shape[0]} services dupes into the dupe collection'
        )
        if len(duplicate_df) > 0:
            insert_services(duplicate_df.to_dict('records'), client,
                            scraper_config.dupe_collection)
        df = df.drop(found_duplicates).reset_index(drop=True)
        print(f'final df shape: {df.shape}')
        if len(df) > 0:
            insert_services(df.to_dict('records'), client,
                            scraper_config.dump_collection)
        print('updating scraped update date in data-sources collection')
        try:
            client['data-sources'].update_one(
                {"name": scraper_config.data_source_collection_name}, {
                    '$set': {
                        'last_scraped':
                        datetime.strftime(datetime.now(), '%m/%d/%Y')
                    }
                },
                upsert=True)
        except errors.OperationFailure as e:
            print(e)
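
main_scraper only reads a handful of attributes from scraper_config, so a minimal stand-in can be inferred. The dataclass below is a hypothetical reconstruction for illustration only; the real ScraperConfig almost certainly carries more fields for grab_data to consume.

from dataclasses import dataclass
import pandas as pd

@dataclass
class ScraperConfig:
    check_collection: str
    dump_collection: str
    dupe_collection: str
    data_source_collection_name: str

    def purge_collection_duplicates(self, df: pd.DataFrame) -> pd.DataFrame:
        # Placeholder; the real method drops rows already present in the
        # dump collection, mirroring purge_EIN_duplicates in example #1.
        return df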
Code Example #4
def main(config, client, check_collection, dump_collection, dupe_collection):
    scraped_update_date = scrape_updated_date()
    try:
        stored_update_date = client['data-sources'].find_one(
            {"name": "irs_exempt_organizations"})['last_updated']
        stored_update_date = datetime.strptime(str(stored_update_date),
                                               '%Y-%m-%d %H:%M:%S').date()
        if not check_site_for_new_date(stored_update_date):
            print('No new update detected. Exiting script...')
            return
    except KeyError:
        # No stored record for this source yet; proceed with a full ingest
        pass
    print('updating scraped update date in data-sources collection')
    client['data-sources'].update_one(
        {"name": "irs_exempt_organizations"},
        {'$set': {
            'last_updated': str(scraped_update_date)
        }})
    code_dict = config['NTEE_codes']
    df = grab_data(config, code_dict)
    print('purging EIN duplicates')
    if client[dump_collection].estimated_document_count() > 0:
        df = purge_EIN_duplicates(df, client, dump_collection, dupe_collection)
    if client[check_collection].estimated_document_count() == 0:
        # No need to check for duplicates in an empty collection
        insert_services(df.to_dict('records'), client, dump_collection)
    else:
        print('refreshing ngrams')
        refresh_ngrams(client, check_collection)
        found_duplicates = []
        print('checking for duplicates in the services collection')
        for i in tqdm(range(len(df))):
            dc = locate_potential_duplicate(df.loc[i, 'name'],
                                            df.loc[i, 'zip'], client,
                                            check_collection)
            if dc is not False:
                if check_similarity(df.loc[i, 'name'], dc):
                    found_duplicates.append(i)
        duplicate_df = df.loc[found_duplicates].reset_index(drop=True)
        print(
            f'inserting {duplicate_df.shape[0]} services dupes into the dupe collection'
        )
        if len(duplicate_df) > 0:
            insert_services(duplicate_df.to_dict('records'), client,
                            dupe_collection)
        df = df.drop(found_duplicates).reset_index(drop=True)
        print(f'final df shape: {df.shape}')
        if len(df) > 0:
            insert_services(df.to_dict('records'), client, dump_collection)
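
The duplicate detection in examples #2 through #4 pairs locate_potential_duplicate, an ngram-backed candidate lookup (note the refresh_ngrams call before each check loop), with check_similarity as the final gate. Neither helper is shown; one plausible implementation of the gate, using fuzzy token matching, is sketched below. The library choice and the 75% threshold are assumptions.

from thefuzz import fuzz

def check_similarity(new_name, candidate_name, threshold=75):
    # Token-sort ratio is word-order-insensitive, so 'Food Bank of Austin' and
    # 'Austin Food Bank' score high; returns True at or above the threshold.
    return fuzz.token_sort_ratio(new_name, candidate_name) >= threshold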