Esempio n. 1
0
def purge_EIN_duplicates(df, client, collection, dupe_collection):
    """Split off rows whose EIN already exists in ``collection``.

    Rows flagged by ``prevent_IRS_EIN_duplicates`` are written to
    ``dupe_collection``; the returned DataFrame contains only the rest.

    Args:
        df: pre-processed IRS data with an 'EIN' column
        client: MongoDB connection instance
        collection: collection checked for pre-existing EINs
        dupe_collection: collection that receives the duplicate rows

    Returns:
        DataFrame with the duplicate rows removed and the index reset.
    """
    dupe_rows = [
        idx for idx in range(len(df))
        if prevent_IRS_EIN_duplicates(int(df.loc[idx, 'EIN']), client,
                                      collection)
    ]
    dupes = df.loc[dupe_rows].reset_index(drop=True)
    logger.info('inserting tmpIRS dupes into the dupe collection')
    insert_services(dupes.to_dict('records'), client, dupe_collection)
    return df.drop(dupe_rows).reset_index(drop=True)
Esempio n. 2
0
def test_fuzzy_match_with_st_saint_discrepancy(
        example_IRS_service_data,
        example_IRS_search_object_with_spelled_out_saint, mock_mongo_client):
    """Fuzzy lookup against the mock client must raise NotImplementedError."""
    shelter_db = mock_mongo_client.shelter
    insert_services(example_IRS_service_data, shelter_db, 'tmpIRS')
    refresh_ngrams(shelter_db, 'tmpIRS')
    search_obj = example_IRS_search_object_with_spelled_out_saint
    # The mock backend cannot run the text-search pipeline; if this stopped
    # raising we could drop the real connection used by the other tests.
    with pytest.raises(NotImplementedError):
        locate_potential_duplicate(search_obj['name'], search_obj['zip'],
                                   shelter_db, 'tmpIRS')
Esempio n. 3
0
def test_fuzzy_match(example_IRS_service_data,
                     example_IRS_search_object_with_spelled_out_saint,
                     mock_config_object):
    """End-to-end fuzzy match: spelled-out 'Saint' should hit the 'ST' record."""
    conn = get_mongo_client()
    # Start from a clean scratch collection every run.
    if 'pytest_fuzzy_test' in conn.list_collection_names():
        conn.drop_collection('pytest_fuzzy_test')
    conn.create_collection('pytest_fuzzy_test')
    insert_services(example_IRS_service_data, conn, 'pytest_fuzzy_test')
    refresh_ngrams(conn, 'pytest_fuzzy_test')
    search_obj = example_IRS_search_object_with_spelled_out_saint
    match = locate_potential_duplicate(search_obj['name'], search_obj['zip'],
                                       conn, 'pytest_fuzzy_test')
    conn.drop_collection('pytest_fuzzy_test')
    assert match == 'ST FERIOLE ISLAND PARK'
Esempio n. 4
0
def test_fuzzy_match(example_IRS_service_data,
                     example_IRS_search_object_with_spelled_out_saint,
                     mock_config_object):
    """End-to-end fuzzy match against the live shelter cluster.

    The search object with a spelled-out 'Saint' must match the stored
    'ST FERIOLE ISLAND PARK' record.
    """
    from urllib.parse import quote_plus  # local: keeps the test self-contained

    # os.environ[...] fails fast with a clear KeyError when a credential is
    # missing; os.environ.get(...) returned None and crashed with an opaque
    # TypeError during concatenation. quote_plus percent-escapes characters
    # that would otherwise corrupt the connection URI.
    username = quote_plus(os.environ['DBUSERNAME'])
    password = quote_plus(os.environ['PW'])
    client = MongoClient(
        "mongodb+srv://" + username + ":" + password +
        "@shelter-rm3lc.azure.mongodb.net/shelter?retryWrites=true&w=majority"
    )['shelter']
    # Start from a clean scratch collection every run.
    if 'pytest_fuzzy_test' in client.list_collection_names():
        client.drop_collection('pytest_fuzzy_test')
    client.create_collection('pytest_fuzzy_test')
    insert_services(example_IRS_service_data, client, 'pytest_fuzzy_test')
    refresh_ngrams(client, 'pytest_fuzzy_test')
    name = example_IRS_search_object_with_spelled_out_saint['name']
    zip_code = example_IRS_search_object_with_spelled_out_saint['zip']
    dc = locate_potential_duplicate(name, zip_code, client,
                                    'pytest_fuzzy_test')
    client.drop_collection('pytest_fuzzy_test')
    assert dc == 'ST FERIOLE ISLAND PARK'
Esempio n. 5
0
    def purge_collection_duplicates(self, df: pd.DataFrame,
                                    client: MongoClient) -> pd.DataFrame:
        """Drop rows that already exist verbatim in the dump collection.

        Args:
            df (pd.DataFrame): pre-processed data from grab_data()
            client (MongoClient): MongoDB connection instance

        Returns:
            pd.DataFrame: DataFrame free of exact duplicates
        """
        dump_coll = client[self.dump_collection]
        dupe_rows = []
        for row in tqdm(range(len(df))):
            key = df.loc[row, self.collection_dupe_field]
            # A non-None hit means this record is already stored.
            if dump_coll.find_one({self.collection_dupe_field:
                                   key}) is not None:
                dupe_rows.append(row)
        dupes = df.loc[dupe_rows].reset_index(drop=True)
        insert_services(dupes.to_dict('records'), client,
                        self.dupe_collection)
        return df.drop(dupe_rows).reset_index(drop=True)
Esempio n. 6
0
def main(config, client, check_collection, dump_collection, dupe_collection):
    """Scrape the IRS exempt-organization data, de-duplicate it and store it.

    Args:
        config: scraper configuration; must contain the 'NTEE_codes' mapping
        client: MongoDB connection instance
        check_collection: canonical services collection to fuzzy-match against
        dump_collection: destination collection for the cleaned records
        dupe_collection: destination collection for detected duplicates
    """
    scraped_update_date = scrape_updated_date()
    try:
        stored_update_date = client['data-sources'].find_one(
            {"name": "irs_exempt_organizations"})['last_updated']
        stored_update_date = datetime.strptime(str(stored_update_date),
                                               '%Y-%m-%d %H:%M:%S').date()
        if check_site_for_new_date(stored_update_date):
            logging.info('No new update detected. Exiting script...')
            return
    except KeyError:
        # No stored record/field yet -- treat as a first run and scrape fully.
        pass
    logging.info('updating scraped update date in data-sources collection')
    # BUG FIX: this previously wrote to 'data_sources' (underscore) while the
    # check above reads from 'data-sources' (hyphen), so the stored date was
    # never refreshed. upsert=True creates the tracking document on the very
    # first run instead of silently matching nothing.
    client['data-sources'].update_one(
        {"name": "irs_exempt_organizations"},
        {'$set': {
            'last_updated': str(scraped_update_date)
        }},
        upsert=True)
    code_dict = config['NTEE_codes']
    df = grab_data(config, code_dict)
    logging.info('purging EIN duplicates')
    if client[dump_collection].estimated_document_count() > 0:
        df = purge_EIN_duplicates(df, client, dump_collection, dupe_collection)
    if client[check_collection].estimated_document_count() == 0:
        # No need to check for duplicates in an empty collection
        insert_services(df.to_dict('records'), client, dump_collection)
    else:
        logging.info('refreshing ngrams')
        refresh_ngrams(client, check_collection)
        found_duplicates = []
        logging.info('checking for duplicates in the services collection')
        for i in tqdm(range(len(df))):
            dc = locate_potential_duplicate(df.loc[i, 'name'],
                                            df.loc[i, 'zip'], client,
                                            check_collection)
            # False signals "no candidate"; anything else is a candidate name.
            if dc is not False:
                if check_similarity(df.loc[i, 'name'], dc):
                    found_duplicates.append(i)
        duplicate_df = df.loc[found_duplicates].reset_index(drop=True)
        logging.info(
            f'inserting {duplicate_df.shape[0]} services dupes into the dupe collection'
        )
        if len(duplicate_df) > 0:
            insert_services(duplicate_df.to_dict('records'), client,
                            dupe_collection)
        df = df.drop(found_duplicates).reset_index(drop=True)
        logging.info(f'final df shape: {df.shape}')
        if len(df) > 0:
            insert_services(df.to_dict('records'), client, dump_collection)
Esempio n. 7
0
    def main_scraper(self, client: MongoClient) -> None:
        """Ingest raw data, strip duplicates and deposit the rest in MongoDB.

        Args:
            client (MongoClient): connection to the MongoDB instance
        """
        df = self.grab_data()

        # Remove exact duplicates against our own dump collection first.
        if client[self.dump_collection].estimated_document_count() > 0:
            logging.info(
                f'purging duplicates from existing {self.source} collection')
            df = self.purge_collection_duplicates(df, client)

        if client[self.check_collection].estimated_document_count() == 0:
            # No need to check for duplicates in an empty collection
            insert_services(df.to_dict('records'), client,
                            self.dump_collection)
            return

        logging.info('refreshing ngrams')
        refresh_ngrams(client, self.check_collection)
        dupe_rows = []
        logging.info('checking for duplicates in the services collection')
        for row in tqdm(range(len(df))):
            candidate = locate_potential_duplicate(df.loc[row, 'name'],
                                                   df.loc[row, 'zip'], client,
                                                   self.check_collection)
            # False means no candidate; short-circuit keeps the original
            # call order of the similarity check.
            if candidate is not False and check_similarity(
                    df.loc[row, 'name'], candidate):
                dupe_rows.append(row)
        dupes = df.loc[dupe_rows].reset_index(drop=True)
        if len(dupes) > 0:
            logging.info(
                f'inserting services dupes into the {self.source} dupe collection'
            )
            insert_services(dupes.to_dict('records'), client,
                            self.dupe_collection)
        df = df.drop(dupe_rows).reset_index(drop=True)
        logging.info(f'final df shape: {df.shape}')
        if len(df) > 0:
            insert_services(df.to_dict('records'), client,
                            self.dump_collection)
            logging.info(
                'updating scraped update date in data-sources collection')
            client['data_sources'].update_one(
                {"name": self.data_source_collection_name}, {
                    '$set': {
                        'last_updated':
                        datetime.strftime(datetime.now(), '%m/%d/%Y')
                    }
                })
Esempio n. 8
0
def main(config, client, check_collection, dump_collection, dupe_collection):
    """Scrape the IRS exempt-organization data, de-duplicate it and store it.

    Args:
        config: scraper configuration; must contain the 'NTEE_codes' mapping
        client: MongoDB connection instance
        check_collection: canonical services collection to fuzzy-match against
        dump_collection: destination collection for the cleaned records
        dupe_collection: destination collection for detected duplicates
    """
    scraped_update_date = scrape_updated_date()
    try:
        # BUG FIX: this previously called retrieve_last_scraped_date(date),
        # handing the helper the module-level `date` class instead of the
        # MongoDB connection it expects (its test calls it with a client).
        stored_update_date = retrieve_last_scraped_date(client)
        if stored_update_date and scraped_update_date <= stored_update_date:
            logger.info('No new update detected. Exiting script...')
            return
    except KeyError:
        # No stored record/field yet -- treat as a first run and scrape fully.
        pass
    logger.info('updating last scraped date in data-sources collection')
    client['data-sources'].update_one({"name": "irs"}, {
        '$set': {
            'last_scraped':
            datetime.now(timezone('UTC')).replace(microsecond=0).isoformat()
        }
    },
                                      upsert=True)
    code_dict = config['NTEE_codes']
    df = grab_data(config, code_dict)
    logger.info('purging EIN duplicates')
    if client[dump_collection].estimated_document_count() > 0:
        df = purge_EIN_duplicates(df, client, dump_collection, dupe_collection)
    if client[check_collection].estimated_document_count() == 0:
        # No need to check for duplicates in an empty collection
        insert_services(df.to_dict('records'), client, dump_collection)
    else:
        logger.info('refreshing ngrams')
        refresh_ngrams(client, check_collection)
        found_duplicates = []
        logger.info('checking for duplicates in the services collection')
        for i in tqdm(range(len(df))):
            dc = locate_potential_duplicate(df.loc[i, 'name'],
                                            df.loc[i, 'zip'], client,
                                            check_collection)
            # False signals "no candidate"; anything else is a candidate name.
            if dc is not False:
                if check_similarity(df.loc[i, 'name'], dc):
                    found_duplicates.append(i)
        duplicate_df = df.loc[found_duplicates].reset_index(drop=True)
        logger.info(
            f'inserting {duplicate_df.shape[0]} services dupes into the dupe collection'
        )
        if len(duplicate_df) > 0:
            insert_services(duplicate_df.to_dict('records'), client,
                            dupe_collection)
        df = df.drop(found_duplicates).reset_index(drop=True)
        logger.info(f'final df shape: {df.shape}')
        if len(df) > 0:
            insert_services(df.to_dict('records'), client, dump_collection)
Esempio n. 9
0
def test_insert_services(example_IRS_service_data, mock_mongo_client):
    """Inserted records should round-trip unchanged from the collection."""
    db = mock_mongo_client.shelter
    insert_services(example_IRS_service_data, db, 'tmpIRS')
    stored = list(db.tmpIRS.find())
    assert example_IRS_service_data == stored
Esempio n. 10
0
def test_retrieve_last_scraped_date(
    mock_mongo_client, example_data_source_collection, base_scraper_fixture
):
    """retrieve_last_scraped_date should return a plain datetime.date."""
    insert_services(example_data_source_collection, mock_mongo_client.shelter, 'data-sources')
    stored_date = base_scraper_fixture.retrieve_last_scraped_date(mock_mongo_client.shelter)
    # Exact-type identity check on purpose: datetime.datetime subclasses
    # datetime.date, so isinstance() would wrongly accept a full datetime
    # here. `is` (not `==`) is the idiomatic comparison for type objects.
    assert type(stored_date) is datetime.date
Esempio n. 11
0
    def main_scraper(self, client: MongoClient) -> None:
        """Base function for ingesting raw data, preparing it and depositing it in MongoDB

        Pipeline: bail out when the source has nothing new, pull and clean
        the raw data, strip exact then fuzzy duplicates, insert the rest and
        record the scrape timestamp.

        Args:
            client (MongoClient): connection to the MongoDB instance
        """
        # Skip the whole run when the source has published nothing new.
        if not self.is_new_data_available(client):
            logger.info('No new data. Goodbye...')
            return

        df = self.grab_data()

        # Exact duplicates against our own dump collection are removed first;
        # the fuzzy name/zip pass below only runs against check_collection.
        if client[self.dump_collection].estimated_document_count() > 0:
            logger.info(
                f'purging duplicates from existing {self.source} collection')
            df = self.purge_collection_duplicates(df, client)

        # Optional roll-up step -- presumably aggregates rows describing the
        # same service when groupby_columns is configured; confirm against
        # aggregate_service_summary.
        if self.groupby_columns is not None:
            df = self.aggregate_service_summary(df)

        if client[self.check_collection].estimated_document_count() == 0:
            # No need to check for duplicates in an empty collection
            insert_services(df.to_dict('records'), client,
                            self.dump_collection)
        else:
            logger.info('refreshing ngrams')
            refresh_ngrams(client, self.check_collection)
            found_duplicates = []
            logger.info('checking for duplicates in the services collection')
            for i in tqdm(range(len(df))):
                dc = locate_potential_duplicate(df.loc[i, 'name'],
                                                df.loc[i, 'zip'], client,
                                                self.check_collection)
                # False signals "no candidate found"; anything else is a
                # candidate name handed to the similarity check.
                if dc is not False:
                    if check_similarity(df.loc[i, 'name'], dc):
                        found_duplicates.append(i)
            duplicate_df = df.loc[found_duplicates].reset_index(drop=True)
            if len(duplicate_df) > 0:
                logger.info(
                    f'inserting services dupes into the {self.source} dupe collection'
                )
                insert_services(duplicate_df.to_dict('records'), client,
                                self.dupe_collection)
            df = df.drop(found_duplicates).reset_index(drop=True)
            logger.info(f'final df shape: {df.shape}')
            self.add_required_fields(df)
            if len(df) > 0:
                insert_services(df.to_dict('records'), client,
                                self.dump_collection)
                logger.info(
                    'updating last scraped date in data-sources collection')
                # upsert=True creates the tracking document on the first run.
                client['data-sources'].update_one(
                    {"name": self.data_source_collection_name}, {
                        '$set': {
                            'last_scraped':
                            datetime.now(timezone('UTC')).replace(
                                microsecond=0).isoformat()
                        }
                    },
                    upsert=True)