def test_process_records():
    """Well-formed ads.txt rows are transformed into AdsRecord tuples."""
    # Mixed-case pub IDs must be preserved verbatim in the record.
    expected_mixed_case = transform.AdsRecord(
        supplier_domain='advertising.com',
        pub_id='78AbC123DeFasGFG',
        supplier_relationship='reseller',
        cert_authority=None)
    assert transform.process_row(DUMMY_FETCH_ROW_1) == DUMMY_FETCH_ROW_1_EXPECTED
    assert transform.process_row(DUMMY_FETCH_ROW_2) == DUMMY_FETCH_ROW_2_EXPECTED
    assert transform.process_row(MIXED_CASE_PUBID) == expected_mixed_case
def test_process_row_comments():
    """Comment handling: whole-line comments yield None, trailing comments are ignored."""
    # A row that is entirely a comment produces no record.
    assert transform.process_row(DUMMY_FETCH_ROW_COMMENT) is None
    # Rows with trailing comments still parse to their expected records.
    assert transform.process_row(
        DUMMY_FETCH_ROW_2_COMMENT) == DUMMY_FETCH_ROW_2_EXPECTED
    assert transform.process_row(
        DUMMY_SHORT_ROW_END_COMMENT) == DUMMY_SHORT_ROW_EXPECTED
    # FIX: the original repeated this exact assertion twice (copy-paste);
    # the duplicate added no coverage and has been removed.
    assert transform.process_row(
        DUMMY_SHORT_BAD_RESELLER) == DUMMY_SHORT_ROW_EXPECTED
def test_reseller_direct_none_extraction():
    """Relationship parsing accepts upper/lower case; unknown values give None."""
    # (input row, expected supplier_relationship) pairs.
    relationship_cases = [
        ("foo, 123123, RESELLER", 'reseller'),
        ("foo, 123123, reseller", 'reseller'),
        ("foo, 123123, DIRECT", 'direct'),
        ("foo, 123123, direct", 'direct'),
    ]
    for raw_row, expected in relationship_cases:
        assert transform.process_row(raw_row).supplier_relationship == expected
    # Anything other than reseller/direct is rejected outright.
    assert transform.process_row("foo, 123123, blah") is None
def test_return_row_too_small():
    """Rows with too few fields are rejected as invalid."""
    result = transform.process_row("foo.com, 1231312")
    assert result is None, "Assert too small rows are invalid."
def test_process_row_variables():
    """Variable rows are transformed to their expected values."""
    variable_cases = [
        (DUMMY_FETCH_VARIABLE_1, DUMMY_FETCH_VARIABLE_1_EXPECTED),
        (DUMMY_FETCH_VARIABLE_2, DUMMY_FETCH_VARIABLE_2_EXPECTED),
    ]
    for raw_row, expected in variable_cases:
        assert transform.process_row(raw_row) == expected
def process_domain(self, fetchdata: fetch.FetchResponse) -> None:
    """Process a domain's FetchResponse into inserted records and variables.

    Pipeline roughly goes as follows:
    1. Check FetchResponse data is valid; if not, update scraped_at and
       return. If it is valid, update the db_domain details we have.
    2. Iterate through the response tuple, checking what's currently in the
       database so we don't insert duplicate records or variables.
    3. Deactivate any previously-active records not seen in this fetch,
       then commit everything in one go.

    Args:
        fetchdata (FetchResponse): Named tuple of fetch data.

    Returns:
        None

    Raises:
        sqlalchemy.orm.exc.NoResultFound: if fetchdata.domain is not
            already present in the Domain table (see the .one() below).
    """
    # Setup a new SQL session bound to this instance's engine.
    session = self._session(bind=self.engine)
    # Fetch the domain row. This should always exist and will raise
    # sqlalchemy.orm.exc.NoResultFound if nothing is found.
    db_domain = session.query(
        models.Domain).filter_by(name=fetchdata.domain).one()
    LOG.debug('Processing fetchdata for %r', fetchdata.domain)
    LOG.debug('Using %r as db_domain.', db_domain)
    # Bad data from the endpoint: record the attempt and bail out early.
    if not fetchdata.response or not fetchdata.adstxt_present:
        # TODO: Passback more debug data on failure from fetches.
        LOG.debug('Bad AdsTxt file found, updating TTLs and returning.')
        # Bump last_updated so we don't try to re-scrape this domain
        # again too soon.
        db_domain.last_updated = fetchdata.scraped_at
        # adstxt_present is null at creation; explicitly set False as we
        # now know there is no ads.txt file.
        db_domain.adstxt_present = False
        session.add(db_domain)
        session.commit()
        return
    # Else we've got a valid record from Fetch. Update the db_domain
    # details we hold locally but don't commit until the end.
    else:
        db_domain.last_updated = fetchdata.scraped_at
        db_domain.adstxt_present = True
        session.add(db_domain)
    # Every AdsRecord seen in this fetch; used after the loop to decide
    # which previously-active DB records should be deactivated.
    processed_records = []
    for row in fetchdata.response:
        # process_row returns an AdsRecord, an AdsVariable, or None
        # (the isinstance checks below dispatch on that).
        processed_row = transform.process_row(row)
        # Check to see what the row is returning and process.
        if isinstance(processed_row, transform.AdsRecord):
            # Keep a list of records to compare back with.
            processed_records.append(processed_row)
            # Look for an identical record already in the Record table;
            # one_or_none() gives None when absent.
            try:
                record_exists = session.query(models.Record).filter_by(
                    domain=db_domain,
                    supplier_domain=processed_row.supplier_domain,
                    pub_id=processed_row.pub_id,
                    supplier_relationship=processed_row.
                    supplier_relationship,
                    cert_authority=processed_row.cert_authority
                ).one_or_none()
            # Something in the query was bad. Skip to the next record.
            except SQLAlchemyError as excpt:
                LOG.exception('Unprocessible row. %r is bad due to %r',
                              processed_row, excpt)
                continue
            # If the record isn't present, insert it with fetchdata.
            if not record_exists:
                db_record = models.Record(
                    domain_id=db_domain.id,
                    supplier_domain=processed_row.supplier_domain,
                    pub_id=processed_row.pub_id,
                    supplier_relationship=processed_row.
                    supplier_relationship,
                    cert_authority=processed_row.cert_authority,
                    first_seen=fetchdata.scraped_at,
                    active=True)
                LOG.debug('Adding new record to database, %r', db_record)
                # NOTE(review): session.add() only stages the object in
                # the session; DBAPIError is typically raised at
                # flush/commit time, so this except is unlikely to ever
                # fire here — confirm intent.
                try:
                    session.add(db_record)
                except DBAPIError:
                    LOG.error('Unable to insert... %r', db_record)
            # If the record does exist, ensure it's active.
            else:
                # It's not active so reactivate the record.
                if not record_exists.active:
                    record_exists.active = True
                    # NOTE(review): this commits mid-loop, unlike every
                    # other path which defers to the final commit —
                    # confirm this early commit is deliberate.
                    session.commit()
                    LOG.debug(
                        'Record was found to be inactive, reactivating...')
        elif isinstance(processed_row, transform.AdsVariable):
            # Check for presence of the variable in the Variable table
            # (keyed by domain + variable key).
            variable_exists = session.query(models.Variable).filter_by(
                domain=db_domain, key=processed_row.key).first()
            if not variable_exists:
                # NOTE(review): the %r arguments look swapped relative
                # to the message wording (domain name fills the
                # "variable" slot) — confirm.
                LOG.debug('New variable %r inserted for %r',
                          db_domain.name, processed_row.key)
                db_variable = models.Variable(domain_id=db_domain.id,
                                              key=processed_row.key,
                                              value=processed_row.value)
                session.add(db_variable)
            elif variable_exists.value != processed_row.value:
                # Variable exists but its value changed; update in place.
                LOG.debug('Key %r for %r has been updated.',
                          variable_exists.key, db_domain.name)
                variable_exists.value = processed_row.value
                session.add(variable_exists)
            else:
                # Variable is there and is up to date.
                continue
        # Else process_row returned None (comment/invalid row); skip.
        else:
            continue
    # Validate that everything active in the Record table was also seen
    # in this fetch; anything not seen gets deactivated below.
    active_records = session.query(
        models.Record.supplier_domain, models.Record.pub_id,
        models.Record.supplier_relationship,
        models.Record.cert_authority).filter_by(domain_id=db_domain.id,
                                                active=True).all()
    # Set difference: rows active in the DB but absent from this fetch.
    # NOTE(review): this relies on the query's 4-column row tuples
    # comparing equal to AdsRecord tuples field-for-field in the same
    # order — confirm AdsRecord's field order matches the query columns.
    active_records_not_seen = set(active_records).difference(
        set(processed_records))
    # Mark all of these records as inactive.
    for record in active_records_not_seen:
        LOG.debug('%r was found to be inactive.', record)
        session.query(models.Record).filter_by(
            domain_id=db_domain.id,
            supplier_domain=record.supplier_domain,
            pub_id=record.pub_id,
            supplier_relationship=record.supplier_relationship,
            cert_authority=record.cert_authority).one().active = False
    # Domain is completely processed at this point. Commit all records.
    session.commit()
    LOG.debug('Session commited and domain processed.')