Example #1
0
def test_process_records():
    """process_row turns each record fixture into its expected value."""
    # Fixture rows paired with the value process_row should return for them.
    fixture_cases = (
        (DUMMY_FETCH_ROW_1, DUMMY_FETCH_ROW_1_EXPECTED),
        (DUMMY_FETCH_ROW_2, DUMMY_FETCH_ROW_2_EXPECTED),
    )
    for raw_row, expected in fixture_cases:
        assert transform.process_row(raw_row) == expected

    # The expected pub_id is mixed-case; the fixture name suggests the point
    # of this case is that the publisher ID casing is left untouched.
    mixed_case_expected = transform.AdsRecord(
        supplier_domain='advertising.com',
        pub_id='78AbC123DeFasGFG',
        supplier_relationship='reseller',
        cert_authority=None)
    assert transform.process_row(MIXED_CASE_PUBID) == mixed_case_expected
Example #2
0
def test_process_row_comments():
    """Check process_row against the comment-related fixtures.

    One fixture must produce no result at all (None); the others must
    still produce the expected parsed value.
    """
    # This fixture is expected to yield nothing.
    assert transform.process_row(DUMMY_FETCH_ROW_COMMENT) is None

    # These fixtures must still parse to their expected values.
    assert transform.process_row(
        DUMMY_FETCH_ROW_2_COMMENT) == DUMMY_FETCH_ROW_2_EXPECTED
    assert transform.process_row(
        DUMMY_SHORT_ROW_END_COMMENT) == DUMMY_SHORT_ROW_EXPECTED
    # NOTE: this assertion used to appear twice, byte-for-byte; the
    # redundant copy was removed.
    assert transform.process_row(
        DUMMY_SHORT_BAD_RESELLER) == DUMMY_SHORT_ROW_EXPECTED
Example #3
0
def test_reseller_direct_none_extraction():
    """The relationship field is matched case-insensitively.

    'RESELLER'/'reseller' and 'DIRECT'/'direct' must come back as the
    lower-case relationship; any other value makes process_row return None.
    """
    # (raw row, expected supplier_relationship) pairs.
    relationship_cases = (
        ("foo, 123123, RESELLER", 'reseller'),
        ("foo, 123123, reseller", 'reseller'),
        ("foo, 123123, DIRECT", 'direct'),
        ("foo, 123123, direct", 'direct'),
    )
    for raw_row, expected_relationship in relationship_cases:
        parsed = transform.process_row(raw_row)
        assert parsed.supplier_relationship == expected_relationship

    # An unrecognised relationship value invalidates the whole row.
    unknown_relationship_row = "foo, 123123, blah"
    assert transform.process_row(unknown_relationship_row) is None
Example #4
0
def test_return_row_too_small():
    """A row with only two fields must be rejected (None is returned)."""
    two_field_row = "foo.com, 1231312"
    parsed = transform.process_row(two_field_row)
    assert parsed is None, "Assert too small rows are invalid."
Example #5
0
def test_process_row_variables():
    """process_row turns each variable fixture into its expected value."""
    # Fixture rows paired with the value process_row should return for them.
    variable_cases = (
        (DUMMY_FETCH_VARIABLE_1, DUMMY_FETCH_VARIABLE_1_EXPECTED),
        (DUMMY_FETCH_VARIABLE_2, DUMMY_FETCH_VARIABLE_2_EXPECTED),
    )
    for raw_row, expected in variable_cases:
        assert transform.process_row(raw_row) == expected
Example #6
0
    def process_domain(self, fetchdata: fetch.FetchResponse) -> None:
        """Process a domain's FetchResponse into inserted records and variables.

        Pipeline roughly goes as follows.
        1. Check the FetchResponse data is valid.  If it is not, update the
            domain's last_updated / adstxt_present fields, commit and
            return.  If it is valid, update the db_domain details we have.
        2. Iterate through the response rows, checking what's currently in
            the database so we don't insert duplicate records or variables.
        3. Mark every active record of the domain that was not present in
            this response as inactive.
        4. Commit everything in a single transaction.

        The session opened here is closed on every exit path, including
        when an exception propagates, so uncommitted work is discarded and
        its connection resources are released.

        Args:
            fetchdata (FetchResponse): Named tuple of fetch data.

        Returns:
            None

        Raises:
            sqlalchemy.orm.exc.NoResultFound: If the domain named in
                fetchdata is not present in the database.
        """
        # Setup a new SQL session.
        session = self._session(bind=self.engine)

        try:
            # Fetch domain from database. This should always exist and will
            # raise an sqlalchemy.orm.exc.NoResultFound if nothing is found.
            db_domain = session.query(
                models.Domain).filter_by(name=fetchdata.domain).one()

            LOG.debug('Processing fetchdata for %r', fetchdata.domain)
            LOG.debug('Using %r as db_domain.', db_domain)

            # If we've got bad data from an endpoint, log this and return.
            if not fetchdata.response or not fetchdata.adstxt_present:
                # TODO: Passback more debug data on failure from fetches.
                LOG.debug(
                    'Bad AdsTxt file found, updating TTLs and returning.')
                # Update the last updated at row so we don't try and
                # update the record again too soon.
                db_domain.last_updated = fetchdata.scraped_at
                # This is set to null at creation, explicitly set to False
                # as we know that there is not one now.
                db_domain.adstxt_present = False
                session.add(db_domain)
                session.commit()
                return

            # We've got a valid record from Fetch.  Update the db_domain
            # details we hold locally but don't commit until the end.
            db_domain.last_updated = fetchdata.scraped_at
            db_domain.adstxt_present = True
            session.add(db_domain)

            # Every record seen in this response; compared against the
            # database afterwards to find records that have disappeared.
            processed_records = []
            for row in fetchdata.response:
                processed_row = transform.process_row(row)

                if isinstance(processed_row, transform.AdsRecord):
                    # Keep a list of records to compare back with.
                    processed_records.append(processed_row)

                    # Check for presence of record in existing Record table.
                    try:
                        record_exists = session.query(
                            models.Record).filter_by(
                                domain=db_domain,
                                supplier_domain=processed_row.supplier_domain,
                                pub_id=processed_row.pub_id,
                                supplier_relationship=(
                                    processed_row.supplier_relationship),
                                cert_authority=processed_row.cert_authority
                            ).one_or_none()
                    # Something in the query was bad. Skip to the next row.
                    except SQLAlchemyError as excpt:
                        LOG.exception(
                            'Unprocessible row. %r is bad due to %r',
                            processed_row, excpt)
                        continue

                    if not record_exists:
                        # The record isn't present, insert with fetchdata.
                        db_record = models.Record(
                            domain_id=db_domain.id,
                            supplier_domain=processed_row.supplier_domain,
                            pub_id=processed_row.pub_id,
                            supplier_relationship=(
                                processed_row.supplier_relationship),
                            cert_authority=processed_row.cert_authority,
                            first_seen=fetchdata.scraped_at,
                            active=True)
                        LOG.debug(
                            'Adding new record to database, %r', db_record)
                        # NOTE(review): add() only stages the object, so a
                        # database error would normally surface at flush or
                        # commit rather than here -- confirm this handler
                        # is still wanted.
                        try:
                            session.add(db_record)
                        except DBAPIError:
                            LOG.error('Unable to insert... %r', db_record)
                    elif not record_exists.active:
                        # The record exists but was deactivated earlier;
                        # reactivate it.  No commit here: the change is
                        # written by the final commit so the whole domain
                        # is processed in one transaction.
                        record_exists.active = True
                        LOG.debug(
                            'Record was found to be inactive, '
                            'reactivating...')

                elif isinstance(processed_row, transform.AdsVariable):
                    # Check for presence of variable in Variable table.
                    variable_exists = session.query(
                        models.Variable).filter_by(
                            domain=db_domain, key=processed_row.key).first()

                    if not variable_exists:
                        # Key first, then domain, matching the wording of
                        # the message.
                        LOG.debug('New variable %r inserted for %r',
                                  processed_row.key, db_domain.name)
                        db_variable = models.Variable(
                            domain_id=db_domain.id,
                            key=processed_row.key,
                            value=processed_row.value)
                        session.add(db_variable)
                    elif variable_exists.value != processed_row.value:
                        LOG.debug('Key %r for %r has been updated.',
                                  variable_exists.key, db_domain.name)
                        variable_exists.value = processed_row.value
                        session.add(variable_exists)
                    # Otherwise the variable is there and up to date.

                # Any other result (e.g. None) is skipped.

            # Validate that everything active in the records table is also
            # in our list of processed rows.
            active_records = session.query(
                models.Record.supplier_domain, models.Record.pub_id,
                models.Record.supplier_relationship,
                models.Record.cert_authority).filter_by(
                    domain_id=db_domain.id, active=True).all()

            # Find what's in active_records but is not in processed_records.
            active_records_not_seen = set(active_records).difference(
                processed_records)
            # Set all of these records as inactive.
            for record in active_records_not_seen:
                LOG.debug('%r was found to be inactive.', record)
                session.query(models.Record).filter_by(
                    domain_id=db_domain.id,
                    supplier_domain=record.supplier_domain,
                    pub_id=record.pub_id,
                    supplier_relationship=record.supplier_relationship,
                    cert_authority=record.cert_authority
                ).one().active = False

            # Domain is completely processed at this point.  Commit all
            # records.
            session.commit()
            LOG.debug('Session commited and domain processed.')
        finally:
            # Release the session whether we committed, returned early or
            # failed; closing discards any transaction still in progress.
            session.close()