Example #1
0
        def get_finalcheck_offset():
            """ Return the finalcheck offset stored by the last spider run, as an int """

            db = DatabaseUtil(self.site, self.spider)
            stored = db.load_settings(fields="finalcheck_offset")
            return int(stored['finalcheck_offset'])
Example #2
0
    def get_finalcheck_urls(self, old_days, block_size):
        """ Fetch the next block of url_ids to finalcheck and move the stored offset forward.

        The saved finalcheck offset is read from the settings, then
        get_checking_urls is called with old_days, block_size, that offset and
        the flags 'H' and 'D'. Afterwards the offset is rewritten: advanced by
        the number of url_ids received, or reset to 0 when none came back.

        Returns whatever get_checking_urls returned (the url_ids).
        """

        # Where the previous run stopped.
        offset_row = DatabaseUtil(
            self.site, self.spider).load_settings(fields="finalcheck_offset")
        finalcheck_offset = int(offset_row['finalcheck_offset'])

        url_ids = DatabaseUtil(self.site, self.spider).get_checking_urls(
            old_days, block_size, finalcheck_offset, 'H', 'D')

        # An empty result resets the offset to 0; otherwise continue after
        # the ids just handed out.
        fetched = len(url_ids)
        next_offset = finalcheck_offset + fetched if fetched > 0 else 0
        DatabaseUtil(self.site, self.spider).write_settings(
            field="finalcheck_offset", value=next_offset)

        return url_ids
Example #3
0
    def load_settings(self):
        """ Load active, block_size, main_startid and recon_startid from the stored settings.

        Returns a dict with those four keys; 'active' is passed through as
        stored, the other three are converted to int. Any fallback to
        defaults would happen inside DatabaseUtil.load_settings, which is not
        visible from here.
        """

        raw = DatabaseUtil(self.site, self.spider).load_settings(
            fields="active, block_size, main_startid, recon_startid")

        loaded = {'active': raw['active']}
        # The numeric settings are normalised to int for the caller.
        for key in ('block_size', 'main_startid', 'recon_startid'):
            loaded[key] = int(raw[key])
        return loaded
Example #4
0
    def write_active(self, setting):
        """ Store the activation status `setting` in the field that matches this spider """

        # Spiders whose name contains 'check' (recheck / finalcheck) use their
        # own flag; every other spider (recon or main) uses the plain one.
        active_field = 'recheck_active' if 'check' in self.spider else 'active'

        db = DatabaseUtil(self.site, self.spider)
        db.write_settings(field=active_field, value=setting)
Example #5
0
    def load_settings(self):
        """ Load the finalcheck spider settings and the url_ids to check in this run.

        Reads recheck_active, finalcheck_olddays and block_size from the stored
        settings. Returns a dict with 'active' (the stored recheck_active
        value) and 'url_ids' (the result of get_finalcheck_urls for the stored
        finalcheck_olddays and block_size). Any fallback to defaults would
        happen inside DatabaseUtil.load_settings, which is not visible here.
        """

        stored = DatabaseUtil(self.site, self.spider).load_settings(
            fields="recheck_active, finalcheck_olddays, block_size")

        active = stored['recheck_active']
        url_ids = self.get_finalcheck_urls(stored['finalcheck_olddays'],
                                           stored['block_size'])
        return {'active': active, 'url_ids': url_ids}
Example #6
0
    def write_startid(self, setting):
        """ Store `setting` as the main_startid for this run of the spider """

        db = DatabaseUtil(self.site, self.spider)
        db.write_settings(field="main_startid", value=setting)
Example #7
0
    def write_cycles(self, setting):
        """ Store `setting` as the cycles value for this run of the spider """

        db = DatabaseUtil(self.site, self.spider)
        db.write_settings(field="cycles", value=setting)
Example #8
0
    def initialize_settings(self):
        """ Create the initial settings for a new site by delegating to DatabaseUtil """

        db = DatabaseUtil(self.site, self.spider)
        db.initialize_settings()
Example #9
0
        def write_finalcheck_offset(setting):
            """ Store `setting` as the finalcheck offset for this run of the spider """

            db = DatabaseUtil(self.site, self.spider)
            db.write_settings(field="finalcheck_offset", value=setting)
Example #10
0
def get_ids_for_vin(site, block_size):
    """ Return the url_ids for which VINs should be fetched, as given by DatabaseUtil """

    db = DatabaseUtil(site)
    return db.get_ids_for_vin(block_size)
Example #11
0
def extract_YMMT(data):
    """
        Parse a description to get year, make, model and trim.

        data: the description text to parse.

        Returns -1 when none of the known makes occurs in the description.
        Otherwise returns a dict with the keys 'year', 'make', 'model' and
        'trim', all strings:
          - 'year' is the first run of digits in the description, or '' when
            the description contains no digits;
          - 'make' is the matched make, spelled as it was found (as listed or
            upper-cased);
          - 'model' is the first n-gram of the remaining text that equals one
            of the make's models from the DB (case-insensitive), or '';
          - 'trim' is the rest of the line after the model, or ''.

        Relies on generate_ngrams and DatabaseUtil().get_all_models, both
        defined outside this function.
    """

    # a hard-coded list of makes to match make in description
    standard_makes = (
        'Acura',
        'Alfa Romeo',
        'AMC',
        'Aston Martin',
        'Audi',
        'Avanti',
        'Bentley',
        'BMW',
        'Buick',
        'Cadillac',
        'Chevrolet',
        'Chrysler',
        'Daewoo',
        'Daihatsu',
        'Datsun',
        'DeLorean',
        'Dodge',
        'Eagle',
        'Ferrari',
        'Fiat',
        'Fisker',
        'Ford',
        'Freightliner',
        'Geo',
        'GMC',
        'Honda',
        'Hummer',
        'Hyundai',
        'Infiniti',
        'Isuzu',
        'Jaguar',
        'Jeep',
        'Kia',
        'Lamborghini',
        'Lancia',
        'Land Rover',
        'Lexus',
        'Lincoln',
        'Lotus',
        'Maserati',
        'Maybach',
        'Mazda',
        'McLaren',
        'Mercedes-Benz',
        'Mercury',
        'Merkur',
        'Mini',
        'Mitsubishi',
        'Nissan',
        'Oldsmobile',
        'Peugeot',
        'Plymouth',
        'Pontiac',
        'Porsche',
        'Renault',
        'Rolls-Royce',
        'Saab',
        'Saturn',
        'Scion',
        'Smart',
        'SRT',
        'Sterling',
        'Subaru',
        'Suzuki',
        'Tesla',
        'Toyota',
        'Triumph',
        'Volkswagen',
        'Volvo',
        'Yugo',
        'Ram',
    )

    # The first run of digits is taken as the year. A description without
    # any digits must not crash here: the year simply stays empty.
    year_match = re.search(r'(\d+)', data)
    year = year_match.group(1) if year_match else ''

    # Look for a make from the manual list, first as listed, then upper-cased.
    # The list order decides which make wins when several occur.
    make = None
    for candidate in standard_makes:
        for spelling in (candidate, candidate.upper()):
            if spelling in data:
                make = spelling
                break
        if make:
            break
    if not make:
        # Can't find any make, exit the method here
        return -1

    # Drop the first occurrence of year and make so only model/trim text
    # is left for the n-gram matching below.
    if year:
        data = data.replace(year, '', 1)
    data = data.replace(make, '', 1).strip()

    model = ""
    trim = ""
    # Generate all ngrams from the description to match make and model pair
    ngrams = generate_ngrams(data)

    # Load all models of the make from the DB, normalised once so that each
    # n-gram needs only a single set lookup.
    all_models = DatabaseUtil().get_all_models(make)
    known_models = set(each.lower().strip() for each in all_models)

    for gram in ngrams:
        if gram.lower().strip() not in known_models:
            continue
        model = gram
        # The trim is whatever follows the model on the same line. The model
        # is escaped because it is literal text, not a pattern: names with
        # regex metacharacters (e.g. '+', '(') would otherwise break the
        # search or match the wrong span.
        trim_match = re.search(re.escape(model) + r'(.+)', data)
        if trim_match:
            trim = trim_match.group(1).strip()
        break

    return {
        'year': year,
        'make': make.strip(),
        'model': model.strip(),
        'trim': trim.strip()
    }
Example #12
0
def generate_ids(site):
    """ Generate ids for recon spider

    Loads recon_startid, block_size, cycles, cycles_limit and overs from the
    stored settings of `site`, derives a scan window from the cycle counter
    and returns an array('i') holding every 17th id of that window, each
    shifted by `overs`. When `overs` changed during this call the new value
    is written back to the settings (wrapped to 0 once it exceeds 10).
    """

    from array import array

    # Load settings of recon_startid, block_size, cycles, cycles_limit, overs
    settings = DatabaseUtil(site).load_settings(
        fields="recon_startid, block_size, cycles, cycles_limit, overs")
    # pass settings to variables
    old_startid = int(settings['recon_startid']) + 1
    cycles = int(settings['cycles'])
    cycles_limit = int(settings['cycles_limit'])
    block_size = int(settings['block_size'])
    stored_overs = int(settings['overs'])
    overs = stored_overs

    # Once the cycle counter reaches its limit every generated id is shifted
    # by one more.
    if cycles >= cycles_limit:
        overs += 1

    # All divisions below are integer divisions; `//` keeps them so
    # regardless of the interpreter's `/` semantics.
    # Counters of 100 and above are reduced to their hundreds.
    if cycles // 100 > 0:
        cycles = cycles // 100

    # Last decimal digit of the (possibly reduced) cycle counter
    cycles1 = cycles % 10

    # Second-to-last decimal digit of the (possibly reduced) cycle counter
    cycles2 = (cycles // 10) % 10

    # How many ID's to skip
    skip = 17

    # Where the new spider starts: cycles1 shifts the start by 0-9 ids
    new_startid = old_startid + cycles1

    # How far to jump ahead: cycles2 selects how many whole blocks to skip
    recon_startid = new_startid + cycles2 * block_size

    # The scan covers exactly one block beyond recon_startid
    end_id = recon_startid + block_size

    # Minimum (reduced) cycle count from which the start of the range is
    # checked again
    backcheck = 5

    # Create array (less memory than a list) of integers to generate urls from
    recon_list = array('i', xrange(recon_startid, end_id, skip))

    # From `backcheck` cycles on, double back without jumping.
    # NOTE(review): the stop is new_startid + skip // 2 while the step is
    # skip, so this range only ever yields new_startid itself. The earlier
    # comment spoke of halving the skip to double the check intensity, which
    # suggests skip // 2 was meant as the step - confirm the intent.
    if cycles >= backcheck:
        backcheck_list = array(
            'i', xrange(new_startid, new_startid + skip // 2, skip))
        recon_list.extend(backcheck_list)

    recon_list = array('i', (candidate + overs for candidate in recon_list))

    # Persist overs only when it changed. The ids above were shifted by the
    # un-wrapped value; the wrap to 0 affects only what is stored.
    if overs != stored_overs:
        if overs > 10:
            overs = 0
        DatabaseUtil(site).write_settings(field="overs", value=overs)

    return recon_list