Code Example #1
    def get_id_period(self, this_date):

        final_result = []

        #response = self.br.open(self._weekly_url)

        from_dt, to_dt = scrapeutils.inc_dt(this_date, self._period_type)
        # strip the leading zeros from day and month that strftime always emits
        date_to = to_dt.strftime('X%d/X%m/%Y').replace('X0', 'X').replace('X', '')

        fields = {}
        fields.update(self._query_fields)
        fields[self._date_field] = date_to
        response = self.br.open(self._results_url, urllib.urlencode(fields))
        #scrapeutils.setup_form(self.br, self._search_form, fields)
        #self.logger.debug("ID batch form: %s", str(self.br.form))
        #response = scrapeutils.submit_form(self.br)

        if response:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("ID batch page html: %s", html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
        else:
            return [], None, None

        return final_result, from_dt, to_dt  # note weekly result can be legitimately empty
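
Example #1 fetches a single results page for the week ending on to_dt. The strftime('X%d/X%m/%Y') / replace chain strips the leading zeros that strftime itself always emits for day and month. A minimal standalone sketch of that trick, using a hypothetical sample date:

    from datetime import date

    to_dt = date(2021, 3, 7)
    # 'X07/X03/2021' -> 'X7/X3/2021' -> '7/3/2021'
    date_to = to_dt.strftime('X%d/X%m/%Y').replace('X0', 'X').replace('X', '')
    print(date_to)  # 7/3/2021
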
Code Example #2
    def get_id_period(self, this_date):

        final_result = []
        from_dt, to_dt = scrapeutils.inc_dt(this_date, self._period_type)
        do_dt = to_dt  # the date being tested - can change
        rurl = urlparse.urljoin(self._search_url, self._results_page)

        # note: works backwards through all 5 possible week days, as some lists are not published exactly on a Friday
        for i in range(5):

            fields = {}
            fields.update(self._query_fields)
            fields[self._date_field] = do_dt.strftime(
                self._request_date_format)
            response = self.br.open(rurl + '?' + urllib.urlencode(fields))

            page_count = 0
            max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
            while response and page_count < max_pages:
                html = response.read()
                url = response.geturl()
                #self.logger.debug("ID batch page html: %s", html)
                result = scrapemark.scrape(self._scrape_ids, html, url)
                if result and result.get('records'):
                    page_count += 1
                    self._clean_ids(result['records'])
                    final_result.extend(result['records'])
                else:
                    self.logger.debug("Empty result after %d pages",
                                      page_count)
                    break
                try:
                    # remove internal junk characters before extracting the next-page link
                    new_html = self._junk_regex.sub('', html)
                    result = scrapemark.scrape(self._scrape_next_link, new_html, url)
                    # remove all spaces from the extracted link
                    next_link = self._space_regex.sub('', result['next_link'])
                    response = self.br.open(next_link)
                except:
                    self.logger.debug("No next link after %d pages",
                                      page_count)
                    break
            do_dt = do_dt - timedelta(days=1)  # try again with a different date

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result, from_dt, to_dt  # note weekly result can be legitimately empty
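
Example #2 retries the same search on up to five successively earlier dates, because the weekly list is not always published exactly on the nominal end of the period. A minimal sketch of that date walk, with a hypothetical starting date:

    from datetime import date, timedelta

    do_dt = date(2021, 3, 5)  # nominal end of the weekly period
    for i in range(5):
        print(do_dt.isoformat())  # the date submitted in the search fields on this pass
        do_dt = do_dt - timedelta(days=1)  # step back one day and try again
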
Code Example #3
    def get_id_period(self, this_date):

        final_result = []
        month = this_date.month
        year = this_date.year
        from_dt, to_dt = scrapeutils.inc_dt(
            this_date, self._period_type)  # date range of month
        """ this kludge no longer required
        # note this is not calendar month = 
        # = from Monday of the first week fully in the month
        # = to Sunday of the last week of the month (can be in the next month)
        first_dt, last_dt = scrapeutils.inc_dt(this_date, self._period_type) # date range of month  
        dummy1, from_dt = scrapeutils.inc_dt(first_dt, 'Monday') # window starts at first Monday within month
        if this_date < from_dt: # use preceding month, if target date before window start
            month = month - 1
            if month < 1:
                month = 12
                year = year - 1
            new_date = date(year, month, 1) # first day of preceding month
            first_dt, last_dt = scrapeutils.inc_dt(new_date, self._period_type) # date range of preceding month
            dummy1, from_dt = scrapeutils.inc_dt(first_dt, 'Monday') # first Monday within month
        dummy2, to_dt = scrapeutils.inc_dt(last_dt, 'Sunday') # last Sunday (can be in next month)
        self.logger.debug("Month date extent: %s %s" % (from_dt.isoformat(), to_dt.isoformat()))"""

        # open the search page first; this response is not used directly
        response = self.br.open(self._search_url)

        fields = {}
        fields.update(self._search_fields)
        fields['month'] = str(month)
        fields['year'] = str(year)

        #scrapeutils.setup_form(self.br, self._search_form, fields)
        #self.logger.debug("ID batch form: %s", str(self.br.form))
        #response = scrapeutils.submit_form(self.br)

        response = self.br.open(self._search_url, urllib.urlencode(fields))

        if response:
            html = response.read()
            url = response.geturl()
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                self._clean_ids(result['records'])
                final_result.extend(result['records'])

        if final_result:
            return final_result, from_dt, to_dt
        else:
            return [], None, None  # monthly scraper - so empty result is always invalid
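
Example #3 searches by calendar month, posting month and year as separate form fields rather than filling in the search form via mechanize (the commented-out setup_form/submit_form lines show the older approach). A minimal sketch of the direct POST, with a hypothetical URL and Python 2 libraries to match the snippet:

    import urllib
    import urllib2  # Python 2; urllib.request on Python 3

    search_url = 'https://example.gov.uk/planning/monthlyList'  # hypothetical stand-in for self._search_url
    fields = {'month': str(3), 'year': str(2021)}
    response = urllib2.urlopen(search_url, urllib.urlencode(fields))  # supplying data makes this a POST
    html = response.read()
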
Code Example #4
    def get_id_period(self, this_date):

        final_result = []
        from_dt, to_dt = scrapeutils.inc_dt(this_date, self._period_type)

        monyear = this_date.strftime(self._request_date_format)  # 4 digit MMYY string
        response = self.br.open(self._search_url + '?q=' + monyear)

        page_count = 0
        max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
        while response and page_count < max_pages:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("ID batch page html: %s", html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                new_result = []
                for rec in result['records']:
                    # filter out records where the search term does not appear in the correct part of the uid
                    if '/' + monyear + '/' in rec['uid']:
                        new_result.append(rec)
                self._clean_ids(new_result)
                final_result.extend(new_result)
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            try:
                response = self.br.follow_link(text=self._next_page_link)
            except:
                self.logger.debug("No next link after %d pages", page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        if final_result:
            return final_result, from_dt, to_dt
        else:
            return [], None, None  # monthly scraper - so empty result is always invalid
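
Example #4 keys the search on a four-digit MMYY string and then keeps only records whose uid contains that string between slashes, discarding uids that merely happen to contain the digits elsewhere. A minimal sketch of that filter, with hypothetical record uids:

    monyear = '0321'  # MMYY, as produced by this_date.strftime(self._request_date_format)
    records = [{'uid': 'APP/0321/0042'}, {'uid': 'APP/0320/0321'}]
    new_result = []
    for rec in records:
        if '/' + monyear + '/' in rec['uid']:
            new_result.append(rec)
    print(new_result)  # keeps only APP/0321/0042
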
Code Example #5
    def get_id_period(self, this_date):

        final_result = []
        from_dt, to_dt = scrapeutils.inc_dt(this_date, self._period_type)
        
        fields = {}
        fields.update(self._query_fields)
        fields[self._date_field['month']] = str(this_date.month)
        fields[self._date_field['year']] = str(this_date.year)

        page_count = 0
        max_pages = (2 * self.min_id_goal / 5) + 20 # guard against infinite loop  
        for i in range(1, 3):
            fields['Status'] = str(i)
            #response = self.br.open(self._results_url + '?' + urllib.urlencode(fields))
            this_url = self._results_url + '?' + urllib.urlencode(fields)
            response = self.rs.get(this_url, verify=False, timeout=self._timeout)
            while response and page_count < max_pages:
                html, url = self._get_html(response)
                #self.logger.debug("ID batch page html: %s", html)
                result = scrapemark.scrape(self._scrape_ids, html, url)
                if result and result.get('records'):
                    page_count += 1
                    self._clean_ids(result['records'])
                    final_result.extend(result['records'])
                else:
                    self.logger.debug("Empty result after %d pages", page_count)
                    break
                try:
                    result = scrapemark.scrape(self._scrape_next_link, html, url)
                    #response = self.br.open(result['next_link'])
                    response = self.rs.get(result['next_link'], verify=False, timeout=self._timeout)
                except:
                    self.logger.debug("No next link after %d pages", page_count)
                    break
            
        if page_count >= max_pages:
            self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
            
        if final_result:
            return final_result, from_dt, to_dt
        else:
            return [], None, None # monthly scraper - so empty result is always invalid
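
Example #5 swaps the mechanize browser (self.br) for a requests session (self.rs) with certificate verification disabled and an explicit timeout, and repeats the search for two different Status values. A minimal sketch of the GET request it builds, with a hypothetical URL and fields:

    import urllib  # Python 2; urllib.parse on Python 3
    import requests

    fields = {'month': '3', 'year': '2021', 'Status': '1'}
    this_url = 'https://example.gov.uk/planning/results' + '?' + urllib.urlencode(fields)
    response = requests.get(this_url, verify=False, timeout=30)  # 30s is a hypothetical stand-in for self._timeout
    html = response.text
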
Code Example #6
    def get_id_period(self, this_date):

        final_result = []
        from_dt, to_dt = scrapeutils.inc_dt(this_date, self._period_type)
        timestamp = (datetime(to_dt.year, to_dt.month, to_dt.day) -
                     datetime(1970, 1, 1)).total_seconds()
        self.logger.debug("Timestamp: %f" % timestamp)
        fields = {}
        fields.update(self._query_fields)
        fields[self._date_field] = str(int(timestamp * 1000))
        response = self.br.open(self._search_url, urllib.urlencode(fields))

        page_count = 0
        max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
        while response and page_count < max_pages:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("ID batch page html: %s", html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break  # note weekly result allowed to be legitimately empty (for example in the current week)
            try:
                response = self.br.follow_link(text=self._next_page_link)
            except:
                self.logger.debug("No next link after %d pages", page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result, from_dt, to_dt  # weekly scraper - so empty result can be valid
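
Example #6 submits the end of the period as a Unix timestamp in milliseconds, computed against the 1970-01-01 epoch. A minimal standalone sketch of that conversion, with a hypothetical sample date:

    from datetime import date, datetime

    to_dt = date(2021, 3, 7)
    timestamp = (datetime(to_dt.year, to_dt.month, to_dt.day) -
                 datetime(1970, 1, 1)).total_seconds()
    print(str(int(timestamp * 1000)))  # 1615075200000 - the value posted in the date field
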
Code Example #7
    def get_id_period(self, this_date):

        final_result = []
        from_dt, to_dt = scrapeutils.inc_dt(this_date, self._period_type)

        url = self._result_url + urllib.quote_plus(
            to_dt.strftime(self._request_date_format))
        self.logger.debug("Search url: %s" % url)
        response = self.br.open(url)

        page_count = 0
        max_pages = (2 * self.min_id_goal / 10) + 20  # guard against infinite loop
        while response and page_count < max_pages:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("ID batch page html: %s", html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            try:
                response = self.br.follow_link(text=self._next_page_link)
            except:
                self.logger.debug("No next link after %d pages", page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result, from_dt, to_dt  # weekly scraper - so empty result can be valid
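
Example #7 builds the search URL by appending the URL-encoded period end date to a fixed prefix. A minimal sketch, with a hypothetical base URL and date format (Python 2 urllib; urllib.parse.quote_plus on Python 3):

    import urllib
    from datetime import date

    to_dt = date(2021, 3, 7)
    result_url = 'https://example.gov.uk/planning/weeklyList?date='  # hypothetical stand-in for self._result_url
    url = result_url + urllib.quote_plus(to_dt.strftime('%d/%m/%Y'))
    print(url)  # https://example.gov.uk/planning/weeklyList?date=07%2F03%2F2021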