Example #1
    def get_id_batch(self, date_from, date_to):

        this_dt = date_from
        final_result = []
        
        while this_dt <= date_to:
            
            response = self.br.open(self._search_url)
            #self.logger.debug("Start html: %s", response.read())
    
            fields = {}
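            # search a single day at a time: both the from and to fields are set to the current day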
            fields[self._date_from_field] = this_dt.strftime(self._request_date_format)
            fields[self._date_to_field] = this_dt.strftime(self._request_date_format)
            scrapeutils.setup_form(self.br, self._search_form, fields)
            #self.logger.debug("ID batch form: %s", str(self.br.form))
            response = scrapeutils.submit_form(self.br)
            
            html = response.read()
            #self.logger.debug("ID batch page html: %s", html)
            try:
                result = scrapemark.scrape(self._scrape_max_recs, html)
                max_recs = int(result['max_recs'])
            except:
                max_recs = 0
            
            interim_result = []
            page_count = 0
            max_pages = (2 * self.min_id_goal / 10) + 20 # guard against infinite loop
            while response and len(interim_result) < max_recs and page_count < max_pages:
                url = response.geturl()
                #self.logger.debug("ID batch page html: %s", html)
                result = scrapemark.scrape(self._scrape_ids, html, url)
                if result and result.get('records'):
                    page_count += 1
                    for rec in result['records']:
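                        # stamp each record with the day being searched, under the scraper's date-type key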
                        rec[self._date_type] = fields[self._date_to_field]
                    self._clean_ids(result['records'])
                    interim_result.extend(result['records'])
                else:
                    self.logger.debug("Empty result after %d pages", page_count)
                    break
                if len(interim_result) >= max_recs: break
                try:
                    result = scrapemark.scrape(self._scrape_next_submit, html)
                    next_submit = result['next_submit']
                    scrapeutils.setup_form(self.br, self._search_form)
                    self.logger.debug("ID next form: %s", str(self.br.form))
                    response = scrapeutils.submit_form(self.br, next_submit)
                    html = response.read()
                except: # failure to find next page link at end of page sequence here
                    self.logger.debug("No next form link after %d pages", page_count)
                    break
                    
            if page_count >= max_pages:
                self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
            
            final_result.extend(interim_result)
            this_dt += timedelta(days=1)
            
        return final_result
Example #2
 def get_html_from_uid(self, uid):
     response = self.br.open(self._search_url)
     #self.logger.debug("ID detail start html: %s", response.read())
     fields = self._applic_fields
     fields[self._ref_field] = uid
     scrapeutils.setup_form(self.br, self._search_form, fields)
     response = scrapeutils.submit_form(self.br, self._search_submit)
     html = response.read()
     sub_html = self._BADCHARS_REGEX.sub(' ', html)
     #self.logger.debug("detail page html: %s", sub_html)
     expired = scrapemark.scrape(self._scrape_expired, sub_html)
     while expired:
         response = self.br.reload()
         html = response.read()
         sub_html = self._BADCHARS_REGEX.sub(' ', html)
         expired = scrapemark.scrape(self._scrape_expired, sub_html)
     url = response.geturl()
     result = scrapemark.scrape(self._scrape_ids, sub_html, url)
     if result and result.get('records'):
         self._clean_ids(result['records'])
         for r in result['records']:
             if r.get('uid', '') == uid and r.get('url'):
                 self.logger.debug("Scraped url: %s", r['url'])
                 return self.get_html_from_url(r['url'])
     return None, None
Example #3
    def get_id_batch(self, date_from, date_to):

        final_result = []

        for case in self._case_prefixes:

            interim_result = []
            response = self.br.open(self._search_url)
            #self.logger.debug("Start html: %s", response.read())

            fields = {self._ref_field: case}
            fields[self._date_from_field] = date_from.strftime(
                self._request_date_format)
            fields[self._date_to_field] = date_to.strftime(
                self._request_date_format)
            scrapeutils.setup_form(self.br, self._search_form, fields)
            self.logger.debug("ID batch form: %s", str(self.br.form))
            response = scrapeutils.submit_form(self.br)

            page_count = 0
            max_pages = (2 * self.min_id_goal /
                         10) + 20  # guard against infinite loop
            while response and page_count < max_pages:
                html = response.read()
                url = response.geturl()
                #self.logger.debug("ID batch page html: %s", html)
                result = scrapemark.scrape(self._scrape_ids, html, url)
                if result and result.get('records'):
                    page_count += 1
                    self._clean_ids(result['records'])
                    interim_result.extend(result['records'])
                elif not interim_result:  # is it a single record?
                    single_result = scrapemark.scrape(self._scrape_one_id,
                                                      html, url)
                    if single_result:
                        self._clean_record(single_result)
                        interim_result = [single_result]
                        break
                else:
                    self.logger.debug("Empty result after %d pages",
                                      page_count)
                    break
                try:
                    result = scrapemark.scrape(self._scrape_next_link, html,
                                               url)
                    response = self.br.open(result['next_link'])
                except:
                    self.logger.debug("No next link after %d pages",
                                      page_count)
                    break

            if page_count >= max_pages:
                self.logger.warning(
                    "Too many page requests - %d - probable run away loop" %
                    page_count)

            final_result.extend(interim_result)

        return final_result
Example #4
    def get_id_batch(self, date_from, date_to):

        final_result = []
        response = self.br.open(self._search_url)
        #self.logger.debug("Start html: %s", response.read())

        fields = {}
        fields.update(self._search_fields)
        fields[self._date_from_field] = date_from.strftime(
            self._request_date_format)
        fields[self._date_to_field] = date_to.strftime(
            self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br, self._search_submit)

        html = response.read()
        #self.logger.debug("ID batch page html: %s", html)
        try:
            result = scrapemark.scrape(self._scrape_max_recs, html)
            max_recs = int(result['max_recs'])
        except:
            max_recs = 0

        page_count = 0
        max_pages = (2 * self.min_id_goal /
                     10) + 20  # guard against infinite loop
        id_list = []
        while response and len(
                final_result) < max_recs and page_count < max_pages:
            url = response.geturl()
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                # add IDs one by one and test for duplicates
                for r in result['records']:
                    if r['uid'] not in id_list:
                        final_result.append(r)
                        id_list.append(r['uid'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            if len(final_result) >= max_recs: break
            try:
                response = self.br.follow_link(text=self._next_link)
                html = response.read()
                #self.logger.debug("ID next page html: %s", html)
            except:  # normal failure to find next page form at end of page sequence here
                self.logger.debug("No next link after %d pages", page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result
Example #5
    def get_id_batch(self, date_from, date_to):

        final_result = []
        response = self.br.open(self._search_url)
        #self.logger.debug("Start html: %s" % response.read())

        fields = {}
        fields[self._date_from_field] = date_from.strftime(
            self._request_date_format)
        fields[self._date_to_field] = date_to.strftime(
            self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)

        html = response.read()
        #self.logger.debug("ID batch page html: %s", html)
        try:
            result = scrapemark.scrape(self._scrape_max_recs, html)
            max_recs = int(result['max_recs'])
        except:
            max_recs = 0  # note max recs is in the footer which is omitted if only one page of results

        page_count = 0
        max_pages = (2 * self.min_id_goal /
                     10) + 20  # guard against infinite loop
        while response and page_count < max_pages:
            url = response.geturl()
            #self.logger.debug("Batch html: %s" % html)
            if max_recs == 0:  # one page, no footer
                result = scrapemark.scrape(self._scrape_ids_no_foot, html, url)
            else:
                result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            if max_recs == 0 or len(final_result) >= max_recs: break
            try:
                result = scrapemark.scrape(self._scrape_next_submit, html)
                scrapeutils.setup_form(self.br, self._search_form)
                #self.logger.debug("Next page form: %s", str(self.br.form))
                response = scrapeutils.submit_form(self.br,
                                                   result['next_submit'])
                html = response.read()
            except:  # normal failure to find next page form at end of page sequence here
                self.logger.debug("No next form after %d pages", page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result
Example #6
    def get_id_batch(self, date_from, date_to):

        final_result = []
        page_count = 0

        new_date_from = date_from - timedelta(
            days=1)  # NB from date is exclusive
        dfrom = new_date_from.strftime(self._request_date_format)
        dto = date_to.strftime(self._request_date_format)
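        # results appear to be paged 10 at a time via an offset value appended to the query string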
        url = self._search_url + '?' + self._page_params % (dfrom, dto,
                                                            page_count * 10)
        self.logger.debug("Start URL: %s", url)
        response = self.br.open(url)

        html = response.read()
        #self.logger.debug("ID batch page html: %s", html)
        try:
            result = scrapemark.scrape(self._scrape_max_recs1, html)
            max_recs = int(result['max_recs'])
        except:
            try:
                result = scrapemark.scrape(self._scrape_max_recs2, html)
                max_recs = int(result['max_recs'])
            except:
                max_recs = 0

        max_pages = (2 * self.min_id_goal /
                     10) + 20  # guard against infinite loop
        while response and len(
                final_result) < max_recs and page_count < max_pages:
            url = response.geturl()
            #self.logger.debug("ID batch page html: %s", html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            if len(final_result) >= max_recs: break
            try:
                url = self._search_url + '?' + self._page_params % (
                    dfrom, dto, page_count * 10)
                self.logger.debug("Next URL: %s", url)
                response = self.br.open(url)
                html = response.read()
            except:  # failure to find next page link at end of page sequence here
                self.logger.debug("No next link after %d pages", page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result
Example #7
    def get_id_batch(self, date_from, date_to):

        final_result = []

        fields = {}
        fields.update(self._search_fields)
        fields[self._date_from_field] = date_from.strftime(
            self._request_date_format)
        fields[self._date_to_field] = date_to.strftime(
            self._request_date_format)
        fields[self._uid_field] = ''

        self.logger.debug("Fields: %s", str(fields))
        url = self._search_url + '?' + urllib.urlencode(fields)
        response = self.br.open(url)

        html = response.read()
        #self.logger.debug("ID batch page html: %s", html)
        try:
            result = scrapemark.scrape(self._scrape_max_recs, html)
            max_recs = int(result['max_recs'])
        except:
            max_recs = 0

        page_count = 0
        max_pages = (2 * self.min_id_goal /
                     10) + 20  # guard against infinite loop
        while response and len(
                final_result) < max_recs and page_count < max_pages:
            url = response.geturl()
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            if len(final_result) >= max_recs: break
            try:
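                # page forward by requesting the next starting row (apparently 20 records per page)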
                nextp = (page_count * 20) + 1
                fields['is_NextRow'] = str(nextp)
                self.logger.debug("Next fields: %s", str(fields))
                url = self._search_url + '?' + urllib.urlencode(fields)
                response = self.br.open(url)
                html = response.read()
                #self.logger.debug("ID next page html: %s", html)
            except:  # normal failure to find next page form at end of page sequence here
                self.logger.debug("No next link after %d pages", page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result
Example #8
    def get_id_batch(self, date_from, date_to):

        final_result = []
        new_date_from = date_from - timedelta(
            days=1)  # start date is exclusive, decrement start date by one day
        date_to = date_to + timedelta(
            days=1)  # end date is exclusive, increment end date by one day
        response = self.br.open(self._search_url)
        #self.logger.debug("Start html: %s", response.read())

        fields = {}
        fields[self._date_from_field] = new_date_from.strftime(
            self._request_date_format)
        fields[self._date_to_field] = date_to.strftime(
            self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br, self._search_submit)

        html = response.read()
        #self.logger.debug("ID batch page html: %s", html)
        try:
            result = scrapemark.scrape(self._scrape_max_recs, html)
            max_recs = int(result['max_recs'])
        except:
            max_recs = 0

        page_count = 0
        max_pages = (2 * self.min_id_goal /
                     10) + 20  # guard against infinite loop
        while response and len(
                final_result) < max_recs and page_count < max_pages:
            url = response.geturl()
            #self.logger.debug("Batch html: %s" % html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            if len(final_result) >= max_recs: break
            try:
                result = scrapemark.scrape(self._scrape_next, html, url)
                response = self.br.open(result['next_link'])
                html = response.read()
            except:  # normal failure to find next page form at end of page sequence here
                self.logger.debug("No next link after %d pages", page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result
Example #9
    def get_id_batch(self, date_from, date_to):

        final_result = []
        response = self.br.open(self._search_url)
        self._adjust_response(response)
        #self.logger.debug("First page html: %s", response.read())

        self.logger.debug(scrapeutils.list_forms(self.br))

        fields = {}
        fields[self._date_from_field] = date_from.strftime(
            self._request_date_format)
        fields[self._date_to_field] = date_to.strftime(
            self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        action = self.br.form.action
        self.br.form.action = action.replace('https://', 'http://')
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)

        page_count = 0
        max_pages = (2 * self.min_id_goal /
                     10) + 20  # guard against infinite loop
        while response and page_count < max_pages:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("ID batch page html: %s", html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            try:
                if 'next_link' in self._scrape_next:
                    result = scrapemark.scrape(self._scrape_next, html, url)
                    response = self.br.open(result['next_link'])
                else:
                    scrapeutils.setup_form(self.br, self._scrape_next)
                    action = self.br.form.action
                    self.br.form.action = action.replace('https://', 'http://')
                    self.logger.debug("ID next form: %s", str(self.br.form))
                    response = scrapeutils.submit_form(self.br)
            except:  # normal failure to find next page form at end of page sequence here
                self.logger.debug("No next form/link after %d pages",
                                  page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result
Example #10
    def get_id_batch(self, date_from, date_to):

        final_result = []

        fields = {}
        fields.update(self._search_fields)
        fields[self._date_from_field] = date_from.strftime(
            self._request_date_format)
        fields[self._date_to_field] = date_to.strftime(
            self._request_date_format)
        response = self.br.open(self._search_url + '?' +
                                urllib.urlencode(fields))

        html = response.read()
        #self.logger.debug("ID batch page html: %s", html)
        try:
            result = scrapemark.scrape(self._scrape_max_recs, html)
            max_recs = int(result['max_recs'])
        except:
            max_recs = 0

        page_count = 0
        max_pages = (2 * self.min_id_goal /
                     10) + 20  # guard against infinite loop
        while response and len(
                final_result) < max_recs and page_count < max_pages:
            url = response.geturl()
            #self.logger.debug("Batch html: %s" % html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            elif not final_result:  # is it a single record?
                single_result = scrapemark.scrape(self._scrape_id, html, url)
                if single_result:
                    single_result['url'] = url
                    self._clean_record(single_result)
                    return [single_result]
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            if len(final_result) >= max_recs: break
            try:
                response = self.br.follow_link(text=self._link_next)
                html = response.read()
            except:  # normal failure to find next page form at end of page sequence here
                self.logger.debug("No next link after %d pages", page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result
Example #11
    def get_id_records(self, request_from, request_to, max_recs):
        if not request_from or not request_to or not max_recs:
            return [], None, None  # if any parameter invalid - try again next time
        from_rec = int(request_from)
        to_rec = int(request_to)
        num_recs = int(max_recs)
        if from_rec < 1:
            if to_rec < 1:  # both too small
                return [], None, None
            from_rec = 1
        if to_rec > num_recs:
            if from_rec > num_recs:  # both too large
                return [], None, None
            to_rec = num_recs

        final_result = []
        rfrom = None
        rto = None
        n = to_rec - from_rec + 1
        if self.over_sequence(to_rec):  # at max sequence and gathering forward
            ii, yy, from_rec = self.split_sequence(from_rec, True)
        else:
            ii, yy, from_rec = self.split_sequence(from_rec, False)
        to_rec = from_rec + n - 1
        in_current_year = False
        this_year = date.today().year
        for i in range(from_rec, to_rec + 1):
            index, year, new_seq = self.split_sequence(i)
            if year == this_year and index > 0:
                in_current_year = True
            if rfrom is None:
                rfrom = i
            rto = new_seq
            found = False
            for prefix in self._prefixes:
                uid = prefix + self.get_uid(index, year)
                html, url = self.get_html_from_uid(uid)
                result = scrapemark.scrape(self._scrape_min_data, html)
                if result and result.get('reference'):
                    final_result.append({'url': url, 'uid': uid})
                    found = True
                    break
            if not found:
                result = scrapemark.scrape(self._scrape_invalid_format, html)
                if result and result.get('invalid_format'):
                    self.logger.debug(
                        "No valid record for uid ?/%s/%s" %
                        (str(year), str(index).zfill(self._index_digits)))
                else:
                    return [], None, None  # not recognised as bad data - something is wrong - exit

        if not in_current_year or final_result:
            return final_result, rfrom, rto
        else:
            return [], None, None  # empty result is invalid if any of the results are in the current year
Example #12
    def get_id_batch(self, date_from, date_to):

        new_date_to = date_to + timedelta(
            days=1)  # increment end date by one day
        final_result = []
        response = self.br.open(self._search_url)
        #self.logger.debug("ID batch start html: %s", response.read())

        fields = {}
        fields[self._date_from_field] = date_from.strftime(
            self._request_date_format)
        fields[self._date_to_field] = new_date_to.strftime(
            self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)

        page_count = 0
        max_pages = (2 * self.min_id_goal /
                     10) + 20  # guard against infinite loop
        while response and page_count < max_pages:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("ID batch page html: %s", html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                for res in result['records']:
                    if res.get('uid'):  # one uid on 1 dec 2015 is empty
                        final_result.append(res)
            elif not final_result:  # is it a single record?
                single_result = scrapemark.scrape(self._scrape_one_id, html,
                                                  url)
                if single_result:
                    self._clean_record(single_result)
                    final_result = [single_result]
                    break
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            try:
                result = scrapemark.scrape(self._scrape_next_link, html, url)
                response = self.br.open(result['next_link'])
            except:
                self.logger.debug("No next link after %d pages", page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result
Example #13
    def get_id_period(self, this_date):

        final_result = []
        from_dt, to_dt = scrapeutils.inc_dt(this_date, self._period_type)
        do_dt = to_dt  # the date being tested - can change
        rurl = urlparse.urljoin(self._search_url, self._results_page)

        # note: works backwards through all 5 possible weekdays, as some lists are not published exactly on a Friday
        for i in range(5):

            fields = {}
            fields.update(self._query_fields)
            fields[self._date_field] = do_dt.strftime(
                self._request_date_format)
            response = self.br.open(rurl + '?' + urllib.urlencode(fields))

            page_count = 0
            max_pages = (2 * self.min_id_goal /
                         10) + 20  # guard against infinite loop
            while response and page_count < max_pages:
                html = response.read()
                url = response.geturl()
                #self.logger.debug("ID batch page html: %s", html)
                result = scrapemark.scrape(self._scrape_ids, html, url)
                if result and result.get('records'):
                    page_count += 1
                    self._clean_ids(result['records'])
                    final_result.extend(result['records'])
                else:
                    self.logger.debug("Empty result after %d pages",
                                      page_count)
                    break
                try:
                    new_html = self._junk_regex.sub(
                        '', html)  # remove internal junk characters
                    result = scrapemark.scrape(self._scrape_next_link,
                                               new_html, url)
                    next_link = self._space_regex.sub(
                        '', result['next_link'])  # remove all spaces
                    response = self.br.open(next_link)
                except:
                    self.logger.debug("No next link after %d pages",
                                      page_count)
                    break
            do_dt = do_dt - timedelta(
                days=1)  # try again with a different date

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result, from_dt, to_dt  # note weekly result can be legitimately empty
Example #14
 def max_sequence(self):
     response = self.br.open(self._search_url)  # one fixed page of records
     html = response.read()
     url = response.geturl()
     result1 = scrapemark.scrape(self._scrape_ids, html, url)
     result2 = scrapemark.scrape(self._scrape_ids_withdrawn, html,
                                 url)  # no longer listed
     total = 0
     if result1 and result1.get('records'):
         total += len(result1['records'])
     if result2 and result2.get('records'):
         total += len(result2['records'])
     return total if total else None
Example #15
    def get_id_batch(self, date_from, date_to):

        final_result = []
        response = self.br.open(self._search_url)
        
        scrapeutils.setup_form(self.br, self._search_form)
        self.logger.debug("Start form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)
        
        fields = {}
        fields.update(self._search_fields)
        fields[self._date_from_field] = date_from.strftime(self._request_date_format)
        fields[self._date_to_field] = date_to.strftime(self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br, self._search_submit)
        html = response.read()
        
        runaway_pages = (2 * self.min_id_goal / 10) + 20 # guard against infinite loop
        try:
            result = scrapemark.scrape(self._scrape_max_pages, html)
            max_pages = int(result['max_pages'])
        except:
            max_pages = runaway_pages
            
        page_count = 0
        while html and page_count < max_pages:
            url = response.geturl()
            #self.logger.debug("Batch html: %s" % html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            if page_count >= max_pages: break
            try:
                scrapeutils.setup_form(self.br, self._search_form)
                self.logger.debug("ID next form: %s", str(self.br.form))
                response = scrapeutils.submit_form(self.br, self._next_submit)
                html = response.read()
            except: # normal failure to find next page form at end of page sequence here
                self.logger.debug("No next form after %d pages", page_count)
                break
        
        if page_count >= runaway_pages:
            self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
        
        return final_result
Example #16
    def get_id_batch(self, date_from, date_to):

        final_result = []
        
        fields = {}
        fields.update(self._search_fields)
        fields[self._date_from_field] = date_from.strftime(self._request_date_format)
        fields[self._date_to_field] = date_to.strftime(self._request_date_format)
        
        self.logger.debug("Fields: %s", str(fields))
        query = urllib.urlencode(fields)
        url = urlparse.urljoin(self._search_url, self._results_page) + '?' + query
        response = self.br.open(url)
        
        html = response.read()
        #self.logger.debug("ID batch page html: %s", html)
        try:
            result = scrapemark.scrape(self._scrape_max_recs, html)
            max_recs = int(result['max_recs'])
        except:
            max_recs = 0
        
        page_count = 0
        max_pages = (2 * self.min_id_goal / 10) + 20 # guard against infinite loop
        while response and len(final_result) < max_recs and page_count < max_pages:
            url = response.geturl()
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            if len(final_result) >= max_recs: break
            try:
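                # ASP.NET WebForms paging: post back with __EVENTTARGET/__EVENTARGUMENT naming the next page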
                fields = { '__EVENTTARGET': self._next_target }
                fields['__EVENTARGUMENT'] = 'Page$' + str(page_count+1)
                scrapeutils.setup_form(self.br, self._search_form, fields)
                self.logger.debug("Next page form: %s", str(self.br.form))
                response = scrapeutils.submit_form(self.br)
                html = response.read()
                #self.logger.debug("ID next page html: %s", html)
            except: # normal failure to find next page link at end of page sequence here
                self.logger.debug("No next form after %d pages", page_count)
                break
        
        if page_count >= max_pages:
            self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
            
        return final_result
Example #17
    def get_id_batch(self, date_from, date_to):

        final_result = []
        response = self.br.open(self._search_url)
        self._adjust_response(response)
        #self.logger.debug("Start html: %s", response.read())

        fields = self._search_fields
        fields[self._date_from_field] = date_from.strftime(
            self._request_date_format)
        fields[self._date_to_field] = date_to.strftime(
            self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        #self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br, self._search_submit)

        page_count = 0
        max_pages = (2 * self.min_id_goal /
                     10) + 20  # guard against infinite loop
        while response and page_count < max_pages:
            html = response.read()
            url = response.geturl()
            sub_html = self._BADCHARS_REGEX.sub(' ', html)
            ##self.logger.debug("ID batch page html: %s", sub_html)
            result = scrapemark.scrape(self._scrape_ids, sub_html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            try:
                result = scrapemark.scrape(self._scrape_next_link, sub_html,
                                           url)
                #print result
                next_url = myutils.GAPS_REGEX.sub('', result['next_link'])
                self.logger.debug("ID next url: %s", next_url)
                response = self.br.open(next_url)
                self._adjust_response(response)
            except:  # normal failure to find next page link at end of page sequence here
                self.logger.debug("No next link after %d pages", page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result
Example #18
    def _get_results_pages(self, result_param, final_page=None):
        # note: returns a list of records, each annotated with its page number and record number
        final_result = []
        
        response = self.br.open(self._results_url + '?' + result_param) 
        html = response.read()
        #self.logger.debug("Batch html: %s" % html)
        
        try:
            result = scrapemark.scrape(self._scrape_max_recs, html)
            max_recs = int(result['max_recs'])
        except:
            max_recs = 0
            
        page_count = 0
        max_pages = (4 * self.min_id_goal / 10) + 20 # guard against infinite loop
        if not final_page:
            end_page = max_pages
        else:
            end_page = final_page
        while html and len(final_result) < max_recs and page_count < end_page:
            url = response.geturl()
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                recno = 0
                for r in result['records']:
                    recno += 1
                    r['pageno'] = page_count
                    r['recno'] = recno
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            if len(final_result) >= max_recs: break
            try:
                result = scrapemark.scrape(self._scrape_next_submit, html, url)
                scrapeutils.setup_form(self.br, self._result_form)
                #self.logger.debug("Next form: %s", str(self.br.form))
                response = scrapeutils.submit_form(self.br, result['next_submit'])
                html = response.read()
            except: # note this should never happen as we know the max_recs value
                self.logger.error("No next button after %d pages", page_count)
                return []

        if page_count >= max_pages:
            self.logger.warning("Too many page requests - %d - possible run away loop" % page_count)
            
        return final_result
Example #19
    def get_id_batch(self, date_from, date_to):

        final_result = []
        response = self.br.open(self._search_url)
        #self.logger.debug("Start html: %s", response.read())

        fields = {}
        fields.update(self._search_fields)
        fields[self._date_from_field] = date_from.strftime(
            self._request_date_format)
        fields[self._date_to_field] = date_to.strftime(
            self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)

        html = response.read()
        #self.logger.debug("ID batch page html: %s", html)
        result = scrapemark.scrape(self._scrape_max_pages, html)
        try:
            page_list = result['max_pages'].split()
            max_pages = len(page_list)
        except:
            max_pages = 1

        page_count = 0
        while response and page_count < max_pages:
            url = response.geturl()
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            if page_count >= max_pages: break
            try:
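                # build the next page URL by bumping the pageno query parameter in the current URL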
                next_url = re.sub(r'pageno=\d*&',
                                  'pageno=' + str(page_count + 1) + '&', url)
                self.logger.debug("ID next url: %s", next_url)
                response = self.br.open(next_url)
                html = response.read()
            except:  # normal failure to find next page form at end of page sequence here
                self.logger.debug("No next url after %d pages", page_count)
                break

        return final_result
Example #20
    def get_id_batch(self, date_from, date_to):

        final_result = []

        response = self.br.open(self._search_url)

        fields = {}
        fields.update(self._search_fields)
        date_from = date_from.strftime(self._request_date_format)
        date_parts = date_from.split('/')
        #fields[self._date_from_field['day']] = [ date_parts[0] ]
        #fields[self._date_from_field['month']] = [ date_parts[1] ]
        #fields[self._date_from_field['year']] = [ date_parts[2] ]
        fields[self._date_from_field['day']] = date_parts[0]
        fields[self._date_from_field['month']] = date_parts[1]
        fields[self._date_from_field['year']] = date_parts[2]
        date_to = date_to.strftime(self._request_date_format)
        date_parts = date_to.split('/')
        #fields[self._date_to_field['day']] = [ date_parts[0] ]
        #fields[self._date_to_field['month']] = [ date_parts[1] ]
        #fields[self._date_to_field['year']] = [ date_parts[2] ]
        fields[self._date_to_field['day']] = date_parts[0]
        fields[self._date_to_field['month']] = date_parts[1]
        fields[self._date_to_field['year']] = date_parts[2]
        #scrapeutils.setup_form(self.br, self._search_form, fields)
        #self.logger.debug("ID batch form: %s", str(self.br.form))
        #response = scrapeutils.submit_form(self.br, self._search_submit)

        response = self.br.open(self._search_url, urllib.urlencode(fields))

        page_count = 0
        max_pages = (2 * self.min_id_goal /
                     10) + 20  # guard against infinite loop
        while response and page_count < max_pages:
            url = response.geturl()
            html = response.read()
            self.logger.debug("Batch html: %s" % html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            try:
                scrapeutils.setup_form(self.br, self._next_form,
                                       self._next_fields)
                self.logger.debug("ID next form: %s", str(self.br.form))
                response = scrapeutils.submit_form(self.br)
            except:  # normal failure to find next page form at end of page sequence here
                self.logger.debug("No next form after %d pages", page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result
Example #21
    def get_id_batch(self, date_from, date_to):

        final_result = []
        new_date_to = date_to + timedelta(
            days=1)  # end date is exclusive, increment end date by one day

        fields = {}
        fields.update(self._search_fields)
        fields[self._date_from_field] = date_from.strftime(
            self._request_date_format)
        fields[self._date_to_field] = new_date_to.strftime(
            self._request_date_format)

        self.logger.debug("Fields: %s", str(fields))
        query = urllib.urlencode(fields)
        url = urlparse.urljoin(self._iframe_url, self._results_page)
        response = self.br.open(url, query)

        if response:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("Batch html: %s" % html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                self._clean_ids(result['records'])
                for res in result['records']:
                    if res.get('uid'):  # sometimes there are blank uids eg 18/3/15
                        final_result.append(res)
        return final_result
Example #22
    def get_id_records(self, request_from, request_to, max_recs):
        if not request_from or not request_to or not max_recs:
            return [], None, None  # if any parameter invalid - try again next time
        final_result = []
        from_rec = int(request_from)
        to_rec = int(request_to)
        num_recs = int(max_recs)
        if from_rec < 1:
            if to_rec < 1:  # both too small
                return [], None, None
            from_rec = 1
        if to_rec > num_recs:
            if from_rec > num_recs:  # both too large
                return [], None, None
            to_rec = num_recs

        rfrom = None
        rto = None
        for i in range(from_rec, to_rec + 1):
            uid = str(i)
            html, url = self.get_html_from_uid(uid)
            result = scrapemark.scrape(self._scrape_min_data, html)
            if result and result.get('reference'):
                final_result.append({'url': url, 'uid': uid})
                if rfrom is None:
                    rfrom = i
                rto = i
            else:
                self.logger.debug("No valid record for uid %s", uid)

        if final_result:
            return final_result, rfrom, rto
        else:
            return [], None, None  # list scraper - so empty result is always invalid
Example #23
    def get_id_period(self, this_date):

        final_result = []

        #response = self.br.open(self._weekly_url)

        from_dt, to_dt = scrapeutils.inc_dt(this_date, self._period_type)
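        # the strftime/X-replace trick strips leading zeros from the day and month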
        date_to = to_dt.strftime('X%d/X%m/%Y').replace('X0',
                                                       'X').replace('X', '')

        fields = {}
        fields.update(self._query_fields)
        fields[self._date_field] = date_to
        response = self.br.open(self._results_url, urllib.urlencode(fields))
        #scrapeutils.setup_form(self.br, self._search_form, fields)
        #self.logger.debug("ID batch form: %s", str(self.br.form))
        #response = scrapeutils.submit_form(self.br)

        if response:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("ID batch page html: %s", html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
        else:
            return [], None, None

        return final_result, from_dt, to_dt  # note weekly result can be legitimately empty
Example #24
 def _get_details(self, html, this_url):
     """ Scrapes detailed information for one record given html and url 
     - this is an optional hook to allow data from multiple linked pages to be merged """
     result = self._get_detail(html, this_url)
     if 'scrape_error' in result:
         return result
     try:
         temp_result = scrapemark.scrape(self._scrape_dates_link, html,
                                         this_url)
         dates_url = temp_result['dates_link']
         self.logger.debug("Dates url: %s", dates_url)
         response = self.br.open(dates_url)
         html, url = self._get_html(response)
     except:
         self.logger.warning("No link to dates page found")
     else:
         #self.logger.debug("Html obtained from dates url: %s", html)
         result2 = self._get_detail(html, url, self._scrape_dates_block,
                                    self._scrape_min_dates,
                                    self._scrape_optional_dates)
         if 'scrape_error' not in result2:
             result.update(result2)
         else:
             self.logger.warning("No information found on dates page")
     return result
Example #25
 def max_sequence(self):
     max_recs = None
     response = self.br.open(self._search_url)
     to_date = date.today() - timedelta(days=14)
     fields = {
         self._ref_field: '',
         self._date_field: to_date.strftime(self._request_date_format)
     }
     scrapeutils.setup_form(self.br, self._search_form, fields)
     response = scrapeutils.submit_form(self.br)
     html, url = self._get_html(response)
     result = scrapemark.scrape(self._scrape_ids, html, url)
     if result and result.get('records'):
         self._clean_ids(result['records'])
         num_recs = 0
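         # take the highest numeric uid seen in the results as the current maximum sequence number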
         for i in result['records']:
             try:
                 num = int(i['uid'])
                 if num > num_recs:
                     num_recs = num
             except:
                 pass
         self.logger.debug('Number of records %d' % num_recs)
         if num_recs > 0:
             max_recs = num_recs
     return max_recs
Example #26
 def get_html_from_uid(self, uid):
     if self._start_url:
         response = self.br.open(self._start_url)
     response = self.br.open(self._search_url)
     #self.logger.debug("search html: %s", response.read())
     #self.logger.debug(scrapeutils.list_forms(self.br))
     fields = {}
     fields.update(self._search_fields)
     if uid.isdigit():
         fields[self._ref_field] = uid
     else:
         fields[self._alt_ref_field] = uid
     scrapeutils.setup_form(self.br, self._ref_search_form, fields)
     #self.logger.debug("Uid form: %s", str(self.br.form))
     response = scrapeutils.submit_form(self.br, self._ref_search_submit)
     html, url = self._get_html(response)
     # note return here can be a single uid match page OR list of multiple matches
     result = scrapemark.scrape(self._scrape_ids, html, url)
     if result and result.get('records'):
         self._clean_ids(result['records'])
         if len(result['records']) >= 1:
             fields = {}
             fields.update(self._search_fields)
             scrapeutils.setup_form(self.br, self._ref_search_form, fields)
             #self.logger.debug("Uid form: %s", str(self.br.form))
             response = scrapeutils.submit_form(self.br, self._result_submit)
             return self._get_html(response)
         return None, None
     else:
         return html, url
Example #27
    def get_id_batch(self, date_from, date_to):

        final_result = []
        #response = self.br.open(self._search_url)
        #self.logger.debug("Start html: %s" % response.read())

        # fix buggy option list
        #html = response.get_data()
        #html = html.replace('<option value="7">8</option>', '<option value="7">7</option> <option value="8">8</option>')
        #response.set_data(html)
        #self.br.set_response(response)

        fields = {}
        fields.update(self._search_fields)
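        # builds dates like 1/January/2015: the X-replace trick strips any leading zero from the day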
        dfrom = date_from.strftime('X%d/%B/%Y').replace('X0',
                                                        'X').replace('X', '')
        date_parts = dfrom.split('/')
        fields[self._date_from_field['day']] = date_parts[0]
        fields[self._date_from_field['month']] = date_parts[1]
        fields[self._date_from_field['year']] = date_parts[2]
        dto = date_to.strftime('X%d/%B/%Y').replace('X0', 'X').replace('X', '')
        date_parts = dto.split('/')
        fields[self._date_to_field['day']] = date_parts[0]
        fields[self._date_to_field['month']] = date_parts[1]
        fields[self._date_to_field['year']] = date_parts[2]
        #scrapeutils.setup_form(self.br, self._search_form, fields)
        #self.logger.debug("ID batch form: %s", str(self.br.form))
        #response = scrapeutils.submit_form(self.br)

        url = self._results_url + '?' + urllib.urlencode(fields)
        #self.logger.debug("Result url: %s" % url)
        response = self.br.open(url)

        page_count = 0
        max_pages = (2 * self.min_id_goal /
                     10) + 20  # guard against infinite loop
        while response and page_count < max_pages:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("Batch html: %s" % html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            try:
                response = self.br.follow_link(text=self._link_next)
            except:  # normal failure to find next page form at end of page sequence here
                self.logger.debug("No next link after %d pages", page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result
Example #28
    def get_html_from_uid(self, uid):
        response = self.br.open(self._search_url)
        if self._first_search:  # the search facility launch page (with a button) appears only on the first opening of this url
            scrapeutils.setup_form(self.br)
            response = scrapeutils.submit_form(self.br)
            self._first_search = False
        #self.logger.debug("Start html: %s", response.read())

        fields = {}
        fields.update(self._ref_search)
        fields.update(self._ref_page)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        #self.logger.debug("Choose search form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)

        fields = {}
        fields.update(self._ref_page)
        fields[self._appno_field] = uid
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("Appno form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br, self._submit_control)
        html, url = self._get_html(response)
        #self.logger.debug("Result html: %s", html)
        result = scrapemark.scrape(self._scrape_ids_ref, html, url)
        if result and result.get('records'):
            self._clean_ids(result['records'])
            for r in result['records']:
                if r.get('uid', '') == uid and r.get('control'):
                    self.logger.debug("Scraped control: %s", r['control'])
                    fields = {r['control']: uid}
                    scrapeutils.setup_form(self.br, self._search_form, fields)
                    #self.logger.debug("Detail form: %s", str(self.br.form))
                    response = scrapeutils.submit_form(self.br)
                    return self._get_html(response)
        return None, None
Example #29
    def get_id_batch(self, date_from, date_to):

        final_result = []
        response = self.br.open(self._search_url)
        #self.logger.debug("Start html: %s", response.read())

        fields = {}
        fields.update(self._search_fields)
        fields[self._date_from_field] = date_from.strftime(
            self._request_date_format)
        fields[self._date_to_field] = date_to.strftime(
            self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)
        if response:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("ID batch page html: %s", html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                self._clean_ids(result['records'])
                final_result.extend(result['records'])

        return final_result
Example #30
    def get_id_batch(self, date_from, date_to):

        final_result = []

        fields = {}
        fields.update(self._search_fields)
        date_from = date_from.strftime(self._request_date_format)
        date_parts = date_from.split('/')
        fields[self._date_from_field['day']] = date_parts[0]
        fields[self._date_from_field['month']] = date_parts[1]
        fields[self._date_from_field['year']] = date_parts[2]
        date_to = date_to.strftime(self._request_date_format)
        date_parts = date_to.split('/')
        fields[self._date_to_field['day']] = date_parts[0]
        fields[self._date_to_field['month']] = date_parts[1]
        fields[self._date_to_field['year']] = date_parts[2]
        self.logger.debug("Fields: %s", str(fields))
        query = urllib.urlencode(fields)
        url = self._result_url + '?' + query
        response = self.br.open(url)

        if response:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("Batch html: %s" % html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                self._clean_ids(result['records'])
                final_result.extend(result['records'])

        return final_result