 def get_html_from_uid (self, uid):
     if self._start_url:
         response = self.br.open(self._start_url)
     response = self.br.open(self._search_url)
     #self.logger.debug("search html: %s", response.read())
     #self.logger.debug(scrapeutils.list_forms(self.br))
     fields = {}
     fields.update(self._search_fields)
     if uid.isdigit():
         fields[self._ref_field] = uid
     else:
         fields[self._alt_ref_field] = uid
     scrapeutils.setup_form(self.br, self._ref_search_form, fields)
     #self.logger.debug("Uid form: %s", str(self.br.form))
     response = scrapeutils.submit_form(self.br, self._ref_search_submit)
     html, url = self._get_html(response)
     # note return here can be a single uid match page OR list of multiple matches
     result = scrapemark.scrape(self._scrape_ids, html, url)
     if result and result.get('records'):
         self._clean_ids(result['records'])
         if len(result['records']) >= 1:
             fields = {}
             fields.update(self._search_fields)
             scrapeutils.setup_form(self.br, self._ref_search_form, fields)
             #self.logger.debug("Uid form: %s", str(self.br.form))
             response = scrapeutils.submit_form(self.br, self._result_submit)
             return self._get_html(response)
         return None, None
     else:
         return html, url
    def get_html_from_uid(self, uid):
        
        response = self.br.open(self._search_url)
        self._adjust_response(response)
        #self.logger.debug("Start page html: %s", response.read())
        
        # get first brief application page
        fields = {}
        fields.update(self._search_fields_applic)
        fields ['ctl00$sideBar$sdcAppSearch$ddlCaseType'] = uid[0:2]
        fields ['ctl00$sideBar$sdcAppSearch$ddlCaseYear'] = uid[3:5]
        fields ['ctl00$sideBar$sdcAppSearch$txtCaseNo'] = uid[6:11]
        scrapeutils.setup_form(self.br, self._search_form, fields)
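        # disabling the submit/image buttons keeps their values out of the
        # postback data (mechanize omits disabled controls when submitting)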
        for control in self.br.form.controls:
            if control.type == "submit" or control.type == "image":
                control.disabled = True
        self.logger.debug("First applic form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)
        self._adjust_response(response)
        #self.logger.debug("First page: %s", response.read())

        # get second detailed application page
        scrapeutils.setup_form(self.br, self._search_form, self._detail_fields)
        for control in self.br.form.controls:
            if control.type == "submit" or control.type == "image":
                control.disabled = True
        self.logger.debug("Detail applic form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)
        self._adjust_response(response)
        html, url = self._get_html(response)
        #self.logger.debug("Detail page: %s", html)
        return html, url
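
A worked illustration of the uid slicing above; the 'TT/YY/NNNNN' reference shape is an assumption inferred from the slice positions, not something stated in the example.

# Assumed reference shape (inferred from the slices above), e.g. 'FP/14/01234':
uid = 'FP/14/01234'
case_type = uid[0:2]    # 'FP'    -> ddlCaseType
case_year = uid[3:5]    # '14'    -> ddlCaseYear
case_no = uid[6:11]     # '01234' -> txtCaseNo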
Example #3
    def get_html_from_uid(self, uid):
        response = self.br.open(self._search_url)
        if self._first_search:  # the search facility launch page (with its button) appears only on the first opening of this url
            scrapeutils.setup_form(self.br)
            response = scrapeutils.submit_form(self.br)
            self._first_search = False
        #self.logger.debug("Start html: %s", response.read())

        fields = {}
        fields.update(self._ref_search)
        fields.update(self._ref_page)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        #self.logger.debug("Choose search form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)

        fields = {}
        fields.update(self._ref_page)
        fields[self._appno_field] = uid
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("Appno form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br, self._submit_control)
        html, url = self._get_html(response)
        #self.logger.debug("Result html: %s", html)
        result = scrapemark.scrape(self._scrape_ids_ref, html, url)
        if result and result.get('records'):
            self._clean_ids(result['records'])
            for r in result['records']:
                if r.get('uid', '') == uid and r.get('control'):
                    self.logger.debug("Scraped control: %s", r['control'])
                    fields = {r['control']: uid}
                    scrapeutils.setup_form(self.br, self._search_form, fields)
                    #self.logger.debug("Detail form: %s", str(self.br.form))
                    response = scrapeutils.submit_form(self.br)
                    return self._get_html(response)
        return None, None
Example #4
    def get_id_batch (self, date_from, date_to):

        this_dt = date_from
        final_result = []
        
        while this_dt <= date_to:
            
            response = self.br.open(self._search_url)
            #self.logger.debug("Start html: %s", response.read())
    
            fields = {}
            fields[self._date_from_field] = this_dt.strftime(self._request_date_format)
            fields[self._date_to_field] = this_dt.strftime(self._request_date_format)
            scrapeutils.setup_form(self.br, self._search_form, fields)
            #self.logger.debug("ID batch form: %s", str(self.br.form))
            response = scrapeutils.submit_form(self.br)
            
            html = response.read()
            #self.logger.debug("ID batch page html: %s", html)
            try:
                result = scrapemark.scrape(self._scrape_max_recs, html)
                max_recs = int(result['max_recs'])
            except:
                max_recs = 0
            
            interim_result = []
            page_count = 0
            max_pages = (2 * self.min_id_goal / 10) + 20 # guard against infinite loop
            while response and len(interim_result) < max_recs and page_count < max_pages:
                url = response.geturl()
                #self.logger.debug("ID batch page html: %s", html)
                result = scrapemark.scrape(self._scrape_ids, html, url)
                if result and result.get('records'):
                    page_count += 1
                    for rec in result['records']:
                        rec[self._date_type] = fields[self._date_to_field]
                    self._clean_ids(result['records'])
                    interim_result.extend(result['records'])
                else:
                    self.logger.debug("Empty result after %d pages", page_count)
                    break
                if len(interim_result) >= max_recs: break
                try:
                    result = scrapemark.scrape(self._scrape_next_submit, html)
                    next_submit = result['next_submit']
                    scrapeutils.setup_form(self.br, self._search_form)
                    self.logger.debug("ID next form: %s", str(self.br.form))
                    response = scrapeutils.submit_form(self.br, next_submit)
                    html = response.read()
                except: # failure to find next page link at end of page sequence here
                    self.logger.debug("No next form link after %d pages", page_count)
                    break
                    
            if page_count >= max_pages:
                self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
            
            final_result.extend(interim_result)
            this_dt += timedelta(days=1)
            
        return final_result
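
The max_pages expression above is a coarse cap on the paging loop rather than a value read from the site; a minimal sketch of the arithmetic, assuming min_id_goal is the per-batch record target:

def max_pages_for(min_id_goal):
    # Sketch of the runaway-loop guard used above; // keeps the result an
    # integer on Python 3 as well (the surrounding examples appear to be
    # Python 2, where plain / on ints already truncates).
    return (2 * min_id_goal // 10) + 20

# e.g. a per-batch goal of 100 records caps the loop at (2*100//10) + 20 = 40 pages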
Example #5
    def get_id_batch(self, date_from, date_to):

        final_result = []
        response = self.br.open(self._search_url)
        #self.logger.debug("Start html: %s" % response.read())

        fields = {}
        fields[self._date_from_field] = date_from.strftime(
            self._request_date_format)
        fields[self._date_to_field] = date_to.strftime(
            self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)

        html = response.read()
        #self.logger.debug("ID batch page html: %s", html)
        try:
            result = scrapemark.scrape(self._scrape_max_recs, html)
            max_recs = int(result['max_recs'])
        except:
            max_recs = 0  # note max recs is in the footer which is omitted if only one page of results

        page_count = 0
        max_pages = (2 * self.min_id_goal /
                     10) + 20  # guard against infinite loop
        while response and page_count < max_pages:
            url = response.geturl()
            #self.logger.debug("Batch html: %s" % html)
            if max_recs == 0:  # one page, no footer
                result = scrapemark.scrape(self._scrape_ids_no_foot, html, url)
            else:
                result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            if max_recs == 0 or len(final_result) >= max_recs: break
            try:
                result = scrapemark.scrape(self._scrape_next_submit, html)
                scrapeutils.setup_form(self.br, self._search_form)
                #self.logger.debug("Next page form: %s", str(self.br.form))
                response = scrapeutils.submit_form(self.br,
                                                   result['next_submit'])
                html = response.read()
            except:  # normal failure to find next page form at end of page sequence here
                self.logger.debug("No next form after %d pages", page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result
Example #6
    def get_id_batch(self, date_from, date_to):

        final_result = []
        response = self.br.open(self._search_url + '?advanced_search=true')
        scrapeutils.setup_form(self.br, self._search_form)
        #response = scrapeutils.submit_form(self.br, self._advanced_submit)
        #self.logger.debug("Start html: %s" % response.read())

        fields = {}
        fields[self._date_from_field] = date_from.strftime(
            self._request_date_format)
        fields[self._date_to_field] = date_to.strftime(
            self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br, self._search_submit)

        #html = response.read()
        #self.logger.debug("ID batch page html: %s", html)
        #try:
        #    result = scrapemark.scrape(self._scrape_max_recs, html)
        #    max_recs = int(result['max_recs'])
        #except:
        #    max_recs = 0

        page_count = 0
        max_pages = (2 * self.min_id_goal /
                     10) + 20  # guard against infinite loop
        #while response and len(final_result) < max_recs and page_count < max_pages:
        while response and page_count < max_pages:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("Batch html: %s" % html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            #if len(final_result) >= max_recs: break
            try:
                scrapeutils.setup_form(self.br, self._search_form,
                                       self._next_fields)
                #self.logger.debug("Next page form: %s", str(self.br.form))
                response = scrapeutils.submit_form(self.br)
            except:  # normal failure to find next page form at end of page sequence here
                self.logger.debug("No next form after %d pages", page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result
Example #7
    def get_id_batch(self, date_from, date_to):

        final_result = []
        response = self.br.open(self._search_url)
        self._adjust_response(response)
        #self.logger.debug("First page html: %s", response.read())

        self.logger.debug(scrapeutils.list_forms(self.br))

        fields = {}
        fields[self._date_from_field] = date_from.strftime(
            self._request_date_format)
        fields[self._date_to_field] = date_to.strftime(
            self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        action = self.br.form.action
        self.br.form.action = action.replace('https://', 'http://')
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)

        page_count = 0
        max_pages = (2 * self.min_id_goal /
                     10) + 20  # guard against infinite loop
        while response and page_count < max_pages:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("ID batch page html: %s", html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            try:
                if 'next_link' in self._scrape_next:
                    result = scrapemark.scrape(self._scrape_next, html, url)
                    response = self.br.open(result['next_link'])
                else:
                    scrapeutils.setup_form(self.br, self._scrape_next)
                    action = self.br.form.action
                    self.br.form.action = action.replace('https://', 'http://')
                    self.logger.debug("ID next form: %s", str(self.br.form))
                    response = scrapeutils.submit_form(self.br)
            except:  # normal failure to find next page form at end of page sequence here
                self.logger.debug("No next form/link after %d pages",
                                  page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result
Example #8
    def get_id_batch(self, date_from, date_to):

        final_result = []
        response = self.br.open(self._search_url)
        #self.logger.debug("ID batch start html: %s", response.read())

        fields = {}
        fields.update(self._search_fields)
        fields['p_object_name'] = self._form_name + self._form_object_suffix
        fields[self._form_name +
               self._date_from_field_suffix] = date_from.strftime(
                   self._request_date_format)
        fields[self._form_name +
               self._date_to_field_suffix] = date_to.strftime(
                   self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("ID batch form: %s" % str(self.br.form))
        response = scrapeutils.submit_form(self.br)

        page_count = 0
        max_pages = (2 * self.min_id_goal /
                     10) + 20  # guard against infinite loop
        while response and page_count < max_pages:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("Batch html: %s" % html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            try:
                scrapeutils.setup_form(self.br, self._next_form,
                                       self._next_page_fields)
                self.logger.debug("Next form: %s" % str(self.br.form))
                response = scrapeutils.submit_form(self.br)
            except:  # normal failure to find next page form at end of page sequence here
                self.logger.debug("No next form after %d pages", page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result
Example #9
 def max_sequence(self):
     max_recs = None
     response = self.br.open(self._search_url)
     to_date = date.today() - timedelta(days=14)
     fields = {
         self._ref_field: '',
         self._date_field: to_date.strftime(self._request_date_format)
     }
     scrapeutils.setup_form(self.br, self._search_form, fields)
     response = scrapeutils.submit_form(self.br)
     html, url = self._get_html(response)
     result = scrapemark.scrape(self._scrape_ids, html, url)
     if result and result.get('records'):
         self._clean_ids(result['records'])
         num_recs = 0
         for i in result['records']:
             try:
                 num = int(i['uid'])
                 if num > num_recs:
                     num_recs = num
             except:
                 pass
         self.logger.debug('Number of records %d' % num_recs)
         if num_recs > 0:
             max_recs = num_recs
     return max_recs
Example #10
 def get_html_from_uid (self, uid):
     if self._uid_match.match(uid): # all numbers or /
         #fields = {  self._uid_field: uid }
         url = self._applic_url + urllib.quote_plus(uid)
         return self.get_html_from_url(url)
     else:
         fields = {  self._ref_field: uid }
     response = self.br.open(self._search_url)
     #self.logger.debug("ID detail start html: %s", response.read())
     scrapeutils.setup_form(self.br, self._search_form, fields)
     response = scrapeutils.submit_form(self.br)
     html, url = self._get_html(response)
     result = scrapemark.scrape(self._scrape_ids, html, url)
     if result and result.get('records'):
         self._clean_ids(result['records']) 
         if len(result['records']) == 1 and result['records'][0].get('url'):
             url = result['records'][0]['url']
             self.logger.debug("Scraped url: %s", url)
             return self.get_html_from_url(url)
         else:
             for r in result['records']:
                 if r.get('uid', '') == uid and r.get('url'):
                     url = r['url']
                     self.logger.debug("Scraped url: %s", url)
                     return self.get_html_from_url(url)
     return None, None
Example #11
    def get_id_batch(self, date_from, date_to):

        final_result = []
        response = self.br.open(self._search_url)
        #self.logger.debug("Start html: %s", response.read())

        fields = {}
        fields.update(self._search_fields)
        dfrom = date_from.strftime('X%d/%b/%Y').replace('X0',
                                                        'X').replace('X', '')
        date_parts = dfrom.split('/')
        fields[self._date_from_field['day']] = [date_parts[0]]
        fields[self._date_from_field['month']] = [date_parts[1]]
        fields[self._date_from_field['year']] = [date_parts[2]]
        dto = date_to.strftime('X%d/%b/%Y').replace('X0', 'X').replace('X', '')
        date_parts = dto.split('/')
        fields[self._date_to_field['day']] = [date_parts[0]]
        fields[self._date_to_field['month']] = [date_parts[1]]
        fields[self._date_to_field['year']] = [date_parts[2]]
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br, self._search_submit)

        if response:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("Batch html: %s" % html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                self._clean_ids(result['records'])
                final_result.extend(result['records'])

        return final_result
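
The 'X' prefix dance in the date formatting above strips the leading zero from the day (strftime has no portable no-padding flag); a standalone illustration:

from datetime import date

d = date(2014, 3, 5)
s = d.strftime('X%d/%b/%Y').replace('X0', 'X').replace('X', '')
# 'X05/Mar/2014' -> 'X5/Mar/2014' -> '5/Mar/2014'
# for a two-digit day the middle replace matches nothing: 'X25/...' -> '25/...'
print(s)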
Example #12
    def get_id_batch(self, date_from, date_to):

        final_result = []

        response = self.br.open(self._search_url)

        fields = {}
        fields.update(self._search_fields)
        date_from = date_from.strftime(self._request_date_format)
        date_parts = date_from.split('/')
        #fields[self._date_from_field['day']] = [ date_parts[0] ]
        #fields[self._date_from_field['month']] = [ date_parts[1] ]
        #fields[self._date_from_field['year']] = [ date_parts[2] ]
        fields[self._date_from_field['day']] = date_parts[0]
        fields[self._date_from_field['month']] = date_parts[1]
        fields[self._date_from_field['year']] = date_parts[2]
        date_to = date_to.strftime(self._request_date_format)
        date_parts = date_to.split('/')
        #fields[self._date_to_field['day']] = [ date_parts[0] ]
        #fields[self._date_to_field['month']] = [ date_parts[1] ]
        #fields[self._date_to_field['year']] = [ date_parts[2] ]
        fields[self._date_to_field['day']] = date_parts[0]
        fields[self._date_to_field['month']] = date_parts[1]
        fields[self._date_to_field['year']] = date_parts[2]
        #scrapeutils.setup_form(self.br, self._search_form, fields)
        #self.logger.debug("ID batch form: %s", str(self.br.form))
        #response = scrapeutils.submit_form(self.br, self._search_submit)

        response = self.br.open(self._search_url, urllib.urlencode(fields))

        page_count = 0
        max_pages = (2 * self.min_id_goal /
                     10) + 20  # guard against infinite loop
        while response and page_count < max_pages:
            url = response.geturl()
            html = response.read()
            self.logger.debug("Batch html: %s" % html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            try:
                scrapeutils.setup_form(self.br, self._next_form,
                                       self._next_fields)
                self.logger.debug("ID next form: %s", str(self.br.form))
                response = scrapeutils.submit_form(self.br)
            except:  # normal failure to find next page form at end of page sequence here
                self.logger.debug("No next form after %d pages", page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result
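
This example bypasses setup_form/submit_form and posts the encoded fields directly: giving mechanize's Browser.open() a data argument makes it issue a POST, urllib2-style. A minimal sketch, with the url and field names purely illustrative:

import urllib
import mechanize

def direct_search(search_url, fields):
    # Sketch only: passing urlencoded data makes Browser.open() issue a POST;
    # append '?' + data to the url instead for the equivalent GET request.
    br = mechanize.Browser()
    return br.open(search_url, urllib.urlencode(fields))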
Example #13
 def get_html_from_uid(
     self, uid
 ):  # note gives 500 error if do not handle Referer explicitly as below (also see S. Lanark)
     response = self.br.open(self._search_url)
     self._adjust_response(response)
     #self.logger.debug("ID detail start html: %s", response.read())
     fields = self._applic_fields
     fields[self._ref_field] = uid
     scrapeutils.setup_form(self.br, self._search_form, fields)
     response = scrapeutils.submit_form(self.br, self._search_submit)
     html, url = self._get_html(response)
     sub_html = self._BADCHARS_REGEX.sub(' ', html)
     #self.logger.debug("Detail page html: %s", sub_html)
     result = scrapemark.scrape(self._scrape_ids, sub_html, url)
     #print result
     if result and result.get('records'):
         self._clean_ids(result['records'])
         for r in result['records']:
             if r.get('uid', '') == uid and r.get('url'):
                 self.logger.debug("Scraped url: %s", r['url'])
                 headers = {}
                 headers.update(self._headers)
                 headers['Referer'] = url
                 self.br.addheaders = headers.items()
                 return self.get_html_from_url(r['url'])
     return None, None
Example #14
 def get_html_from_uid(self, uid):
     response = self.br.open(self._search_url)
     #self.logger.debug("ID detail start html: %s", response.read())
     fields = self._applic_fields
     fields[self._ref_field] = uid
     scrapeutils.setup_form(self.br, self._search_form, fields)
     response = scrapeutils.submit_form(self.br, self._search_submit)
     html = response.read()
     sub_html = self._BADCHARS_REGEX.sub(' ', html)
     #self.logger.debug("detail page html: %s", sub_html)
     expired = scrapemark.scrape(self._scrape_expired, sub_html)
     while expired:
         response = self.br.reload()
         html = response.read()
         sub_html = self._BADCHARS_REGEX.sub(' ', html)
         expired = scrapemark.scrape(self._scrape_expired, sub_html)
     url = response.geturl()
     result = scrapemark.scrape(self._scrape_ids, sub_html, url)
     if result and result.get('records'):
         self._clean_ids(result['records'])
         for r in result['records']:
             if r.get('uid', '') == uid and r.get('url'):
                 self.logger.debug("Scraped url: %s", r['url'])
                 return self.get_html_from_url(r['url'])
     return None, None
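
The reload-while-expired loop above has no upper bound, so a site that kept serving the expired page would spin forever. A bounded standalone sketch (the scrape pattern and bad-character regex are assumed to be the same objects the example uses):

import scrapemark

def read_past_expired(br, badchars_regex, scrape_expired, max_reloads=5):
    # Bounded sketch of the reload loop above: give up after a few reloads
    # instead of looping indefinitely on a persistent "expired" page.
    sub_html = badchars_regex.sub(' ', br.response().read())
    for _ in range(max_reloads):
        if not scrapemark.scrape(scrape_expired, sub_html):
            break
        sub_html = badchars_regex.sub(' ', br.reload().read())
    return sub_html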
    def get_id_batch(self, date_from, date_to):

        final_result = []
        response = self.br.open(self._search_url)
        #self.logger.debug("Start html: %s", response.read())

        fields = {}
        fields.update(self._search_fields)
        fields[self._date_from_field] = date_from.strftime(
            self._request_date_format)
        fields[self._date_to_field] = date_to.strftime(
            self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)
        if response:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("ID batch page html: %s", html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                self._clean_ids(result['records'])
                final_result.extend(result['records'])

        return final_result
Example #16
    def get_id_batch(self, date_from, date_to):

        final_result = []

        for case in self._case_prefixes:

            interim_result = []
            response = self.br.open(self._search_url)
            #self.logger.debug("Start html: %s", response.read())

            fields = {self._ref_field: case}
            fields[self._date_from_field] = date_from.strftime(
                self._request_date_format)
            fields[self._date_to_field] = date_to.strftime(
                self._request_date_format)
            scrapeutils.setup_form(self.br, self._search_form, fields)
            self.logger.debug("ID batch form: %s", str(self.br.form))
            response = scrapeutils.submit_form(self.br)

            page_count = 0
            max_pages = (2 * self.min_id_goal /
                         10) + 20  # guard against infinite loop
            while response and page_count < max_pages:
                html = response.read()
                url = response.geturl()
                #self.logger.debug("ID batch page html: %s", html)
                result = scrapemark.scrape(self._scrape_ids, html, url)
                if result and result.get('records'):
                    page_count += 1
                    self._clean_ids(result['records'])
                    interim_result.extend(result['records'])
                elif not interim_result:  # is it a single record?
                    single_result = scrapemark.scrape(self._scrape_one_id,
                                                      html, url)
                    if single_result:
                        self._clean_record(single_result)
                        interim_result = [single_result]
                        break
                else:
                    self.logger.debug("Empty result after %d pages",
                                      page_count)
                    break
                try:
                    result = scrapemark.scrape(self._scrape_next_link, html,
                                               url)
                    response = self.br.open(result['next_link'])
                except:
                    self.logger.debug("No next link after %d pages",
                                      page_count)
                    break

            if page_count >= max_pages:
                self.logger.warning(
                    "Too many page requests - %d - probable run away loop" %
                    page_count)

            final_result.extend(interim_result)

        return final_result
Example #17
    def get_id_batch(self, date_from, date_to):

        final_result = []
        response = self.br.open(self._search_url)
        #self.logger.debug("Start html: %s", response.read())

        fields = {}
        fields.update(self._search_fields)
        fields[self._date_from_field] = date_from.strftime(
            self._request_date_format)
        fields[self._date_to_field] = date_to.strftime(
            self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        for control in self.br.form.controls:
            if control.name == "dateaprecv_date:FROM:DATE":
                control.disabled = True
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)
        """html = response.read()
        self.logger.debug("ID batch page html: %s", html)
        result = scrapemark.scrape(self._scrape_max_pages, html)
        try:
            page_list = result['max_pages'].split()
            max_pages = len(page_list)
        except:
            max_pages = 1"""

        max_pages = (2 * self.min_id_goal /
                     10) + 20  # guard against infinite loop
        page_count = 0
        while response and page_count < max_pages:
            html = response.read()
            #self.logger.debug("ID batch page html: %s", html)
            url = response.geturl()
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            if page_count >= max_pages: break
            try:
                next_url = re.sub(r'pageno=\d*&',
                                  'pageno=' + str(page_count + 1) + '&', url)
                self.logger.debug("ID next url: %s", next_url)
                response = self.br.open(next_url)
                #html = response.read()
            except:  # normal failure to find next page form at end of page sequence here
                self.logger.debug("No next url after %d pages", page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result
Example #18
 def get_html_from_uid(self, uid):
     response = self.br.open(self._search_url)
     #self.logger.debug("ID detail start html: %s", response.read())
     fields = {self._ref_field: uid}
     scrapeutils.setup_form(self.br, self._search_form, fields)
     #self.logger.debug("Get UID form: %s", str(self.br.form))
     response = scrapeutils.submit_form(self.br, self._search_submit)
     return self._get_html(response)
    def get_id_batch(self, date_from, date_to):

        final_result = []
        response = self.br.open(self._search_url)
        #self.logger.debug("Start html: %s", response.read())

        fields = {}
        fields.update(self._search_fields)
        fields[self._date_from_field] = date_from.strftime(
            self._request_date_format)
        fields[self._date_to_field] = date_to.strftime(
            self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br, self._search_submit)

        html = response.read()
        #self.logger.debug("ID batch page html: %s", html)
        try:
            result = scrapemark.scrape(self._scrape_max_recs, html)
            max_recs = int(result['max_recs'])
        except:
            max_recs = 0

        page_count = 0
        max_pages = (2 * self.min_id_goal /
                     10) + 20  # guard against infinite loop
        id_list = []
        while response and len(
                final_result) < max_recs and page_count < max_pages:
            url = response.geturl()
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                # add IDs one by one and test for duplicates
                for r in result['records']:
                    if r['uid'] not in id_list:
                        final_result.append(r)
                        id_list.append(r['uid'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            if len(final_result) >= max_recs: break
            try:
                response = self.br.follow_link(text=self._next_link)
                html = response.read()
                #self.logger.debug("ID next page html: %s", html)
            except:  # normal failure to find next page form at end of page sequence here
                self.logger.debug("No next link after %d pages", page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result
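
The duplicate check above rescans id_list for every record; a set gives the same first-seen-wins behaviour with constant-time membership tests. A small sketch:

def dedupe_by_uid(records):
    # Equivalent to the id_list check above, but with O(1) membership tests;
    # order of first appearance is preserved.
    seen, unique = set(), []
    for r in records:
        if r['uid'] not in seen:
            seen.add(r['uid'])
            unique.append(r)
    return unique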
Example #20
 def get_html_from_uid(self, uid):
     response = self.br.open(self._disclaimer_url)
     scrapeutils.setup_form(self.br, self._search_form)
     #self.logger.debug("Disclaimer form: %s", str(self.br.form))
     response = scrapeutils.submit_form(self.br)
     url = urlparse.urljoin(
         self._search_url,
         self._detail_page) + '?AppNo=' + urllib.quote_plus(uid)
     return self.get_html_from_url(url)
Example #21
 def get_html_from_uid (self, uid):
     response = self.br.open(self._search_url)
     scrapeutils.setup_form(self.br, self._search_form)
     self.logger.debug("Start form: %s", str(self.br.form))
     response = scrapeutils.submit_form(self.br)
     #self.logger.debug("ID detail start html: %s", response.read())
     fields = { self._ref_field: uid }
     scrapeutils.setup_form(self.br, self._search_form, fields)
     self.logger.debug("Get UID form: %s", str(self.br.form))
     response = scrapeutils.submit_form(self.br, self._ref_submit)
     html, url = self._get_html(response)
     result = scrapemark.scrape(self._scrape_ids, html, url)
     if result and result.get('records'):
         self._clean_ids(result['records'])
         for rr in result['records']:
             if rr.get('uid', '') == uid and rr.get('url'):
                 return self.get_html_from_url(rr['url'])
     return None, None 
Example #22
    def get_id_batch(self, date_from, date_to):

        final_result = []
        new_date_from = date_from - timedelta(
            days=1)  # start date is exclusive, decrement start date by one day
        date_to = date_to + timedelta(
            days=1)  # end date is exclusive, increment end date by one day
        response = self.br.open(self._search_url)
        #self.logger.debug("Start html: %s", response.read())

        fields = {}
        fields[self._date_from_field] = new_date_from.strftime(
            self._request_date_format)
        fields[self._date_to_field] = date_to.strftime(
            self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br, self._search_submit)

        html = response.read()
        #self.logger.debug("ID batch page html: %s", html)
        try:
            result = scrapemark.scrape(self._scrape_max_recs, html)
            max_recs = int(result['max_recs'])
        except:
            max_recs = 0

        page_count = 0
        max_pages = (2 * self.min_id_goal /
                     10) + 20  # guard against infinite loop
        while response and len(
                final_result) < max_recs and page_count < max_pages:
            url = response.geturl()
            #self.logger.debug("Batch html: %s" % html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            if len(final_result) >= max_recs: break
            try:
                result = scrapemark.scrape(self._scrape_next, html, url)
                response = self.br.open(result['next_link'])
                html = response.read()
            except:  # normal failure to find next page form at end of page sequence here
                self.logger.debug("No next link after %d pages", page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result
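
Both search dates are treated as exclusive here, so the request window is widened by one day at each end; a worked illustration of the adjustment:

from datetime import date, timedelta

date_from, date_to = date(2014, 3, 10), date(2014, 3, 12)
request_from = date_from - timedelta(days=1)  # 2014-03-09 (exclusive lower bound)
request_to = date_to + timedelta(days=1)      # 2014-03-13 (exclusive upper bound)
# the site then returns applications dated 2014-03-10 .. 2014-03-12 inclusive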
Example #23
 def _get_exact_html_from_uid(self, uid):
     if self._start_url:
         response = self.br.open(self._start_url)
     response = self.br.open(self._search_url)
     #self.logger.debug("ID detail start html: %s", response.read())
     self.logger.debug(scrapeutils.list_forms(self.br))
     fields = {}
     fields.update(self._search_fields)
     fields[self._ref_field] = uid
     if self._ref_form:
         scrapeutils.setup_form(self.br, self._ref_form, fields)
     else:
         scrapeutils.setup_form(self.br, self._search_form, fields)
     self.logger.debug("Uid form: %s", str(self.br.form))
     if self._ref_submit:
         response = scrapeutils.submit_form(self.br, self._ref_submit)
     else:
         response = scrapeutils.submit_form(self.br, self._search_submit)
     return self._get_html(response)
Example #24
    def get_id_batch (self, date_from, date_to):

        final_result = []
        response = self.br.open(self._search_url)
        
        fields = {}
        fields.update(self._search_fields)
        dfrom = date_from.strftime('X%d/%B/%Y').replace('X0','X').replace('X','')
        date_parts = dfrom.split('/')
        fields[self._date_from_field['day']] = [ date_parts[0] ]
        fields[self._date_from_field['month']] = [ date_parts[1] ]
        fields[self._date_from_field['year']] = [ date_parts[2] ]
        dto = date_to.strftime('X%d/%B/%Y').replace('X0','X').replace('X','')
        date_parts = dto.split('/')
        fields[self._date_to_field['day']] = [ date_parts[0] ]
        fields[self._date_to_field['month']] = [ date_parts[1] ]
        fields[self._date_to_field['year']] = [ date_parts[2] ]
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)
        
        #html = response.read()
        #self.logger.debug("ID batch page html: %s", html)
        #try:
        #    result = scrapemark.scrape(self._scrape_max_recs, html)
        #    max_recs = int(result['max_recs'])
        #except:
        #    max_recs = 0
        
        page_count = 0
        max_pages = (2 * self.min_id_goal / 10) + 20 # guard against infinite loop
        #while response and len(final_result) < max_recs and page_count < max_pages:
        while response and page_count < max_pages:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("Batch html: %s" % html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            #if len(final_result) >= max_recs: break
            try:
                response = self.br.follow_link(text=self._link_next)
            except: # normal failure to find next page form at end of page sequence here
                self.logger.debug("No next link after %d pages", page_count)
                break
        
        if page_count >= max_pages:
            self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
            
        return final_result
Example #25
    def get_id_batch(self, date_from, date_to):

        new_date_to = date_to + timedelta(
            days=1)  # increment end date by one day
        final_result = []
        response = self.br.open(self._search_url)
        #self.logger.debug("ID batch start html: %s", response.read())

        fields = {}
        fields[self._date_from_field] = date_from.strftime(
            self._request_date_format)
        fields[self._date_to_field] = new_date_to.strftime(
            self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)

        page_count = 0
        max_pages = (2 * self.min_id_goal /
                     10) + 20  # guard against infinite loop
        while response and page_count < max_pages:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("ID batch page html: %s", html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                for res in result['records']:
                    if res.get('uid'):  # one uid on 1 dec 2015 is empty
                        final_result.append(res)
            elif not final_result:  # is it a single record?
                single_result = scrapemark.scrape(self._scrape_one_id, html,
                                                  url)
                if single_result:
                    self._clean_record(single_result)
                    final_result = [single_result]
                    break
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            try:
                result = scrapemark.scrape(self._scrape_next_link, html, url)
                response = self.br.open(result['next_link'])
            except:
                self.logger.debug("No next link after %d pages", page_count)
                break

        if page_count >= max_pages:
            self.logger.warning(
                "Too many page requests - %d - probable run away loop" %
                page_count)

        return final_result
    def get_id_batch(self, date_from, date_to):  # note end date is exclusive

        final_result = []
        response = self.br.open(self._search_url)
        self._adjust_response(response)
        #self.logger.debug("ID batch start html: %s", response.read())

        new_date_to = date_to + timedelta(
            days=1)  # end date is exclusive, increment end date by one day
        fields = {}
        fields.update(self._search_fields)
        fields[self._date_from_field] = date_from.strftime(
            self._request_date_format)
        fields[self._date_to_field] = new_date_to.strftime(
            self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)

        #try:
        #    result = scrapemark.scrape(self._scrape_max_recs, html)
        #    max_recs = int(result['max_recs'])
        #except:
        #    max_recs = 1

        #self.logger.debug("Max recs: %d", max_recs)
        page_count = 0
        #max_pages = (2 * self.min_id_goal / 10) + 20 # guard against infinite loop
        #while response and len(final_result) < max_recs and page_count < max_pages:
        if response:
            html = response.read()
            url = response.geturl()
            #self.logger.debug("ID batch page html: %s", html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                #page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            #else:
            #    self.logger.debug("Empty result after %d pages", page_count)
            #    break
            #if len(final_result) >= max_recs: break
            #try:
            #    result = scrapemark.scrape(self._scrape_next_link, html, url)
            #    response = self.br.open(result['next_link'])
            #    html = response.read()
            #except:
            #    self.logger.debug("No next link after %d pages", page_count)
            #    break

        #if page_count >= max_pages:
        #    self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)

        return final_result
Example #27
    def get_id_batch(self, date_from, date_to):

        final_result = []

        response = self.br.open(self._start_url)

        scrapeutils.setup_form(self.br, self._search_form, self._start_fields)
        self.logger.debug("ID start form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)

        #response = self.br.open(self._search_url)

        fields = {}
        fields[self._date_from_field] = date_from.strftime(
            self._request_date_format)
        fields[self._date_to_field] = date_to.strftime(
            self._request_date_format)
        scrapeutils.setup_form(self.br, self._search_form, fields)
        self.logger.debug("ID batch form: %s", str(self.br.form))
        response = scrapeutils.submit_form(self.br)

        #response = self.br.open(self._direct_url, urllib.urlencode(fields))

        if response:

            html = response.read()
            self.logger.debug("ID batch page html: %s", html)
            try:
                result = scrapemark.scrape(self._scrape_max_recs, html)
                max_recs = int(result['max_recs'])
            except:
                max_recs = 0

            url = response.geturl()
            #self.logger.debug("Batch html: %s" % html)
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                self._clean_ids(result['records'])
                final_result.extend(result['records'])

        return final_result
Example #28
 def _get_html_from_uid(self, uid):
     # note return here can be a single uid match page OR no match
     # however a list of multiple matches is an error
     response = self.br.open(self._search_url)
     #self.logger.debug("Start html: %s" % response.read())
     fields = {}
     fields.update(self._search_fields)
     fields[self._ref_field] = uid
     scrapeutils.setup_form(self.br, self._search_form, fields)
     self.logger.debug("ID ref form: %s", str(self.br.form))
     #response = scrapeutils.submit_form(self.br, self._ref_submit)
     response = scrapeutils.submit_form(self.br)
     return self._get_html(response)
    def get_id_batch (self, date_from, date_to):

        final_result = []
        
        fields = {}
        fields.update(self._search_fields)
        fields [self._date_from_field] = date_from.strftime(self._request_date_format)
        fields [self._date_to_field] = date_to.strftime(self._request_date_format)
        
        self.logger.debug("Fields: %s", str(fields))
        query = urllib.urlencode(fields)
        url = urlparse.urljoin(self._search_url, self._results_page) + '?' + query
        response = self.br.open(url)
        
        html = response.read()
        #self.logger.debug("ID batch page html: %s", html)
        try:
            result = scrapemark.scrape(self._scrape_max_recs, html)
            max_recs = int(result['max_recs'])
        except:
            max_recs = 0
        
        page_count = 0
        max_pages = (2 * self.min_id_goal / 10) + 20 # guard against infinite loop
        while response and len(final_result) < max_recs and page_count < max_pages:
            url = response.geturl()
            result = scrapemark.scrape(self._scrape_ids, html, url)
            if result and result.get('records'):
                page_count += 1
                self._clean_ids(result['records'])
                final_result.extend(result['records'])
            else:
                self.logger.debug("Empty result after %d pages", page_count)
                break
            if len(final_result) >= max_recs: break
            try:
                fields = { '__EVENTTARGET': self._next_target }
                fields['__EVENTARGUMENT'] = 'Page$' + str(page_count+1)
                scrapeutils.setup_form(self.br, self._search_form, fields)
                self.logger.debug("Next page form: %s", str(self.br.form))
                response = scrapeutils.submit_form(self.br)
                html = response.read()
                #self.logger.debug("ID next page html: %s", html)
            except: # normal failure to find next page link at end of page sequence here
                self.logger.debug("No next form after %d pages", page_count)
                break
        
        if page_count >= max_pages:
            self.logger.warning("Too many page requests - %d - probable run away loop" % page_count)
            
        return final_result
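
The next-page postback above follows the ASP.NET WebForms convention: the paging control id is posted in __EVENTTARGET and 'Page$<n>' in __EVENTARGUMENT. A minimal sketch of building those fields:

def next_page_fields(next_target, page_number):
    # Sketch of the hidden fields posted above to ask a WebForms grid for
    # another page; next_target is the grid control id (self._next_target above).
    return {
        '__EVENTTARGET': next_target,
        '__EVENTARGUMENT': 'Page$' + str(page_number),
    }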
Example #30
 def get_id_records (self, request_from, request_to, max_recs):
     if not request_from or not request_to or not max_recs:
         return [], None, None # if any parameter invalid - try again next time
     final_result = []
     from_rec = int(request_from)
     to_rec = int(request_to)
     num_recs = int(max_recs)
     if from_rec < 1:
         if to_rec < 1: # both too small
             return [], None, None
         from_rec = 1
     if to_rec > num_recs:
         if from_rec > num_recs: # both too large
             return [], None, None
         to_rec = num_recs
         
     response = self.br.open(self._disclaimer_url) 
     scrapeutils.setup_form(self.br, self._disclaimer_form)
     #self.logger.debug("Disclaimer form: %s", str(self.br.form))
     response = scrapeutils.submit_form(self.br)
     
     #page_zero = self._get_results_pages ('OpenForConsultation=True')
     #n_old = max_recs - len(page_zero)
     #print 'n_old', n_old
     #if current_result and from_rec > n_old:
     #    cfrom_rec = from_rec - n_old
     #    cto_rec = to_rec - n_old
     #    return current_result[cfrom_rec-1:cto_rec], from_rec, to_rec
     
     max_page, min_rec = self._find_max_pages(from_rec)
     
     #print 'mp', max_page, min_rec
     for d in self._districts:
         interim_result = self._get_results_pages ('District=' + d, max_page)
         if interim_result:
             #print d, len(interim_result)
             final_result.extend(interim_result)
         else:
             #print 'Empty'
             return [], None, None # list scraper - so individual empty result is also invalid
     
     if final_result:
         #print 'x', len(final_result)
         fret = sorted(final_result, key=lambda k: (k['pageno'], k['recno'], k['uid']), reverse=True)
         #self.logger.debug("From: %d To: %d" % (from_rec, to_rec))
         new_fret = fret[from_rec-min_rec:to_rec-min_rec+1]
         for f in new_fret:
             del f['pageno']; del f['recno']
         return new_fret, from_rec, to_rec
     else:
         return [], None, None # list scraper - so empty result is always invalid