def download_oneday(self, relpath, dateobj):
    """Download the gazettes listed in the search results for one date.

    The search URL is built from ``self.searchurl`` with the same date
    string used for both placeholders.  Each parsed result that carries a
    ``'download'`` key is passed to ``self.download_gazette``.

    Args:
        relpath: relative path handed through to ``download_gazette``.
        dateobj: date to search for.

    Returns:
        List of the truthy values returned by ``download_gazette``; empty
        when the search page could not be downloaded or parsed.
    """
    dls = []
    datestr = utils.dateobj_to_str(dateobj, '-')
    searchurl = self.searchurl % (datestr, datestr)

    response = self.download_url(searchurl)
    if not response or not response.webpage:
        # Logger.warn is a deprecated alias; warning() is the supported name.
        self.logger.warning('Could not download search result for date %s',
                            dateobj)
        return dls

    d = utils.parse_webpage(response.webpage, self.parser)
    if not d:
        self.logger.warning('Could not parse search result for date %s',
                            dateobj)
        return dls

    for metainfo in self.parse_results(d, dateobj):
        if 'download' not in metainfo:
            self.logger.warning('No link. Ignoring metainfo: %s', metainfo)
            continue

        # The search URL is passed along with the metainfo and relpath.
        relurl = self.download_gazette(metainfo, searchurl, relpath)
        if relurl:
            dls.append(relurl)

    return dls
def get_date_url(self, dateobj):
    """Build the ``dojqry.asp`` search URL for a single date.

    The same date is used for the "from" and "to" halves of the query.
    Values are not URL-encoded; they are inserted into the query string
    exactly as returned by the ``utils`` helpers.

    Args:
        dateobj: date to query; its ``day``, ``month`` and ``year``
            attributes are read.

    Returns:
        ``self.courturl + 'dojqry.asp?'`` followed by the ``&``-joined
        ``name=value`` pairs.
    """
    datestr = utils.dateobj_to_str(dateobj, '-', reverse=True)
    day = utils.pad_zero(dateobj.day)
    month = utils.pad_zero(dateobj.month)
    year = utils.pad_zero(dateobj.year)

    qs = [
        ('datef', datestr),
        ('datet', datestr),
        ('selfday', day),
        ('selfmonth', month),
        ('selfyear', year),
        ('seltday', day),
        ('seltmonth', month),
        ('seltyear', year),
        ('B1', 'Search'),
    ]

    # str.join replaces string.join, which only exists in Python 2.
    query = '&'.join(['%s=%s' % (key, val) for key, val in qs])
    return self.courturl + 'dojqry.asp' + '?' + query
def get_post_data(self, tags, dateobj):
    """Build the POST payload for a date search from the form's tags.

    Args:
        tags: iterable of form elements exposing ``.name`` and ``.get()``
            (presumably BeautifulSoup tags -- confirm against the caller).
        dateobj: date written into both the from-date and to-date fields.

    Returns:
        List of ``(name, value)`` tuples in tag order.  Image inputs, the
        reset button and unnamed tags are left out; a missing value
        becomes ``u''``.
    """
    datestr = utils.dateobj_to_str(dateobj, '/')
    date_fields = ('ctl00$CPH$txtToDate', 'ctl00$CPH$txtfromDate')
    postdata = []

    for tag in tags:
        name = None
        value = None

        if tag.name == 'input':
            name = tag.get('name')
            value = tag.get('value')
            input_type = tag.get('type')

            if input_type == 'image' or name == 'ctl00$CPH$btnReset':
                continue

            if name in date_fields:
                value = datestr
            elif name == 'ctl00$CPH$btnSearch':
                value = 'Search'
        elif tag.name == 'select':
            name = tag.get('name')
            # The two placeholder strings differ in trailing-dash count;
            # they are reproduced exactly as the original sends them.
            if name == 'ctl00$CPH$ddldivision':
                value = '-----Select----'
            elif name == 'ctl00$CPH$ddlSection':
                value = '-----Select-----'

        if name:
            # Identity test instead of "== None" (PEP 8).
            if value is None:
                value = u''
            postdata.append((name, value))

    return postdata
def download_oneday(self, relpath, dateobj):
    """Fetch the ``browse`` listing for one date via ``self.result_page``.

    The query asks for entries by ``datecreated`` in ascending order with
    ``rpp`` set to 20 and ``value`` set to the formatted date.  An empty
    dict is passed as the last argument to ``result_page``.

    Returns whatever ``self.result_page`` returns.
    """
    params = (
        ('type', 'datecreated'),
        ('order', 'ASC'),
        ('rpp', '20'),
        ('value', utils.dateobj_to_str(dateobj, '-', reverse=True)),
    )
    query = '&'.join(['%s=%s' % (key, val) for key, val in params])
    browse_url = self.courturl + 'browse?' + query
    return self.result_page(relpath, browse_url, dateobj, {})
def download_oneday(self, relpath, dateobj):
    """Request the date-ordered ``browse`` page and process it.

    Builds ``self.courturl + 'browse?...'`` with ``type``, ``order``,
    ``rpp`` and ``value`` parameters (``value`` is the formatted date) and
    delegates to ``self.result_page`` with an empty dict as the final
    argument.

    Returns the result of ``self.result_page``.
    """
    date_value = utils.dateobj_to_str(dateobj, "-", reverse=True)
    pairs = [
        "type=%s" % "datecreated",
        "order=%s" % "ASC",
        "rpp=%s" % "20",
        "value=%s" % date_value,
    ]
    url = self.courturl + "browse?" + "&".join(pairs)
    return self.result_page(relpath, url, dateobj, {})
def get_post_data(self, dateobj):
    """Return the fixed ``searchGazette`` form payload for one date.

    ``property(fromdate)`` and ``property(todate)`` both carry the
    requested date; ``property(hdnCurDate)`` carries today's date.

    Returns a list of ``(name, value)`` tuples.
    """
    today = utils.dateobj_to_str(datetime.date.today(), '/')
    search_date = utils.dateobj_to_str(dateobj, '')

    return [
        ('displaytable_length', '-1'),
        ('mode', 'unspecified'),
        ('property(abstract)', ''),
        ('property(docid)', ''),
        ('property(fromdate)', search_date),
        ('property(gazetteno)', ''),
        ('property(hdnCurDate)', today),
        ('property(jobno)', ''),
        ('property(month1)', '0'),
        ('property(search)', 'searchGazette'),
        ('property(searchmode)', 'date'),
        ('property(todate)', search_date),
        ('property(year)', '0'),
        ('property(year1)', '0'),
    ]
def get_post_data(self, dateobj):
    """Return the search-form payload with both date fields set.

    ``txtFrom`` and ``txtTo`` receive the formatted date; every other
    field is a fixed value.

    Returns a list of ``(name, value)`` tuples.
    """
    search_date = utils.dateobj_to_str(dateobj, '/')

    return [
        ('cmb_Cat', '-1'),
        ('cmb_Name', '-1'),
        ('cmb_Not_For', '-1'),
        ('ComboDept', '-1'),
        ('eAttachId', ''),
        ('freetextradio', 'No'),
        ('NewSearchFlag', 'false'),
        ('PriorityName', '--Select--'),
        ('refDocId', ''),
        ('reportEndIndex', '10'),
        ('reportStartIndex', '1'),
        ('txtEmail', ''),
        ('txtFreeText', ''),
        ('txtFrom', search_date),
        ('txtGazetteNo', ''),
        ('txtNotNo', ''),
        ('txtNotTitle', ''),
        ('txtTo', search_date),
    ]
def get_query_tuples(self, dateobj, zone, city):
    """Return the query parameters for a search-by-date report request.

    Args:
        dateobj: date placed in the ``Order Date`` parameter.
        zone: interpolated into the ``__report`` design file name.
        city: value of the ``City`` parameter.

    Returns a list of ``(name, value)`` tuples.
    """
    order_date = utils.dateobj_to_str(dateobj, '/')
    report_name = 'pronouncementOrderReport1_%s.rptdesign' % zone

    return [
        # 'showReoprt' is sent as-is, exactly as the original code spells it.
        ('subAction', 'showReoprt'),
        ('__report', report_name),
        ('City', city),
        ('searchWhat', 'searchByDate'),
        ('Serial No', ''),
        ('Appeal No', ''),
        ('Assessee Name', ''),
        ('AssType', 'null'),
        ('Order Date', order_date),
        ('Member Name', ''),
        ('Pronouncement Date', ''),
    ]
def download_oneday(self, relpath, dateobj):
    """Download the orders of every case listed for one date.

    Posts the date range (same date at both ends) to ``self.pageurl``,
    then for each result row extracts the first run of digits from its
    ``onclick`` attribute, posts that as ``ccin`` to ``self.caseurl`` and
    hands the case page to ``self.download_orders``.

    Args:
        relpath: relative path; its last ``/``-separated component is
            dropped before being passed to ``download_orders``.
        dateobj: date to query.

    Returns:
        List accumulated from the ``download_orders`` results; empty when
        the result page could not be fetched or parsed.
    """
    newdls = []
    datestr = utils.dateobj_to_str(dateobj, '/')
    subrelpath = '/'.join(relpath.split('/')[:-1])

    postdata = [('hcjudgecode', ''), ('fromdate', datestr),
                ('todate', datestr), ('counter', '1')]
    webpage = self.download_url(self.pageurl, referer=self.baseurl,
                                loadcookies=self.cookiefile.name,
                                postdata=postdata)
    if not webpage:
        self.logger.warning(u'No webpage for %s' % self.pageurl)
        return newdls

    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
        return newdls

    for tr in d.findAll('tr'):
        # Rows containing a <th> are skipped.
        if tr.find('th'):
            continue

        onclick = tr.get('onclick')
        if not onclick:
            self.logger.info(u'No onclick in %s' % tr)
            continue

        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        reobj = re.search(r'\d+', onclick)
        if not reobj:
            continue
        ccin = reobj.group(0)

        webpage = self.download_url(self.caseurl, referer=self.baseurl,
                                    loadcookies=self.cookiefile.name,
                                    postdata=[('ccin', ccin)])
        if not webpage:
            self.logger.error(u'Could not get case for %s on date %s' % (ccin, dateobj))
            continue

        newdls.extend(self.download_orders(subrelpath, ccin, dateobj, webpage))

    return newdls
def get_post_data(self, tags, dateobj):
    """Build the POST payload for a date search from the form's tags.

    Args:
        tags: iterable of form elements exposing ``.name`` and ``.get()``
            (presumably BeautifulSoup tags -- confirm against the caller).
        dateobj: date written into ``TextBox1`` and ``TextBox2``.

    Returns:
        List of ``(name, value)`` tuples in tag order.  Image inputs, the
        gazette-number box, the cancel button, the year drop-down and all
        but the first ``TYPE`` input are left out; a missing value becomes
        ``u''``.
    """
    prefix = 'ctl00$ContentPlaceHolder1$'
    datestr = utils.dateobj_to_str(dateobj, '/')
    postdata = []
    type_sent = False

    for tag in tags:
        name = None
        value = None

        if tag.name == 'input':
            name = tag.get('name')
            value = tag.get('value')
            input_type = tag.get('type')

            if input_type == 'image' or name in (prefix + 'TxtGazetteNo',
                                                 prefix + 'BtnCancel'):
                continue

            if name == prefix + 'TYPE':
                # Only the first TYPE input is submitted, always with the
                # value 'RadioButton1'; later ones are dropped.
                if type_sent:
                    continue
                value = 'RadioButton1'
                type_sent = True
            elif name == prefix + 'BtnSearch':
                value = 'Search'
            elif name == prefix + 'CheckBoxYearAll':
                value = 'on'
            elif name in (prefix + 'TextBox2', prefix + 'TextBox1'):
                value = datestr
        elif tag.name == 'select':
            name = tag.get('name')
            if name == prefix + 'ddlYear':
                continue
            if name == prefix + 'ddlFilter':
                value = '1'

        if name:
            # Identity test instead of "== None" (PEP 8).
            if value is None:
                value = u''
            postdata.append((name, value))

    return postdata
def get_post_data(self, dateobj):
    """Return the fixed ``search`` form payload for one date.

    ``property(fromdate)`` and ``property(todate)`` both carry the
    formatted date; all other fields are constants.

    Returns a list of ``(name, value)`` tuples.
    """
    search_date = utils.dateobj_to_str(dateobj, '')

    return [
        ('mode', 'unspecified'),
        ('property(abstract)', ''),
        ('property(department)', '0'),
        ('property(docid)', ''),
        ('property(fromdate)', search_date),
        ('property(gazetteno)', ''),
        ('property(gazettePart)', '0'),
        ('property(gazetteType)', '0'),
        ('property(month1)', '0'),
        ('property(search)', 'search'),
        ('property(todate)', search_date),
        ('property(year1)', '0'),
    ]
def date_postdata(self, dateobj):
    """Return the POST payload for a search restricted to one date.

    The payload is, in order: the two empty event fields, whatever
    ``self.state_data()`` yields, the fixed search-form fields, and the
    from/to date fields (both set to the formatted date).

    Returns a list of ``(name, value)`` tuples.
    """
    field = 'ctl00$ContPlaceHolderMain$'
    day = utils.dateobj_to_str(dateobj, '-')

    payload = [('__EVENTTARGET', ''), ('__EVENTARGUMENT', '')]
    payload.extend(self.state_data())
    payload += [
        (field + 'TextBox1', ''),
        (field + 'search1', 'search'),
        (field + 'btn', 'allwordbtn'),
        (field + 'btn1', 'textbtn'),
        (field + 'ddlmember', '--- Select Member Name ---'),
        (field + 'ddldebtype', '--- Select Debate Type ---'),
        (field + 'ddlsession', '--- Select Session ---'),
        (field + 'ddldatefrom', day),
        (field + 'ddldateto', day),
    ]
    return payload
def get_post_data(self, tags, dateobj):
    """Build the POST payload for a date search from the form's tags.

    Args:
        tags: iterable of form elements exposing ``.name`` and ``.get()``
            (presumably BeautifulSoup tags -- confirm against the caller).
        dateobj: date written into both date-picker fields.

    Returns:
        List of ``(name, value)`` tuples in tag order.  Image inputs,
        ``BtnElectronicGazette`` and every ``RBLanguage`` input after the
        first are left out; a missing value becomes ``u''``.
    """
    datestr = utils.dateobj_to_str(dateobj, '/', reverse=False)
    date_fields = ('GMDatePicker1$ctl00', 'GMGazzetteDate$ctl00')
    select_values = {
        'BtnSearch': 'Search',
        'DDListCategory': '',
        'DDListDepartment': '',
    }
    postdata = []
    radio_set = False

    for tag in tags:
        name = None
        value = None

        if tag.name == 'input':
            name = tag.get('name')
            value = tag.get('value')
            input_type = tag.get('type')

            if input_type == 'image' or name == 'BtnElectronicGazette':
                continue
            # The language radio group is submitted once only.
            if name == 'RBLanguage' and radio_set:
                continue

            if name in date_fields:
                value = datestr
            elif name == 'RBLanguage':
                value = 'Both'
                radio_set = True
        elif tag.name == 'select':
            name = tag.get('name')
            if name in select_values:
                value = select_values[name]

        if name:
            # Identity test instead of "== None" (PEP 8).
            if value is None:
                value = u''
            postdata.append((name, value))

    return postdata
def get_post_data(self, tags, dateobj):
    """Build the POST payload for a date search from the form's tags.

    Args:
        tags: iterable of form elements exposing ``.name`` and ``.get()``
            (presumably BeautifulSoup tags -- confirm against the caller).
        dateobj: date written into the start-date and end-date fields.

    Returns:
        List of ``(name, value)`` tuples in tag order.  Image inputs, the
        ``archiveNotification`` input and every ``RadioButtonList1`` input
        after the first are left out; a missing value becomes ``u''``.
    """
    prefix = 'ctl00$ContentPlaceHolder1$'
    radio_name = prefix + 'RadioButtonList1'
    date_fields = (prefix + 'txtstartdate', prefix + 'txtenddate')
    select_values = {
        'BtnSearch': 'search',
        prefix + 'ddlGazetteCat': '-1',
        prefix + 'ddldepartment': '-1',
    }

    datestr = utils.dateobj_to_str(dateobj, '-', reverse=True)
    postdata = []
    radio_set = False

    for tag in tags:
        name = None
        value = None

        if tag.name == 'input':
            name = tag.get('name')
            value = tag.get('value')
            input_type = tag.get('type')

            if input_type == 'image' or name == prefix + 'archiveNotification':
                continue
            # The radio group is submitted once only.
            if name == radio_name and radio_set:
                continue

            if name in date_fields:
                value = datestr
            elif name == prefix + 'Button1':
                value = 'Submit'
            elif name == radio_name:
                value = '-1'
                radio_set = True
        elif tag.name == 'select':
            name = tag.get('name')
            if name in select_values:
                value = select_values[name]

        if name:
            # Identity test instead of "== None" (PEP 8).
            if value is None:
                value = u''
            postdata.append((name, value))

    return postdata
def download_oneday(self, relpath, dateobj):
    """Query the order report for one date, once per side flag.

    After ``self.get_cookies()``, the same form is posted to
    ``ordqryrepact_action.php`` for each of the seven ``m_sideflg`` codes
    and every response is handed to ``self.result_page``.  A single
    ``linkdict`` is shared across all of those calls.

    Returns the concatenation of the ``result_page`` results.
    """
    self.get_cookies()

    posturl = self.baseurl + '/ordqryrepact_action.php'
    day = utils.dateobj_to_str(dateobj, '-')

    # m_sideflg is inserted between these two halves, i.e. at index 2.
    head = [('pageno', 1), ('frmaction', '')]
    tail = [('actcode', 0), ('frmdate', day), ('todate', day),
            ('submit1', 'Submit')]

    downloaded = []
    linkdict = {}
    for sideflag in ('C', 'CR', 'OS', 'NC', 'NR', 'AC', 'AR'):
        form = head + [('m_sideflg', sideflag)] + tail
        page = self.download_url(posturl, postdata=form,
                                 loadcookies=self.cookiefile.name)
        downloaded.extend(self.result_page(page, relpath, dateobj, linkdict))

    return downloaded
def get_post_data(self, tags, dateobj):
    """Build the POST payload for a date search from the form's tags.

    Args:
        tags: iterable of form elements exposing ``.name`` and ``.get()``
            (presumably BeautifulSoup tags -- confirm against the caller).
        dateobj: date written into ``TextBox1`` and ``TextBox2``.

    Returns:
        List of ``(name, value)`` tuples in tag order.  Image inputs, the
        cancel button, the year drop-down and any ``TYPE`` input whose
        value is not ``'RadioButton1'`` are left out; a missing value
        becomes ``u''``.
    """
    prefix = 'ctl00$ContentPlaceHolder1$'
    datestr = utils.dateobj_to_str(dateobj, '/')
    postdata = []

    for tag in tags:
        name = None
        value = None

        if tag.name == 'input':
            name = tag.get('name')
            value = tag.get('value')
            input_type = tag.get('type')

            if input_type == 'image' or name == prefix + 'BtnCancel':
                continue
            # Only the TYPE input whose value is 'RadioButton1' is sent.
            if name == prefix + 'TYPE' and value != 'RadioButton1':
                continue

            if name in (prefix + 'TextBox1', prefix + 'TextBox2'):
                value = datestr
            elif name == prefix + 'CheckBoxYearAll':
                value = 'on'
        elif tag.name == 'select':
            name = tag.get('name')
            if name == prefix + 'ddlYear':
                continue
            if name == prefix + 'ddlFilter':
                value = '1'

        if name:
            # Identity test instead of "== None" (PEP 8).
            if value is None:
                value = u''
            postdata.append((name, value))

    return postdata
def get_post_data(self, tags, dateobj):
    """Build the POST payload for a date search from the form's tags.

    Args:
        tags: iterable of form elements exposing ``.name`` and ``.get()``
            (presumably BeautifulSoup tags -- confirm against the caller).
        dateobj: date written into ``txtfrmdate`` and ``txttodate``.

    Returns:
        List of ``(name, value)`` tuples in tag order.  Image inputs,
        ``Button1`` and ``Button2`` are left out; a missing value becomes
        ``u''``.
    """
    datestr = utils.dateobj_to_str(dateobj, '')
    skipped_inputs = ('Button2', 'Button1')
    blank_inputs = ('jobno', 'txtGoNo', 'txtSearchText')
    select_values = {
        'BtnSearch': 'search',
        'DDLDeptname': 'Select',
        'DDLGoType': 'Select',
        'DropDownList1': 'Select',
    }
    postdata = []

    for tag in tags:
        name = None
        value = None

        if tag.name == 'input':
            name = tag.get('name')
            value = tag.get('value')
            input_type = tag.get('type')

            if input_type == 'image' or name in skipped_inputs:
                continue

            if name == 'txttodate' or name == 'txtfrmdate':
                value = datestr
            elif name in blank_inputs:
                # These text boxes are always submitted empty.
                value = ''
        elif tag.name == 'select':
            name = tag.get('name')
            if name in select_values:
                value = select_values[name]

        if name:
            # Identity test instead of "== None" (PEP 8).
            if value is None:
                value = u''
            postdata.append((name, value))

    return postdata
def get_post_data(self, tags, dateobj):
    """Build the POST payload for a date search from the form's tags.

    Args:
        tags: iterable of form elements exposing ``.name`` and ``.get()``
            (presumably BeautifulSoup tags -- confirm against the caller).
        dateobj: date written into ``DaintyDate1`` and ``DaintyDate2``.

    Returns:
        List of ``(name, value)`` tuples in tag order.  Image inputs and
        the exit button are left out; ``ddlType`` is set from
        ``self.gztype``; a missing value becomes ``u''``.
    """
    prefix = 'ctl00$ContentPlaceHolder2$'
    datestr = utils.dateobj_to_str(dateobj, '/')
    postdata = []

    for tag in tags:
        name = None
        value = None

        if tag.name == 'input':
            name = tag.get('name')
            value = tag.get('value')
            input_type = tag.get('type')

            if input_type == 'image' or name == prefix + 'btnExit':
                continue

            if name == prefix + 'btnShow':
                # The button label is sent UTF-8 encoded.  Guard against a
                # button without a value attribute, which previously raised
                # AttributeError on None.
                if value is not None:
                    value = value.encode('utf8')
            elif name in (prefix + 'DaintyDate2', prefix + 'DaintyDate1'):
                value = datestr
        elif tag.name == 'select':
            name = tag.get('name')
            if name == prefix + 'ddlType':
                value = self.gztype
            elif name == prefix + 'ddldepart':
                value = '0'

        if name:
            # Identity test instead of "== None" (PEP 8).
            if value is None:
                value = u''
            postdata.append((name, value))

    return postdata
def get_post_data(self, dateobj, city, pagenum):
    """Return the SOAP ``GetPage`` request body for one result page.

    The envelope carries one ``<Oprand>`` per report parameter, each
    followed by an ``__isdisplay__``-prefixed twin with the same value,
    and ends with the ``__page`` and ``__svg`` operands.

    Args:
        dateobj: date placed in the ``Order Date`` operands.
        city: value of the ``City`` operands (inserted without escaping).
        pagenum: page number, formatted with ``%d``.

    Returns the request body as a single string.
    """
    datestr = utils.dateobj_to_str(dateobj, '/')

    def operand(name, value):
        return ('<Oprand><Name>' + name + '</Name><Value>' + value
                + '</Value></Oprand>')

    # (parameter name, literal value or %-placeholder filled in below)
    mirrored = (
        ('City', '%s'),
        ('Serial No', ''),
        ('Appeal No', ''),
        ('Assessee Name', ''),
        ('searchWhat', 'searchByDate'),
        ('Order Date', '%s'),
        ('Member Name', ''),
        ('Pronouncement Date', ''),
    )

    pieces = [
        '<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">',
        '<soap:Body>',
        '<GetUpdatedObjects xmlns="http://schemas.eclipse.org/birt">',
        '<Operation>',
        '<Target><Id>Document</Id><Type>Document</Type></Target>',
        '<Operator>GetPage</Operator>',
    ]
    for name, value in mirrored:
        pieces.append(operand(name, value))
        pieces.append(operand('__isdisplay__' + name, value))
    pieces.append(operand('__page', '%d'))
    pieces.append(operand('__svg', 'true'))
    pieces.append('</Operation></GetUpdatedObjects></soap:Body></soap:Envelope>')

    template = ''.join(pieces)
    return template % (city, city, datestr, datestr, pagenum)
def download_oneday(self, relpath, dateobj):
    """Download the contents PDF for one date and every part it links to.

    The directory and file-name formats depend on where ``dateobj`` falls
    relative to ``self.flip_date1`` and ``self.flip_date2``.  The contents
    PDF is stored under ``<relpath>/main``; each link extracted from it is
    saved through ``self.save_gazette`` and recorded in the main metainfo
    under ``'links'`` and ``'linknames'``.

    Args:
        relpath: relative storage path for this date.
        dateobj: date to download.

    Returns:
        List of relative URLs that were saved or updated; empty when the
        contents file could not be downloaded or is not a PDF.
    """
    dls = []

    if dateobj >= self.flip_date1:
        if dateobj >= self.flip_date2:
            datestr = '%d-%d-%d' % (dateobj.day, dateobj.month, dateobj.year)
        else:
            datestr = '%s-%s-%d' % (utils.pad_zero(dateobj.day),
                                    utils.pad_zero(dateobj.month),
                                    dateobj.year)
        mainhref = 'Contents-(%s).pdf' % datestr
    else:
        datestr = utils.dateobj_to_str(dateobj, '', reverse=True)
        mainhref = 'Contents(%s-%s-%s).pdf' % (
            utils.pad_zero(dateobj.day),
            utils.pad_zero(dateobj.month),
            utils.pad_zero(dateobj.year % 100))

    dateurl = self.baseurl % datestr
    docurl = urllib.basejoin(dateurl, mainhref)

    mainmeta = utils.MetaInfo()
    mainmeta.set_date(dateobj)
    mainmeta.set_url(self.url_fix(docurl))

    response = self.download_url(docurl)
    if not response or not response.webpage or response.error:
        return dls

    mainrelurl = os.path.join(relpath, 'main')
    updated = False
    if self.storage_manager.save_rawdoc(self.name, mainrelurl,
                                        response.srvresponse,
                                        response.webpage):
        self.logger.info(u'Saved rawfile %s' % mainrelurl)
        updated = True

    page_type = self.get_file_extension(response.webpage)
    if page_type != 'pdf':
        # Logger.warn is a deprecated alias; warning() is the supported name.
        self.logger.warning(
            'Got a non-pdf page and we can\'t handle it for datte %s',
            dateobj)
        return dls

    links = []
    linknames = []
    hrefs = utils.extract_links_from_pdf(StringIO(response.webpage))
    for href in hrefs:
        reobj = re.search(r'(?P<num>Part-\w+)', href)
        if reobj:
            partnum = reobj.groupdict()['num']
        else:
            partnum = '%s' % href
            # Strip a trailing ".pdf".  The match object must be tested
            # (the original tested partnum and crashed on links without
            # the suffix), and the dot is escaped so only a literal
            # ".pdf" extension is removed.
            reobj = re.search(r'\.pdf$', partnum)
            if reobj:
                partnum = partnum[:reobj.start()]

        relurl = os.path.join(relpath, partnum)
        docurl = urllib.basejoin(dateurl, href)

        metainfo = utils.MetaInfo()
        metainfo.set_date(dateobj)
        metainfo['partnum'] = partnum

        # Every link is recorded in the main metainfo, whether or not
        # saving it reports a change.
        links.append(relurl)
        linknames.append(partnum)

        if self.save_gazette(relurl, docurl, metainfo):
            dls.append(relurl)

    mainmeta['links'] = links
    mainmeta['linknames'] = linknames
    if self.storage_manager.save_metainfo(self.name, mainrelurl, mainmeta):
        updated = True
        self.logger.info(u'Saved metainfo %s' % mainrelurl)

    if updated:
        dls.append(mainrelurl)
    return dls
def download_oneday(self, relpath, dateobj):
    """Download every order listed on the date-wise order page.

    Fetches ``orderdatewisedata.jsp`` for the date, rewrites the
    ``window.open(...)`` calls through ``self.sanitize_windowopen`` before
    parsing, and for each table row extracts the ``showoj.jsp`` URL from
    the link's ``onclick`` attribute.  Files not yet present under
    ``self.rawdir`` are downloaded; metadata is written when the raw file
    exists and either ``self.updateMeta`` is set or no metadata file
    exists yet.

    Args:
        relpath: relative directory for this date's files.
        dateobj: date to query.

    Returns:
        List of relative paths of files downloaded by this call.
    """
    newdls = []
    pageurl = urllib.basejoin(self.baseurl, '/gujarathc/')
    datestr = utils.dateobj_to_str(dateobj, '-')
    dateurl = pageurl + 'orderdatewisedata.jsp?fdate=%s&tdate=%s' % \
                        (datestr, datestr)

    webpage = self.download_url(dateurl, referer=self.baseurl,
                                loadcookies=self.cookiefile.name)
    if not webpage:
        self.logger.warning(u'No webpage for %s' % dateurl)
        return newdls

    # Raw strings keep the patterns unchanged while avoiding invalid
    # escape sequences in plain literals.
    webpage = re.sub(r'(?P<windowopen>window.open\([^)]+\))',
                     self.sanitize_windowopen, webpage)

    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
        return newdls

    for tr in d.findAll('tr'):
        link = tr.find('a')
        if not link:
            self.logger.info(u'No link in %s' % tr)
            continue

        href = link.get('onclick')
        if not href:
            self.logger.info(u'No href in %s' % tr)
            continue

        reobj = re.search(r"showoj.jsp?[^'\s]+", href)
        if not reobj:
            # Previously an onclick without a showoj.jsp URL raised
            # AttributeError on None and aborted the whole day.
            self.logger.info(u'No order url in %s' % href)
            continue
        pagerelurl = reobj.group(0)

        url = urllib.basejoin(pageurl, pagerelurl)
        filename = utils.url_to_filename(url, False,
                                         ['caseyr', 'caseno', 'casetype'])
        if not filename:
            self.logger.error(u'Could not get filename for %s' % url)
            continue

        relurl = os.path.join(relpath, filename)
        filepath = os.path.join(self.rawdir, relurl)
        metapath = os.path.join(self.metadir, relurl)

        if not os.path.exists(filepath):
            self.logger.info(u'Downloading %s %s' % (url, filename))
            j = self.download_url(url, loadcookies=self.cookiefile.name)
            if not j:
                self.logger.warning(u'No webpage: %s' % url)
            else:
                self.logger.info(u'Saving %s' % filepath)
                utils.save_file(filepath, j)
                newdls.append(relurl)

        # Re-check existence: the download above may have failed.
        if os.path.exists(filepath) and \
                (self.updateMeta or not os.path.exists(metapath)):
            metainfo = self.get_meta_info(link, tr, dateobj)
            if metainfo:
                utils.print_tag_file(metapath, metainfo)

    return newdls
def date_in_form(self, dateobj):
    """Return the form field that carries the date.

    Returns a one-element list holding the ``('juddt', <formatted date>)``
    tuple, the date being formatted by ``utils.dateobj_to_str`` with ``/``
    as the separator.
    """
    juddt = utils.dateobj_to_str(dateobj, '/')
    return [('juddt', juddt)]