# Helper methods collected from the court/gazette scraper classes (hence the
# `self` parameter); they all rely on the project's `utils` module. The
# imports below cover what these snippets reference. Note this is
# Python 2-era code: urllib.basejoin was removed in Python 3.
import os
import re
import datetime
import urllib

import utils


def get_judgment_info(self, tr):
    judgedict = {}

    link = tr.find('a')
    if link:
        href = link.get('href')
        if href:
            judgedict['href'] = href

    tds = tr.findAll('td')
    i = 0
    for td in tds:
        tdContent = utils.get_tag_contents(td)
        if tdContent:
            if i == 0:
                judgedict['casetype'] = tdContent
            elif i == 1:
                judgedict['caseno'] = tdContent
            elif i == 2:
                judgedict['caseyear'] = tdContent
        i += 1

    if 'caseno' in judgedict and 'caseyear' in judgedict:
        title = u'%sof%s' % (judgedict['caseno'], judgedict['caseyear'])
        judgedict['title'] = title

    return judgedict

def process_result_row(self, tr, metainfos, dateobj, order):
    download = None
    for link in tr.find_all('a'):
        txt = utils.get_tag_contents(link)
        if txt and re.match(r'\s*Download', txt, re.IGNORECASE):
            download = link.get('href')
            break

    if not download:
        return

    metainfo = utils.MetaInfo()
    metainfos.append(metainfo)
    metainfo.set_date(dateobj)
    metainfo['download'] = download

    # enumerate keeps the column index aligned with `order` even when an
    # empty cell is skipped with continue
    for i, td in enumerate(tr.find_all('td')):
        if len(order) > i:
            col = order[i]
            txt = utils.get_tag_contents(td)
            if txt:
                txt = txt.strip()
            else:
                continue

            if col == 'gznum':
                metainfo['gznum'] = txt.splitlines()[0]
            elif col in ['subject', 'department', 'notification_num', \
                         'gztype']:
                metainfo[col] = txt

def get_meta_info(self, link, tr, dateobj):
    metainfo = {'date': utils.date_to_xml(dateobj)}
    metainfo['caseno'] = utils.get_tag_contents(link)

    tds = tr.findAll('td')
    for td in tds:
        contents = utils.get_tag_contents(td)
        reobj = re.search('JUSTICE ', contents)
        if reobj:
            metainfo['author'] = contents[reobj.end():]
    return metainfo

def process_order_tr(self, ccin, relpath, dateobj, tr, fieldOrder):
    tds = tr.findAll('td')

    viewIndex = fieldOrder['view']
    dateIndex = fieldOrder['date']
    if viewIndex >= len(tds) or dateIndex >= len(tds):
        self.logger.warning(u'Could not get date or view in tr: %s' % tr)
        return None

    viewTd = tds[viewIndex]
    dateTd = tds[dateIndex]

    datestr = utils.get_tag_contents(dateTd)
    if not datestr:
        self.logger.warning(u'Date: %s Could not get date in %s' % \
                            (dateobj, tr))
        return None

    subdateobj = utils.datestr_to_obj(datestr)
    if not subdateobj:
        self.logger.warning(u'Date: %s Could not get date in %s tr: %s' % \
                            (dateobj, datestr, tr))
        return None

    subdateobj = subdateobj.date()
    metainfo = {'date': utils.date_to_xml(subdateobj), 'ccin': ccin}

    # store bench in metainfo
    if 'bench' in fieldOrder and fieldOrder['bench'] < len(tds):
        benchIndex = fieldOrder['bench']
        benchTd = tds[benchIndex]
        contents = utils.get_tag_contents(benchTd)
        if contents:
            names = []
            for reobj in re.finditer('JUSTICE ', contents):
                names.append(contents[reobj.end():])
            if names:
                metainfo['bench'] = {}
                metainfo['bench']['name'] = names

    # store isJudgment in metainfo
    if 'judgment' in fieldOrder and fieldOrder['judgment'] < len(tds):
        jTd = tds[fieldOrder['judgment']]
        contents = utils.get_tag_contents(jTd)
        if contents:
            metainfo['judgment'] = contents

    onclick = viewTd.get('onclick')
    if onclick:
        relurl = self.download_order(relpath, subdateobj, \
                                     metainfo, onclick)
        return relurl
    else:
        self.logger.warning(u'No onclick attribute in viewTd: %s' % viewTd)
        return None

def result_page(self, webpage, relpath, dateobj):
    newdls = []
    if not webpage:
        return newdls

    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error( \
            u'Could not parse html of the result page for date %s' % dateobj)
        return newdls

    trs = d.findAll('tr')
    for tr in trs:
        link = tr.find('a')
        if not link:
            continue

        href = link.get('href')
        title = utils.get_tag_contents(link)
        if (not href) or (not title):
            self.logger.info(u'Could not process %s' % link)
            continue

        if not re.match(r'\d+$', title) and not re.search('PREV|NEXT', title):
            self.logger.info(u'link: %s title: %s' % (href, title))
            rel = self.handle_judgment_link(relpath, tr, dateobj, href, title)
            if rel:
                newdls.append(rel)

    if newdls:
        links = d.findAll('a')
        for link in links:
            href = link.get('href')
            title = utils.get_tag_contents(link)
            if title and href and re.match('NEXT', title):
                self.logger.info(u'Following next page link: %s' % link)
                webpage = self.download_url( \
                    urllib.basejoin(self.baseurl, href), \
                    loadcookies=self.cookiefile.name)
                newdls.extend(self.result_page(webpage, relpath, dateobj))

    return newdls

def get_meta_info(self, tr, dateobj): metainfo = { 'date': utils.date_to_xml(dateobj)} tds = tr.findAll('td') if len(tds) >= 3: metainfo['caseno'] = utils.get_tag_contents(tds[2]) if len(tds) >= 4: metainfo['petitioner'] = utils.get_tag_contents(tds[3]) if len(tds) >= 5: metainfo['respondent'] = utils.get_tag_contents(tds[4]) return metainfo
def datepage_metainfos(self, url, dateobj):
    minfos = []
    response = self.download_url(url)
    if not response or not response.webpage:
        self.logger.warn('Unable to download %s. Skipping.', url)
        return minfos

    d = utils.parse_webpage(response.webpage, self.parser)
    if not d:
        self.logger.warn('Unable to parse %s. Skipping.', url)
        return minfos

    partnum = None
    dept = None
    for td in d.find_all('td'):
        bgcolor = td.get('bgcolor')
        links = td.find_all('a')
        if bgcolor == '#91BAE8' and len(links) == 0:
            # a highlighted, link-free cell starts a new gazette part
            partnum = utils.get_tag_contents(td)
            partnum = utils.remove_spaces(partnum)
            dept = None
        elif len(links) > 0:
            reobj = re.compile('^(strong|a)$')
            for x in td.find_all(reobj):
                if x.name == 'strong':
                    dept = utils.get_tag_contents(x)
                    dept = utils.remove_spaces(dept)
                elif x.name == 'a' and partnum:
                    href = x.get('href')
                    if not href or not href.startswith('pdf'):
                        continue

                    title = utils.get_tag_contents(x)
                    title = utils.remove_spaces(title)

                    metainfo = utils.MetaInfo()
                    minfos.append(metainfo)
                    metainfo.set_title(title)
                    metainfo.set_date(dateobj)
                    metainfo['partnum'] = partnum
                    if dept:
                        metainfo['department'] = dept

                    gzurl = urllib.basejoin(url, href)
                    metainfo['url'] = gzurl
    return minfos

def parse_result_page(self, posturl, webpage, dateobj):
    judgments = []

    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'Could not parse result page %s' % dateobj)
        return judgments

    # get judgments
    trs = d.findAll('tr')
    for tr in trs:
        judgment = {}
        metainfo = {'date': utils.date_to_xml(dateobj)}

        links = tr.findAll('a')
        for link in links:
            href = link.get('href')
            if href and re.search('WebShowJudgment.do', href):
                t = utils.get_tag_contents(link)
                # str.find returns -1 when ':' is absent, which is truthy;
                # test the index explicitly
                colon = t.find(':')
                if colon >= 0:
                    title = t[colon + 1:]
                    title = title.strip()
                    metainfo['title'] = title
                    reobj = re.search(r' vs\. ', title, re.IGNORECASE)
                    if reobj:
                        metainfo['petitioner'] = title[:reobj.start()]
                        metainfo['respondent'] = title[reobj.end():]
            if href and re.search('WebDownloadJudgmentDocument.do', href):
                judgment['link'] = urllib.basejoin(posturl, href)

        if judgment:
            judgment['metainfo'] = metainfo
            judgments.append(judgment)

    # next link
    links = d.findAll('a')
    for link in links:
        t = utils.get_tag_contents(link)
        if re.search('Next', t):
            href = link.get('href')
            if href:
                judgment = {'link': urllib.basejoin(posturl, href)}
                judgment['next'] = True
                judgments.append(judgment)

    return judgments

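# Illustrative sketch (not part of the scraper): how the ' vs. ' split in
# parse_result_page carves a case title into petitioner and respondent.
# The sample title is made up for the example.
import re

def split_title(title):
    # same case-insensitive ' vs. ' separator as used above
    reobj = re.search(r' vs\. ', title, re.IGNORECASE)
    if not reobj:
        return None, None
    return title[:reobj.start()], title[reobj.end():]

pet, res = split_title('State Of Maharashtra vs. John Doe')
print(pet)   # State Of Maharashtra
print(res)   # John Doe
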
def get_field_order(self, tr):
    order = []
    valid = False
    for th in tr.find_all('th'):
        txt = utils.get_tag_contents(th)
        if txt and re.search(r'gazette\s+type', txt, re.IGNORECASE):
            order.append('gztype')
        elif txt and re.search('department', txt, re.IGNORECASE):
            order.append('department')
        elif txt and re.search('abstract', txt, re.IGNORECASE):
            order.append('subject')
        elif txt and re.search(r'Issue\s+No', txt, re.IGNORECASE):
            order.append('gznum')
        elif txt and re.search(r'Notification\s+No', txt, re.IGNORECASE):
            order.append('notification_num')
        elif txt and re.search('Download', txt, re.IGNORECASE):
            order.append('download')
            valid = True
        else:
            # unrecognized header: keep a placeholder so column positions
            # stay aligned
            order.append('')

    if valid:
        return order
    return None

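# Illustrative sketch (not part of the scraper): the header row of a results
# table is mapped to a field-order list, which then tells the row parser what
# each cell means. The headers, simplified matching, and values below are
# made up for the example.
headers = ['Gazette Type', 'Department', 'Abstract', 'Issue No', 'Download']
order = []
for h in headers:
    if 'type' in h.lower():
        order.append('gztype')
    elif 'department' in h.lower():
        order.append('department')
    elif 'abstract' in h.lower():
        order.append('subject')
    elif 'issue' in h.lower():
        order.append('gznum')
    elif 'download' in h.lower():
        order.append('download')
    else:
        order.append('')

cells = ['Extraordinary', 'Home', 'Land acquisition', '42', '[PDF]']
row = dict(zip(order, cells))
print(row['gznum'])   # 42
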
def parse_row(self, tr, order, dateobj):
    metainfo = utils.MetaInfo()
    metainfo.set_date(dateobj)

    for i, td in enumerate(tr.find_all('td')):
        txt = utils.get_tag_contents(td)
        if i < len(order) and txt:
            txt = txt.strip()
            col = order[i]
            if col == 'gztype':
                # cell looks like "type / partnum / district"
                words = txt.split('/')
                metainfo['gztype'] = words[0].strip()
                if len(words) > 1:
                    metainfo['partnum'] = words[1].strip()
                if len(words) > 2:
                    metainfo['district'] = words[2].strip()
            elif col == 'download':
                inp = td.find('input')
                if inp and inp.get('onclick'):
                    metainfo['download'] = inp.get('onclick')
            elif col in ['notification_num', 'gznum', 'department']:
                metainfo[col] = txt
            elif col == 'subject':
                metainfo.set_subject(txt)

    return metainfo

def get_judgment_info(self, tr): judgedict = {} if tr.findAll("table"): return {} link = tr.find("a") if link: href = link.get("href") if href: judgedict["href"] = href tds = tr.findAll("td") i = 0 caseno = "" for td in tds: i += 1 txt = utils.get_tag_contents(td) reobj = re.search("JUSTICE ", txt) if reobj: author = txt[reobj.end() :] if author: judgedict["author"] = author elif i == 2: judgedict["casetype"] = txt elif i == 3: caseno += txt elif i == 4: caseno += "/%s" % txt if caseno: judgedict["caseno"] = caseno judgedict["title"] = caseno return judgedict
def get_order_of_fields(self, table):
    fieldOrder = {}
    thead = table.find('thead')
    if not thead:
        return fieldOrder

    ths = thead.findAll('th')
    i = 0
    for th in ths:
        text = utils.get_tag_contents(th)
        if text:
            if re.search('CASEDETAIL', text):
                fieldOrder['caseno'] = i
            elif re.search('JUDGE NAME', text):
                fieldOrder['bench'] = i
            elif re.search('DATE', text):
                fieldOrder['date'] = i
            elif re.search('VIEW', text):
                fieldOrder['view'] = i
            elif re.search('JUDGEMENT', text):
                fieldOrder['judgment'] = i
        i += 1
    return fieldOrder

def process_result_row(self, tr, metainfos, dateobj, order):
    metainfo = utils.MetaInfo()
    metainfo.set_date(dateobj)

    # enumerate keeps the column index aligned with `order` even when an
    # empty cell is skipped with continue
    for i, td in enumerate(tr.find_all('td')):
        if len(order) > i:
            col = order[i]
            txt = utils.get_tag_contents(td)
            if txt:
                txt = txt.strip()
            else:
                continue

            if col == 'gztype':
                metainfo.set_gztype(txt)
            elif col == 'download':
                link = td.find('a')
                if link:
                    href = link.get('href')
                    if href:
                        metainfo['download'] = href
            elif col in ['partnum', 'division', 'subject']:
                metainfo[col] = txt

    if 'download' not in metainfo:
        self.logger.warn('No download link, ignoring: %s', tr)
    else:
        metainfos.append(metainfo)

def process_result_row(self, tr, metainfos, dateobj, order):
    tds = tr.find_all('td')
    if len(tds) != len(order):
        return

    metainfo = utils.MetaInfo()
    metainfos.append(metainfo)
    metainfo.set_date(dateobj)

    for i, td in enumerate(tds):
        col = order[i]
        txt = utils.get_tag_contents(td)
        if txt:
            txt = txt.strip()
        else:
            continue

        if col == 'gztype':
            metainfo.set_gztype(txt)
        elif col == 'gznum':
            metainfo['gznum'] = txt

        # pick up a download link from whichever cell carries one
        link = td.find('a')
        if link and link.get('href'):
            metainfo['download'] = link.get('href')

def parse_meta_info(self, tr, dateobj):
    metainfo = {'date': utils.date_to_xml(dateobj)}

    tds = tr.findAll('td')
    i = 0
    for td in tds:
        c = utils.get_tag_contents(td)
        if c:
            if i == 0:
                contents = utils.tag_contents_without_recurse(td)
                names = []
                for content in contents:
                    reobj = re.search('JUSTICE ', content)
                    if reobj:
                        names.append(content[reobj.end():])
                if names:
                    metainfo['bench'] = {}
                    metainfo['bench']['name'] = names
            elif i == 1:
                metainfo['category'] = c
            elif i == 3:
                metainfo['caseno'] = c
        i += 1
    return metainfo

def process_result_page(self, relpath, dateobj, webpage):
    newdls = []

    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.info(u'Could not parse result page for date %s' % dateobj)
        return newdls

    trs = d.findAll('tr')
    for tr in trs:
        pagetype = self.page_type(tr)
        if pagetype == 'nextlink':
            nextlink = self.next_link(tr.findAll('a'))
            if nextlink:
                self.logger.info(u'Going to the next page: %s' \
                                 % utils.get_tag_contents(nextlink))
                rels = self.process_next_link(relpath, dateobj, nextlink)
                newdls.extend(rels)
        elif pagetype == 'judgment':
            rel = self.handle_judgment_link(relpath, dateobj, tr)
            if rel:
                newdls.append(rel)
        else:
            self.logger.info(u'Not processing %s' % tr)
    return newdls

def handle_result_page(self, resultpage, relpath, dateobj):
    dls = []

    d = utils.parse_webpage(resultpage)
    if not d:
        self.logger.error(u'Could not parse result page %s' % dateobj)
        # bail out here; calling findAll on None below would fail
        return dls

    # download judgments
    trs = d.findAll('tr')
    for tr in trs:
        links = tr.findAll('a')
        if len(links) == 1:
            relurl = self.dl_judgment(relpath, tr, links[0], dateobj)
            if relurl:
                dls.append(relurl)
        else:
            self.logger.warning(u'No action for %s' % tr)

    # next page
    links = d.findAll('a')
    for link in links:
        href = link.get('href')
        t = utils.get_tag_contents(link)
        if href and t == 'Next':
            nexturl = urllib.basejoin(self.resulturl, href)
            resultpage = self.download_url(nexturl, \
                                           loadcookies=self.cookiefile.name)
            if resultpage:
                self.logger.info(u'Recursing to %s' % nexturl)
                dls.extend(self.handle_result_page(resultpage, relpath, \
                                                   dateobj))

    return dls

def process_result_row(self, tr, metainfos, dateobj, order):
    metainfo = utils.MetaInfo()
    metainfos.append(metainfo)
    metainfo.set_date(dateobj)

    for i, td in enumerate(tr.find_all('td')):
        if len(order) > i:
            col = order[i]
            txt = utils.get_tag_contents(td)
            if txt:
                txt = txt.strip()

            if col == 'ministry':
                metainfo.set_ministry(txt)
            elif col == 'subject':
                metainfo.set_subject(txt)
            elif col == 'gztype':
                metainfo.set_gztype(txt)
            elif col == 'download':
                inp = td.find('input')
                if inp:
                    name = inp.get('name')
                    if name:
                        metainfo[col] = name
                else:
                    link = td.find('a')
                    if link:
                        metainfo[col] = link
            elif col in ['office', 'department', 'partnum', 'refnum']:
                metainfo[col] = txt

def download_info_page(self, url):
    webpage = self.download_url(url)

    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'Could not parse the date search page')
        return [], None

    links = d.findAll('a')
    infolist = []
    previousurl = None
    for link in links:
        href = link.get('href')
        if previousurl is None and href:
            anchortext = utils.get_tag_contents(link)
            if anchortext and re.search('Previous >>', anchortext):
                previousurl = urllib.basejoin(url, href)

        if href and re.match('judgements', href):
            # climb to the enclosing table row; stop at the document root
            node = link
            while node is not None and node.name != 'tr':
                node = node.parent
            if node is not None:
                metainfo = self.get_meta_info(node)
                metainfo['href'] = href
                infolist.append(metainfo)
                self.logger.debug('metainfo: %s' % metainfo)

    return infolist, previousurl

def get_meta_info(self, tr, dateobj):
    metainfo = {'date': utils.date_to_xml(dateobj)}

    for td in tr.findAll('td'):
        text = utils.get_tag_contents(td)
        if text:
            reobj = re.search(r'\s+vs\s+', text, re.IGNORECASE)
            if reobj:
                caseReobj = re.search(r'(?P<num>\d+)\s+of\s+(?P<year>\d+)', \
                                      text, re.IGNORECASE)
                if caseReobj and caseReobj.end() < reobj.start():
                    groupdict = caseReobj.groupdict()
                    metainfo['caseno'] = u'%s/%s' % (groupdict['num'], \
                                                     groupdict['year'])
                    petitioner = text[caseReobj.end():reobj.start()]
                else:
                    petitioner = text[:reobj.start()]

                if reobj.end() < len(text):
                    respondent = text[reobj.end():]
                    metainfo['respondent'] = respondent.strip()

                metainfo['petitioner'] = petitioner.strip()
                break
    return metainfo

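# Illustrative sketch (not part of the scraper): the named-group pattern used
# above, run on a made-up cell. The case number precedes ' vs ', so
# case.end() < vs.start() holds and the petitioner is the text in between.
import re

text = 'WP 1234 of 2008 Ram Kumar vs State of Kerala'
vs = re.search(r'\s+vs\s+', text, re.IGNORECASE)
case = re.search(r'(?P<num>\d+)\s+of\s+(?P<year>\d+)', text, re.IGNORECASE)
if vs and case and case.end() < vs.start():
    print('%s/%s' % (case.group('num'), case.group('year')))  # 1234/2008
    print(text[case.end():vs.start()].strip())                # Ram Kumar
    print(text[vs.end():].strip())                            # State of Kerala
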
def get_meta_info(self, tr):
    metainfo = {}

    tds = tr.findAll('td')
    for link in tr.findAll('a'):
        href = link.get('href')
        if href:
            metainfo['href'] = href
            break

    if 'href' not in metainfo:
        return {}

    i = 0
    for td in tds:
        value = utils.get_tag_contents(td)
        if value:
            if i == 0:
                metainfo[self.CASENO] = value
            elif i == 1:
                pet, res = utils.get_petitioner_respondent(value)
                if pet:
                    metainfo[self.PETITIONER] = pet
                if res:
                    metainfo[self.RESPONDENT] = res
            elif i == 2:
                dateobj = utils.datestr_to_obj(value)
                if dateobj:
                    metainfo[self.DATE] = dateobj
        i += 1
    return metainfo

def process_result_row(self, tr, metainfos, dateobj, order):
    metainfo = utils.MetaInfo()
    metainfo.set_gztype(self.gazette_type)
    metainfos.append(metainfo)
    metainfo.set_date(dateobj)

    for i, td in enumerate(tr.find_all('td')):
        if len(order) > i:
            col = order[i]
            txt = utils.get_tag_contents(td)
            if txt:
                txt = txt.strip()

            if col == 'subject':
                metainfo.set_subject(txt)
            elif col == 'gznum':
                reobj = re.search(r'\w+', txt)
                if reobj:
                    metainfo['gznum'] = txt[reobj.start():reobj.end()]
            elif col == 'notification_date':
                d = utils.parse_datestr(txt)
                if d:
                    metainfo[col] = d
            elif col in ['department', 'notification_num']:
                metainfo[col] = txt
            elif col == 'download':
                # note: searches the whole row, not just this cell
                inp = tr.find('input')
                if inp:
                    name = inp.get('name')
                    if name:
                        metainfo[col] = name

def get_debate_info(self, table):
    info = {}
    trs = table.findAll('tr')
    for tr in trs:
        tds = tr.findAll('td')
        if len(tds) == 2:
            hl = utils.get_tag_contents(tds[0])
            value = utils.get_tag_contents(tds[1])
            hltype = self.get_headline_type(hl)
            if hltype:
                info[hltype] = value
                if hltype == 'title':
                    href = self.get_link(tds[1])
                    if href:
                        info['href'] = href
    return info

def process_row(self, tr, order, dateobj):
    metainfo = utils.MetaInfo()

    for i, td in enumerate(tr.find_all('td')):
        txt = utils.get_tag_contents(td)
        if txt and order[i] in ['subject', 'department', 'gznum']:
            txt, n = re.subn(r'\s+', ' ', txt)
            metainfo[order[i]] = txt.strip()
        elif txt and order[i] == 'date':
            nums = re.split(r'[./-]+', txt)
            if len(nums) < 3:
                self.logger.warn( \
                    'Couldn\'t get date from %s for extraordinary gazette list', \
                    txt)
                continue
            nums = [re.subn(r'\s+', '', n)[0] for n in nums]
            nums = [n for n in nums if n]
            try:
                # build the date inside the try block: int() and
                # datetime.date() raise ValueError on malformed input, and
                # the filter above can leave fewer than three pieces
                d = datetime.date(int(nums[2]), int(nums[1]), int(nums[0]))
                metainfo.set_date(d)
            except (ValueError, IndexError):
                self.logger.warn('Could not parse date %s', txt)

    if metainfo.get_date() == dateobj:
        metainfo.set_gztype('Extraordinary')
        return metainfo
    return None

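# Illustrative sketch (not part of the scraper): the day-month-year parsing
# used above for gazette date cells. The input strings are made up.
import re
import datetime

def parse_ddmmyyyy(txt):
    nums = re.split(r'[./-]+', txt)
    nums = [n.strip() for n in nums if n.strip()]
    if len(nums) < 3:
        return None
    try:
        return datetime.date(int(nums[2]), int(nums[1]), int(nums[0]))
    except ValueError:
        return None

print(parse_ddmmyyyy('03-01-2016'))    # 2016-01-03
print(parse_ddmmyyyy('3.1.2016'))      # 2016-01-03
print(parse_ddmmyyyy('January 2016'))  # None
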
def process_judgment_page(self, relpath, url, dateobj):
    webpage = self.download_url(url, loadcookies=self.cookiefile.name)
    if not webpage:
        self.logger.warning(u'Could not download %s' % url)
        return None

    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.warning(u'Could not parse %s' % url)
        return None

    metainfo = self.get_meta_info(d, dateobj)

    for link in d.findAll('a'):
        href = link.get('href')
        title = utils.get_tag_contents(link)
        if (not href) or (not title):
            self.logger.warning(u'Could not process %s' % link)
            continue

        action = self.action_on_link(href, title)
        newurl = urllib.basejoin(url, href)
        if action == 'save':
            self.logger.info(u'Downloading %s' % newurl)
            return self.get_judgment(relpath, newurl, title, metainfo)

    return None

def result_page(self, relpath, url, dateobj, linkdict):
    newdls = []
    webpage = self.download_url(url, loadcookies=self.cookiefile.name)

    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error( \
            u'Could not parse html of the result page for date %s' % dateobj)
        return newdls

    for link in d.findAll('a'):
        href = link.get('href')
        title = utils.get_tag_contents(link)
        if (not href) or (not title) or href in linkdict:
            self.logger.warning(u'Could not process %s' % link)
            continue

        linkdict[href] = 1

        action = self.action_on_link(href, title)
        self.logger.info(u'Action %s on link %s title %s' % \
                         (action, href, title))
        newurl = urllib.basejoin(url, href)
        if action == 'judgmentlink':
            relurl = self.process_judgment_page(relpath, newurl, dateobj)
            if relurl:
                newdls.append(relurl)
            else:
                self.logger.warning(u'Judgment link not working %s' % newurl)
        elif action == 'recurse':
            newdls.extend(self.result_page(relpath, newurl, dateobj, \
                                           linkdict))

    return newdls

def find_field_order(self, tr):
    order = []
    for td in tr.find_all('td'):
        txt = utils.get_tag_contents(td)
        if txt and re.search('Department', txt):
            order.append('department')
        elif txt and re.search(r'Notification\s+Number', txt):
            order.append('notification_num')
        elif txt and re.search(r'Gazette\s+Number', txt):
            order.append('gznum')
        elif txt and re.search('Subject', txt):
            order.append('subject')
        elif txt and re.search('File', txt):
            order.append('download')
        elif txt and re.search(r'Gazette\s+Date', txt):
            order.append('gzdate')
        else:
            order.append('')

    for field in ['department', 'download', 'subject', 'gznum', \
                  'notification_num']:
        if field not in order:
            return None

    return order

def get_meta_info(self, tr):
    metainfo = {}

    tds = tr.findAll('td')
    i = 0
    lastcolumn = len(tds) - 1
    for td in tds:
        content = utils.get_tag_contents(td)
        if content:
            if i == 1:
                content = u' '.join(content.split())
                metainfo['caseno'] = content
            elif i == 2:
                petitioner, respondent = \
                    utils.get_petitioner_respondent(content)
                if petitioner:
                    metainfo['petitioner'] = petitioner
                else:
                    self.logger.info(u'Petitioner not found in %s' % content)
                if respondent:
                    metainfo['respondent'] = respondent
            elif i == lastcolumn:
                dateobj = utils.datestr_to_obj(content)
                if dateobj:
                    metainfo[self.DATE] = dateobj
                else:
                    self.logger.info(u'No date in %s' % (content))
        i += 1
    return metainfo

def process_result_row(self, tr, metainfos, dateobj, order):
    gznum = None

    for i, td in enumerate(tr.find_all('td')):
        if len(order) > i:
            col = order[i]
            txt = utils.get_tag_contents(td)
            if txt:
                txt = txt.strip()

            if col == 'gznum':
                gznum = txt
            elif col.startswith('partnum'):
                # the column name encodes the part number as 'partnum|<part>'
                h, partnum = col.split('|')

                metainfo = utils.MetaInfo()
                metainfos.append(metainfo)
                metainfo.set_date(dateobj)
                metainfo.set_gztype(self.gazette_type)
                if gznum:
                    metainfo['gznum'] = gznum
                metainfo['partnum'] = partnum

                inp = td.find('input')
                if inp:
                    name = inp.get('name')
                    if name:
                        metainfo['download'] = name

def get_judgment_info(self, tr):
    links = tr.findAll('a')
    judgedict = {}
    for link in links:
        href = link.get('href')
        if href and re.search('imgst.aspx', href):
            judgedict['href'] = urllib.basejoin(self.webformUrl, href)

    tds = tr.findAll('td')
    maxTxt = ''
    for td in tds:
        txt = utils.get_tag_contents(td)
        if not txt:
            continue
        txt = txt.strip()

        reobj = re.search(r'Coram\s*:', txt)
        if reobj and reobj.end() + 1 < len(txt):
            bench = txt[reobj.end() + 1:]
            judgedict['bench'] = bench
        else:
            reobj = re.search(r' vs\.? ', txt, re.IGNORECASE)
            if reobj:
                judgedict['title'] = txt
                judgedict['petitioner'] = txt[:reobj.start()]
                judgedict['respondent'] = txt[reobj.end():]
            elif len(maxTxt) < len(txt):
                # keep the longest plain cell as a fallback title
                maxTxt = txt

    if 'title' not in judgedict and maxTxt:
        judgedict['title'] = maxTxt

    return judgedict

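# Illustrative sketch (not part of the scraper): extracting the bench from a
# 'Coram:' cell, mirroring the logic above. The sample text is made up; the
# `+ 1` skips the space after the colon.
import re

txt = 'Coram: THE HONOURABLE MR. JUSTICE A. B. SHAH'
reobj = re.search(r'Coram\s*:', txt)
if reobj and reobj.end() + 1 < len(txt):
    print(txt[reobj.end() + 1:])   # THE HONOURABLE MR. JUSTICE A. B. SHAH
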
def process_row(self, tr, order, dateobj):
    metainfo = utils.MetaInfo()
    metainfo.set_date(dateobj)

    for i, td in enumerate(tr.find_all('td')):
        if len(order) > i:
            txt = utils.get_tag_contents(td)
            if txt:
                txt = txt.strip()

            if order[i] in ['gznum', 'department', 'notification_num', \
                            'subject']:
                metainfo[order[i]] = txt
            elif order[i] == 'gzdate':
                nums = re.findall(r'\d+', txt)
                if len(nums) == 3:
                    try:
                        d = datetime.date(int(nums[2]), int(nums[1]), \
                                          int(nums[0]))
                        metainfo['gzdate'] = d
                    except ValueError:
                        self.logger.warn('Unable to form date for %s', txt)
            elif order[i] == 'download':
                link = td.find('a')
                if link and link.get('href'):
                    metainfo['href'] = link.get('href')

    if 'href' in metainfo and 'gznum' in metainfo:
        return metainfo
    return None

def parse_metainfos(self, webpage, year, fromdate, todate):
    minfos = []
    nextpage = None

    d = utils.parse_webpage(webpage, self.parser)
    if not d:
        self.logger.warn('Unable to parse results page for year %d', year)
        # keep the same (list, nextpage) shape as the success path
        return minfos, nextpage

    for td in d.find_all('td'):
        link = td.find('a')
        if link is None:
            continue

        img = td.find('img')
        if img:
            title = img.get('title')
            if title == 'Next' and nextpage is None:
                nextpage = link
            continue

        metainfo = self.get_metainfo(link, td)
        if metainfo:
            dateobj = metainfo.get_date()
            if dateobj and fromdate <= dateobj <= todate:
                minfos.append(metainfo)

                paras = td.find_all('p')
                if len(paras) >= 2:
                    p = paras[1]
                    txt = utils.get_tag_contents(p)
                    reobj = re.search( \
                        r'Department:\s*(?P<dept>.+)\s+Order\s+Nos:\s*(,Othres\s*:)?(?P<ordernum>.*)', \
                        txt)
                    if reobj:
                        groupdict = reobj.groupdict()
                        ordernum = groupdict['ordernum'].strip()
                        metainfo['department'] = groupdict['dept'].strip()
                        if re.match(r'[\d+(,\s*)?]+$', ordernum):
                            metainfo['ordernum'] = ordernum

                if len(paras) >= 3:
                    p = paras[2]
                    txt = utils.get_tag_contents(p)
                    if txt:
                        metainfo.set_subject(txt)

    return minfos, nextpage

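# Illustrative sketch (not part of the scraper): the Department/Order Nos
# pattern used above, run on a made-up paragraph. 'Othres' is kept verbatim
# from the scraper's pattern.
import re

txt = 'Department: Revenue Order Nos: 12, 14'
reobj = re.search( \
    r'Department:\s*(?P<dept>.+)\s+Order\s+Nos:\s*(,Othres\s*:)?(?P<ordernum>.*)', \
    txt)
if reobj:
    print(reobj.group('dept').strip())      # Revenue
    print(reobj.group('ordernum').strip())  # 12, 14
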
def page_type(self, tr):
    text = utils.get_tag_contents(tr)
    if re.search(' vs ', text, re.IGNORECASE):
        return 'judgment'
    elif self.next_link(tr.findAll('a')):
        return 'nextlink'
    else:
        return 'unknown'

def download_oneday(self, relpath, dateobj):
    dateurl = urllib.basejoin(self.baseurl, '/hcjudge/date_output.php')
    postdata = [('d1', dateobj.day), ('m1', dateobj.month), \
                ('y1', dateobj.year), ('d2', dateobj.day), \
                ('m2', dateobj.month), ('y2', dateobj.year), \
                ('button', 'Submit')]

    webpage = self.download_url(dateurl, postdata=postdata)
    if not webpage:
        self.logger.warning(u'No webpage for %s date: %s' % \
                            (dateurl, dateobj))
        return []

    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'HTML parsing failed for date: %s' % dateobj)
        return []

    newdls = []
    for link in d.findAll('a'):
        href = link.get('href')
        title = utils.get_tag_contents(link)
        if (not href) or (not title):
            self.logger.warning(u'Could not process %s' % link)
            continue

        words = href.split('/')
        filename = words[-1]
        url = urllib.basejoin(dateurl, href)

        self.logger.info(u'link: %s title: %s' % (href, title))

        relurl = os.path.join(relpath, filename)
        filepath = os.path.join(self.rawdir, relurl)
        metapath = os.path.join(self.metadir, relurl)

        if not os.path.exists(filepath):
            webpage = self.download_url(url)
            if not webpage:
                self.logger.warning(u'No webpage %s' % url)
            else:
                utils.save_file(filepath, webpage)
                self.logger.info(u'Saved %s' % url)
                newdls.append(relurl)

        if os.path.exists(filepath) and \
                (self.updateMeta or not os.path.exists(metapath)):
            metainfo = self.get_meta_info(title, dateobj)
            if metainfo:
                utils.print_tag_file(metapath, metainfo)

    return newdls

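# Illustrative sketch (not part of the scraper): the download-once pattern in
# download_oneday, where a file is fetched only if it is not already on disk.
# The paths and the fetch callable are made up for the example.
import os

def fetch_once(url, filepath, fetch):
    # skip the network round trip when a previous run already saved the file
    if os.path.exists(filepath):
        return False
    data = fetch(url)
    if data is None:
        return False
    with open(filepath, 'wb') as f:
        f.write(data)
    return True

# fetch_once('http://example.com/j1.html', '/tmp/j1.html',
#            lambda u: b'<html/>')
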
def get_next_page(self, d, baseurl):
    nextPage = None
    for link in d.findAll('a'):
        value = utils.get_tag_contents(link)
        href = link.get('href')
        if href and value and re.search(r'\s*Next', value):
            nextPage = urllib.basejoin(baseurl, href)
    return nextPage

def get_judgment_info(self, tr):
    judgedict = {}

    link = tr.find('a')
    if link:
        title = utils.get_tag_contents(link)
        href = link.get('href')
        if title:
            judgedict['title'] = title
        if href:
            judgedict['href'] = href

    tds = tr.findAll('td')
    for td in tds:
        txt = utils.get_tag_contents(td)
        reobj = re.search(r'Coram\s*:', txt)
        if reobj and reobj.end() + 1 < len(txt):
            bench = txt[reobj.end() + 1:]
            judgedict['bench'] = bench

    return judgedict

def get_meta_info(self, tr):
    tds = tr.findAll('td')
    metainfo = {}

    link = tr.find('a')
    if link:
        href = link.get('href')
        if href:
            metainfo['href'] = href
        else:
            return metainfo

    valueList = []
    for td in tds:
        value = utils.get_tag_contents(td)
        valueList.append(value)

    i = 0
    for value in valueList:
        i += 1
        if value:
            value = value.strip()
            if (i == 2 or i == 3) and self.PETITIONER not in metainfo:
                pet, res = utils.get_petitioner_respondent(value)
                if pet:
                    metainfo[self.PETITIONER] = pet
                    metainfo[self.CASENO] = valueList[i - 1]
                if res:
                    metainfo[self.RESPONDENT] = res
            elif self.PETITIONER in metainfo:
                dateobj = utils.datestr_to_obj(value)
                if dateobj:
                    metainfo[self.DATE] = dateobj

    # try one more heuristic: the date is sometimes embedded in the href
    if self.DATE not in metainfo and 'href' in metainfo:
        dateobj = utils.datestr_to_obj(metainfo['href'])
        if dateobj:
            metainfo[self.DATE] = dateobj

    if self.DATE not in metainfo and self.PETITIONER not in metainfo:
        self.logger.info(u'No petitioner/date found: %s %s' % \
                         (metainfo, valueList))
    elif self.PETITIONER not in metainfo:
        self.logger.info(u'No petitioner found: %s %s' % \
                         (metainfo, valueList))
    elif self.DATE not in metainfo:
        self.logger.info(u'No date found: %s %s' % \
                         (metainfo, valueList))

    return metainfo

def get_meta_info(self, d, dateobj):
    metainfo = {'date': utils.date_to_xml(dateobj)}

    trs = d.findAll('tr')
    for tr in trs:
        tds = tr.findAll('td')
        i = 0
        tdtype = None
        # a label cell names the field; the value sits in the next cell
        for td in tds[:-1]:
            content = utils.get_tag_contents(td)
            if re.search('Case Number', content):
                tdtype = 'caseno'
                break
            if re.search('Judge', content):
                tdtype = 'author'
                break
            if re.search('Petitioner', content):
                tdtype = 'petitioner'
                break
            if re.search('Respondent', content):
                tdtype = 'respondent'
                break
            if re.search('Location', content):
                tdtype = 'location'
                break
            i += 1

        if tdtype and i + 1 < len(tds):
            metainfo[tdtype] = utils.get_tag_contents(tds[i + 1])

    return metainfo

def get_meta_info(self, d, dateobj): metainfo = {"date": utils.date_to_xml(dateobj)} trs = d.findAll("tr") for tr in trs: tds = tr.findAll("td") i = 0 tdtype = None for td in tds[:-1]: content = utils.get_tag_contents(td) if re.search("Case Number", content): tdtype = "caseno" break if re.search("Judge", content): tdtype = "author" break if re.search("Petitioner", content): tdtype = "petitioner" break if re.search("Respondent", content): tdtype = "respondent" break if re.search("Location", content): tdtype = "location" break i += 1 if tdtype and i + 1 < len(tds): content = utils.get_tag_contents(td) metainfo[tdtype] = utils.get_tag_contents(tds[i + 1]) return metainfo
def get_meta_info(self, tr, dateobj): metainfo = {"date": utils.date_to_xml(dateobj)} tds = tr.findAll("td") for td in tds: content = utils.get_tag_contents(td) reobj = re.search(" vs\.? ", content, re.IGNORECASE) if reobj: metainfo["title"] = content metainfo["petitioner"] = content[: reobj.start()] metainfo["respondent"] = content[reobj.end() :] reobj = re.search("justice ", content, re.IGNORECASE) if reobj: metainfo["author"] = content[reobj.end() :] return metainfo
def result_page(self, webpage, relpath, dateobj, linkdict):
    newdls = []
    if not webpage:
        return newdls

    courtParser = utils.parse_webpage(webpage)
    if not courtParser:
        self.logger.error( \
            u'Could not parse html of the result page for date %s' % dateobj)
        return newdls

    trs = courtParser.findAll('tr')
    for tr in trs:
        link = tr.find('a')
        if not link:
            continue

        title = utils.get_tag_contents(link)
        href = link.get('href')
        if (not title) or (not href):
            self.logger.warning(u'Could not process %s' % link)
            continue

        if href in linkdict:
            continue

        if not re.search('first|prev|next|last|acroread', title, \
                         re.IGNORECASE):
            linkdict[href] = 1
            dl = self.handle_link(relpath, href, title, tr, dateobj)
            if dl:
                newdls.append(dl)
        elif title == 'Next':
            self.logger.info(u'Following Next page %s' % href)
            newlink = urllib.basejoin(self.baseurl, href)
            webpage = self.download_url(newlink, \
                                        loadcookies=self.cookiefile.name)
            newdls.extend(self.result_page(webpage, relpath, dateobj, \
                                           linkdict))
        else:
            self.logger.info(u'No action for %s' % href)
    return newdls

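# Illustrative sketch (not part of the scraper): the linkdict argument above
# acts as a visited-set that keeps the recursive pagination from looping when
# a 'Next' link points back at a page already seen. A minimal model:
def crawl(page, pages, seen):
    results = []
    for item, nextpage in pages.get(page, []):
        if item in seen:
            continue
        seen[item] = 1
        results.append(item)
        if nextpage:
            results.extend(crawl(nextpage, pages, seen))
    return results

# two pages whose 'next' links point at each other; `seen` stops the cycle
pages = {
    'p1': [('doc1', 'p2')],
    'p2': [('doc2', 'p1'), ('doc1', None)],
}
print(crawl('p1', pages, {}))   # ['doc1', 'doc2']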