def get_meta_info(self, tr, dateobj):
    """Extract caseno/petitioner/respondent metadata from a result row.

    Scans the row's cells for an 'A vs B' title; stops at the first match.
    """
    metainfo = {'date': utils.date_to_xml(dateobj)}
    for cell in tr.findAll('td'):
        cell_text = utils.get_tag_contents(cell)
        if not cell_text:
            continue
        vs_match = re.search('\s+vs\s+', cell_text, re.IGNORECASE)
        if not vs_match:
            continue
        # Optional "NNN of YYYY" case number preceding the party names.
        case_match = re.search('(?P<num>\d+)\s+of\s+(?P<year>\d+)', cell_text, re.IGNORECASE)
        if case_match and case_match.end() < vs_match.start():
            parts = case_match.groupdict()
            metainfo['caseno'] = u'%s/%s' % (parts['num'], parts['year'])
            petitioner = cell_text[case_match.end():vs_match.start()]
        else:
            petitioner = cell_text[:vs_match.start()]
        if vs_match.end() < len(cell_text):
            metainfo['respondent'] = cell_text[vs_match.end():].strip()
        metainfo['petitioner'] = petitioner.strip()
        break
    return metainfo
def save_meta_tags(self, metapath, judgedict, dateobj):
    """Write judgment metadata (title, parties, bench, date) to a tag file.

    Fix: dict.has_key() is Python-2-only and was removed in Python 3;
    the `in` operator is equivalent on both versions.
    """
    tagdict = {}
    if 'title' in judgedict:
        title = judgedict['title']
        tagdict['title'] = title
        # Split "A vs B" titles into petitioner/respondent halves.
        reobj = re.search('( vs | vs\.)', title, re.IGNORECASE)
        if reobj:
            if reobj.start() > 1:
                petitioner = title[:reobj.start()]
                tagdict['petitioner'] = petitioner
            if reobj.end() + 1 < len(title):
                respondent = title[reobj.end() + 1:]
                tagdict['respondent'] = respondent
    if 'bench' in judgedict:
        # Bench is a comma-separated judge list; store as {'name': [...]}.
        bench = judgedict['bench'].split(',')
        if len(bench) > 0:
            benchdict = {'name': list(bench)}
            tagdict['bench'] = benchdict
    tagdict['date'] = utils.date_to_xml(dateobj)
    utils.print_tag_file(metapath, tagdict)
def parse_meta_info(self, tr, dateobj):
    """Collect bench, category and case number from the row's cells."""
    metainfo = {'date': utils.date_to_xml(dateobj)}
    for pos, cell in enumerate(tr.findAll('td')):
        text = utils.get_tag_contents(cell)
        if not text:
            continue
        if pos == 0:
            # First cell lists the judges; keep the part after 'JUSTICE '.
            judges = []
            for piece in utils.tag_contents_without_recurse(cell):
                found = re.search('JUSTICE ', piece)
                if found:
                    judges.append(piece[found.end():])
            if judges:
                metainfo['bench'] = {'name': judges}
        elif pos == 1:
            metainfo['category'] = text
        elif pos == 3:
            metainfo['caseno'] = text
    return metainfo
def get_meta_info(self, title, dateobj):
    """Use the title — truncated before any 'Dated' marker — as the caseno."""
    metainfo = {'date': utils.date_to_xml(dateobj)}
    marker = re.search('Dated', title)
    metainfo['caseno'] = title[:marker.start()] if marker else title
    return metainfo
def download_doc(self, baseurl, info, relpath):
    """Resolve the judgment URL, sanitise the filename and save the doc."""
    judgeurl = urllib.basejoin(baseurl, info['href'])
    # Last path segment with whitespace collapsed and unsafe chars dashed.
    name = info['href'].split('/')[-1]
    name = u' '.join(name.split())
    name = re.sub('/|&|\(|\)', '-', name)
    info['date'] = utils.date_to_xml(info['date'])
    return self.save_judgment(os.path.join(relpath, name), judgeurl, info)
def get_meta_info(self, link, tr, dateobj):
    """Case number from the link text; author from any 'JUSTICE ' cell."""
    metainfo = {'date': utils.date_to_xml(dateobj)}
    metainfo['caseno'] = utils.get_tag_contents(link)
    for cell in tr.findAll('td'):
        text = utils.get_tag_contents(cell)
        found = re.search('JUSTICE ', text)
        if found:
            # Keeps the last matching cell if several name a judge.
            metainfo['author'] = text[found.end():]
    return metainfo
def process_order_tr(self, ccin, relpath, dateobj, tr, fieldOrder):
    """Parse one order row and download the order it links to.

    fieldOrder maps logical column names ('view', 'date', optionally
    'bench' and 'judgment') to td indexes in the row.  Returns the
    relative URL of the saved order, or None on any parse failure.
    """
    tds = tr.findAll('td')
    viewIndex = fieldOrder['view']
    dateIndex = fieldOrder['date']
    # Guard against rows with fewer cells than the field map promises.
    if viewIndex >= len(tds) or dateIndex >= len(tds):
        self.logger.warning(u'Could not get date or view in tr: %s' % tr)
        return None
    viewTd = tds[viewIndex]
    dateTd = tds[dateIndex]
    datestr = utils.get_tag_contents(dateTd)
    if not datestr:
        self.logger.warning(u'Date: %s Could not get date in %s' % (dateobj, tr))
        return None
    subdateobj = utils.datestr_to_obj(datestr)
    if not subdateobj:
        self.logger.warning(u'Date: %s Could not get date in %s tr: %s' % (dateobj, datestr, tr))
        return None
    # Each row carries its own order date, which may differ from dateobj.
    subdateobj = subdateobj.date()
    metainfo = {'date':utils.date_to_xml(subdateobj), 'ccin': ccin}
    # store bench in metainfo
    if 'bench' in fieldOrder and fieldOrder['bench'] < len(tds):
        benchIndex = fieldOrder['bench']
        benchTd = tds[benchIndex]
        contents = utils.get_tag_contents(benchTd)
        if contents:
            names = []
            # One cell may name several judges; take text after each 'JUSTICE '.
            for reobj in re.finditer('JUSTICE ', contents):
                names.append(contents[reobj.end():])
            if names:
                metainfo['bench'] = {}
                metainfo['bench']['name'] = names
    # store isJudgment in metainfo
    if 'judgment' in fieldOrder and fieldOrder['judgment'] < len(tds):
        jTd = tds[fieldOrder['judgment']]
        contents = utils.get_tag_contents(jTd)
        if contents:
            metainfo['judgment'] = contents
    # The view cell's onclick handler encodes where the document lives.
    onclick = viewTd.get('onclick')
    if onclick:
        relurl = self.download_order(relpath, subdateobj, \
                                     metainfo, onclick)
        return relurl
    else:
        self.logger.warning(u'No onclick attribute in viewTd: %s' % viewTd)
        return None
def get_meta_info(self, tds, dateobj):
    """Pull case number and party names out of the first table cell."""
    metainfo = {'date': utils.date_to_xml(dateobj)}
    if len(tds) > 0 and len(tds[0].contents) >= 3:
        first = tds[0].contents
        metainfo['caseno'] = first[0].encode('ascii', 'ignore')
        title = first[2].encode('ascii', 'ignore')
        petitioner, respondent = utils.get_petitioner_respondent(title)
        if petitioner:
            metainfo['petitioner'] = petitioner
        if respondent:
            metainfo['respondent'] = respondent
    return metainfo
def get_meta_info(self, tr, dateobj):
    """Map fixed column positions onto caseno/petitioner/respondent."""
    metainfo = {'date': utils.date_to_xml(dateobj)}
    cells = tr.findAll('td')
    # (cell index, metadata key) for each column of interest.
    for index, key in ((2, 'caseno'), (3, 'petitioner'), (4, 'respondent')):
        if len(cells) > index:
            metainfo[key] = utils.get_tag_contents(cells[index])
    return metainfo
def get_meta_info(self, tds, dateobj):
    """Extract case number and party names from the leading cell."""
    metainfo = {"date": utils.date_to_xml(dateobj)}
    if len(tds) == 0 or len(tds[0].contents) < 3:
        # Row lacks the expected caseno/spacer/title layout.
        return metainfo
    cell = tds[0].contents
    metainfo["caseno"] = cell[0].encode("ascii", "ignore")
    title = cell[2].encode("ascii", "ignore")
    petitioner, respondent = utils.get_petitioner_respondent(title)
    if petitioner:
        metainfo["petitioner"] = petitioner
    if respondent:
        metainfo["respondent"] = respondent
    return metainfo
def get_meta_info(self, tds, dateobj):
    """Read caseno/title from tds[0] and split the title into parties."""
    metainfo = {'date': utils.date_to_xml(dateobj)}
    contents = tds[0].contents if len(tds) > 0 else []
    if len(contents) >= 3:
        metainfo['caseno'] = contents[0].encode('ascii', 'ignore')
        title = contents[2].encode('ascii', 'ignore')
        parties = utils.get_petitioner_respondent(title)
        for key, value in zip(('petitioner', 'respondent'), parties):
            if value:
                metainfo[key] = value
    return metainfo
def parse_result_page(self, posturl, webpage, dateobj):
    """Parse a search-result page into a list of judgment dicts.

    Each dict carries 'link' and 'metainfo'; a final dict with
    'next': True is appended when a pagination link exists.

    Fix: str.find() returns -1 when ':' is absent, and -1 is truthy,
    so `if colon:` always passed (mislabelling colon-less text as a
    title) while a colon at index 0 — a falsy 0 — was skipped.  Compare
    against -1 explicitly.
    """
    judgments = []
    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'Could not parse result page %s' % dateobj)
        return judgments
    # get judgments
    trs = d.findAll('tr')
    for tr in trs:
        judgment = {}
        metainfo = {'date': utils.date_to_xml(dateobj)}
        links = tr.findAll('a')
        for link in links:
            href = link.get('href')
            if href and re.search('WebShowJudgment.do', href):
                t = utils.get_tag_contents(link)
                colon = t.find(':')
                if colon != -1:
                    # Link text is "caselabel: title"; keep the title.
                    title = t[colon + 1:].strip()
                    metainfo['title'] = title
                    reobj = re.search(' vs\. ', title, re.IGNORECASE)
                    if reobj:
                        metainfo['petitioner'] = title[:reobj.start()]
                        metainfo['respondent'] = title[reobj.end():]
            if href and re.search('WebDownloadJudgmentDocument.do', href):
                judgment['link'] = urllib.basejoin(posturl, href)
        if judgment:
            judgment['metainfo'] = metainfo
            judgments.append(judgment)
    # next link
    links = d.findAll('a')
    for link in links:
        t = utils.get_tag_contents(link)
        if re.search('Next', t):
            href = link.get('href')
            if href:
                judgment = {'link': urllib.basejoin(posturl, href)}
                judgment['next'] = True
                judgments.append(judgment)
    return judgments
def parse_result_page(self, posturl, webpage, dateobj):
    """Turn a search-result page into judgment dicts plus a 'next' marker.

    Fix: `if colon:` tested the raw str.find() result — -1 (not found)
    is truthy and 0 (colon first) is falsy, inverting the intended
    check.  Use an explicit `!= -1` comparison.
    """
    judgments = []
    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'Could not parse result page %s' % dateobj)
        return judgments
    # get judgments
    trs = d.findAll('tr')
    for tr in trs:
        judgment = {}
        metainfo = {'date': utils.date_to_xml(dateobj)}
        links = tr.findAll('a')
        for link in links:
            href = link.get('href')
            if href and re.search('WebShowJudgment.do', href):
                t = utils.get_tag_contents(link)
                colon = t.find(':')
                if colon != -1:
                    # Text after the colon is the case title.
                    title = t[colon + 1:].strip()
                    metainfo['title'] = title
                    reobj = re.search(' vs\. ', title, re.IGNORECASE)
                    if reobj:
                        metainfo['petitioner'] = title[:reobj.start()]
                        metainfo['respondent'] = title[reobj.end():]
            if href and re.search('WebDownloadJudgmentDocument.do', href):
                judgment['link'] = urllib.basejoin(posturl, href)
        if judgment:
            judgment['metainfo'] = metainfo
            judgments.append(judgment)
    # next link
    links = d.findAll('a')
    for link in links:
        t = utils.get_tag_contents(link)
        if re.search('Next', t):
            href = link.get('href')
            if href:
                judgment = {'link': urllib.basejoin(posturl, href)}
                judgment['next'] = True
                judgments.append(judgment)
    return judgments
def get_meta_info(self, tr, dateobj):
    """Find the 'A vs B' title and the authoring judge among the cells."""
    metainfo = {'date': utils.date_to_xml(dateobj)}
    for cell in tr.findAll('td'):
        text = utils.get_tag_contents(cell)
        vs = re.search(' vs\.? ', text, re.IGNORECASE)
        if vs:
            metainfo['title'] = text
            metainfo['petitioner'] = text[:vs.start()]
            metainfo['respondent'] = text[vs.end():]
        judge = re.search('justice ', text, re.IGNORECASE)
        if judge:
            metainfo['author'] = text[judge.end():]
    return metainfo
def get_meta_info(self, tr, dateobj):
    """Scan each cell for an 'A vs B' title and a 'justice ...' author."""
    metainfo = {'date': utils.date_to_xml(dateobj)}
    cells = tr.findAll('td')
    for td in cells:
        text = utils.get_tag_contents(td)
        match = re.search(' vs\.? ', text, re.IGNORECASE)
        if match:
            metainfo.update({
                'title': text,
                'petitioner': text[:match.start()],
                'respondent': text[match.end():],
            })
        match = re.search('justice ', text, re.IGNORECASE)
        if match:
            metainfo['author'] = text[match.end():]
    return metainfo
def get_meta_info(self, tr, dateobj):
    """Derive title, parties and author from the row's cell texts."""
    metainfo = {"date": utils.date_to_xml(dateobj)}
    for column in tr.findAll("td"):
        body = utils.get_tag_contents(column)
        hit = re.search(" vs\.? ", body, re.IGNORECASE)
        if hit:
            metainfo["title"] = body
            metainfo["petitioner"] = body[:hit.start()]
            metainfo["respondent"] = body[hit.end():]
        hit = re.search("justice ", body, re.IGNORECASE)
        if hit:
            metainfo["author"] = body[hit.end():]
    return metainfo
def get_meta_info(self, tr, dateobj):
    """Extract parties from an 'A vs B' cell, or caseno from the 3rd cell.

    Fix: the Python-2-only string.strip(s, chars) function is replaced
    by the equivalent str.strip(chars) method, dropping the legacy
    `string` module dependency.
    """
    metainfo = {'date': utils.date_to_xml(dateobj)}
    i = 0
    for td in tr.findAll('td'):
        txt = utils.get_tag_contents(td)
        if txt:
            reobj = re.search(' vs ', txt, re.IGNORECASE)
            if reobj:
                petitioner = txt[:reobj.start()].strip(' \r\n-')
                respondent = txt[reobj.end():].strip(' \r\n-')
                if petitioner:
                    metainfo['petitioner'] = petitioner
                if respondent:
                    metainfo['respondent'] = respondent
            elif i == 2:
                # Third cell carries the case number when no 'vs' present.
                metainfo['caseno'] = txt
        i += 1
    return metainfo
def parse_meta_info(self, tr, dateobj):
    """Read caseno, parties and author from fixed cell positions."""
    metainfo = {"date": utils.date_to_xml(dateobj)}
    for position, td in enumerate(tr.findAll("td")):
        contents = utils.get_tag_contents(td)
        if position == 1:
            metainfo["caseno"] = contents
        elif position == 3:
            found = re.search(" vs\.? ", contents, re.IGNORECASE)
            if found:
                metainfo["petitioner"] = contents[:found.start()]
                metainfo["respondent"] = contents[found.end():]
        elif position == 4:
            found = re.search("JUSTICE ", contents)
            if found:
                metainfo["author"] = contents[found.end():]
    return metainfo
def parse_meta_info(self, tr, dateobj):
    """Column 1 -> caseno, column 3 -> parties, column 4 -> author."""
    metainfo = {'date': utils.date_to_xml(dateobj)}
    texts = [utils.get_tag_contents(td) for td in tr.findAll('td')]
    if len(texts) > 1:
        metainfo['caseno'] = texts[1]
    if len(texts) > 3:
        m = re.search(' vs\.? ', texts[3], re.IGNORECASE)
        if m:
            metainfo['petitioner'] = texts[3][:m.start()]
            metainfo['respondent'] = texts[3][m.end():]
    if len(texts) > 4:
        m = re.search('JUSTICE ', texts[4])
        if m:
            metainfo['author'] = texts[4][m.end():]
    return metainfo
def parse_meta_info(self, tr, dateobj):
    """Pick caseno (cell 1), parties (cell 3) and author (cell 4)."""
    metainfo = {'date': utils.date_to_xml(dateobj)}
    for idx, cell in enumerate(tr.findAll('td')):
        contents = utils.get_tag_contents(cell)
        if idx == 1:
            metainfo['caseno'] = contents
        elif idx == 3:
            split_at = re.search(' vs\.? ', contents, re.IGNORECASE)
            if split_at:
                metainfo['petitioner'] = contents[:split_at.start()]
                metainfo['respondent'] = contents[split_at.end():]
        elif idx == 4:
            author_at = re.search('JUSTICE ', contents)
            if author_at:
                metainfo['author'] = contents[author_at.end():]
    return metainfo
def get_meta_info(self, tr, dateobj):
    """Extract petitioner/respondent, or caseno from the third cell.

    Fix: replaced string.strip(txt, chars) — a function the `string`
    module only has under Python 2 — with the portable str.strip(chars)
    method call.
    """
    metainfo = {'date': utils.date_to_xml(dateobj)}
    tds = tr.findAll('td')
    i = 0
    for td in tds:
        txt = utils.get_tag_contents(td)
        if txt:
            reobj = re.search(' vs ', txt, re.IGNORECASE)
            if reobj:
                # Trim whitespace and decorative dashes around party names.
                petitioner = txt[:reobj.start()].strip(' \r\n-')
                respondent = txt[reobj.end():].strip(' \r\n-')
                if petitioner:
                    metainfo['petitioner'] = petitioner
                if respondent:
                    metainfo['respondent'] = respondent
            elif i == 2:
                metainfo['caseno'] = txt
        i += 1
    return metainfo
def get_meta_info(self, d, dateobj):
    """Build metadata from label/value cell pairs across the page's rows.

    Each row is scanned for a label cell ('Case Number', 'Judge', ...);
    the cell immediately following the label holds the value.

    Fix: removed a dead `content = utils.get_tag_contents(td)` statement
    before the value lookup — it re-read the stale loop variable and its
    result was never used.
    """
    metainfo = {'date': utils.date_to_xml(dateobj)}
    # (label pattern, metadata key), checked in the original order.
    labels = (('Case Number', 'caseno'), ('Judge', 'author'),
              ('Petitioner', 'petitioner'), ('Respondent', 'respondent'),
              ('Location', 'location'))
    for tr in d.findAll('tr'):
        tds = tr.findAll('td')
        i = 0
        tdtype = None
        for td in tds[:-1]:
            content = utils.get_tag_contents(td)
            for pattern, key in labels:
                if re.search(pattern, content):
                    tdtype = key
                    break
            if tdtype:
                break
            i += 1
        if tdtype and i + 1 < len(tds):
            metainfo[tdtype] = utils.get_tag_contents(tds[i + 1])
    return metainfo
def get_meta_info(self, tr, dateobj):
    """Caseno from cell 1; petitioner/respondent from cells 2 and 3.

    Fix: dict.has_key() is Python-2-only (removed in Python 3); use the
    `in` operator, which behaves identically on both.
    """
    metainfo = {"date": utils.date_to_xml(dateobj)}
    tds = tr.findAll("td")
    i = 0
    for td in tds:
        content = utils.get_tag_contents(td)
        if i == 1:
            metainfo["caseno"] = content
        elif i == 2:
            metainfo["petitioner"] = content
            metainfo["title"] = content
        elif i == 3:
            metainfo["respondent"] = content
            # Title becomes "petitioner respondent" when both cells exist.
            if "title" in metainfo:
                metainfo["title"] += " " + content
            else:
                metainfo["title"] = content
        i += 1
    return metainfo
def get_meta_info(self, tr, dateobj):
    """Positional extraction: caseno, petitioner, then respondent cells.

    Fix: replaced Python-2-only dict.has_key() with the `in` operator.
    """
    metainfo = {'date': utils.date_to_xml(dateobj)}
    tds = tr.findAll('td')
    i = 0
    for td in tds:
        content = utils.get_tag_contents(td)
        if i == 1:
            metainfo['caseno'] = content
        elif i == 2:
            metainfo['petitioner'] = content
            metainfo['title'] = content
        elif i == 3:
            metainfo['respondent'] = content
            # Append respondent to the title built from the petitioner cell.
            if 'title' in metainfo:
                metainfo['title'] += ' ' + content
            else:
                metainfo['title'] = content
        i += 1
    return metainfo
def get_meta_info(self, d, dateobj):
    """Collect metadata from label->value cell pairs in every row.

    Fix: dropped the unused `content = utils.get_tag_contents(td)`
    re-read of the stale loop variable before the value lookup; the
    assignment's result was never used.
    """
    metainfo = {"date": utils.date_to_xml(dateobj)}
    trs = d.findAll("tr")
    for tr in trs:
        tds = tr.findAll("td")
        i = 0
        tdtype = None
        for td in tds[:-1]:
            content = utils.get_tag_contents(td)
            if re.search("Case Number", content):
                tdtype = "caseno"
                break
            if re.search("Judge", content):
                tdtype = "author"
                break
            if re.search("Petitioner", content):
                tdtype = "petitioner"
                break
            if re.search("Respondent", content):
                tdtype = "respondent"
                break
            if re.search("Location", content):
                tdtype = "location"
                break
            i += 1
        if tdtype and i + 1 < len(tds):
            # The cell right after the label carries the value.
            metainfo[tdtype] = utils.get_tag_contents(tds[i + 1])
    return metainfo
def save_meta_tags(self, metapath, debatedict, dateobj):
    """Persist all debate metadata except the raw 'href' to the tag file."""
    tagdict = {'date': utils.date_to_xml(dateobj)}
    for key in debatedict.keys():
        if key == 'href':
            continue
        tagdict[key] = debatedict[key]
    utils.print_tag_file(metapath, tagdict)
def save_meta_tags(self, metapath, judgedict, dateobj):
    """Copy judgment metadata — minus the HREF field — into the tag file."""
    tagdict = {'date': utils.date_to_xml(dateobj)}
    for field in judgedict.keys():
        if field == self.HREF:
            continue
        tagdict[field] = judgedict[field]
    utils.print_tag_file(metapath, tagdict)