def get_meta_info(self, tr):
    """Collect case metadata from the cells of one table row.

    Cells are addressed by zero-based position among the row's ``td``
    elements:

    * cell 1 -- case number, stored under ``'caseno'`` with runs of
      whitespace collapsed to single spaces;
    * cell 2 -- case title, handed to ``utils.get_petitioner_respondent``
      and stored under ``'petitioner'`` / ``'respondent'`` when present;
    * last cell -- date text, parsed by ``utils.datestr_to_obj`` and stored
      under ``self.DATE``.

    Cells with no text are skipped.  A missing petitioner or an unparsable
    date is reported through ``self.logger.info``.

    Returns the (possibly empty) metadata dict.
    """
    cells = tr.findAll('td')
    last_index = len(cells) - 1
    metainfo = {}

    for index, cell in enumerate(cells):
        text = utils.get_tag_contents(cell)
        if not text:
            continue

        if index == 1:
            metainfo['caseno'] = u' '.join(text.split())
        elif index == 2:
            petitioner, respondent = utils.get_petitioner_respondent(text)
            if petitioner:
                metainfo['petitioner'] = petitioner
            else:
                self.logger.info(u'Petitioner not found in %s' % text)
            if respondent:
                metainfo['respondent'] = respondent
        elif index == last_index:
            # Cells 1 and 2 take precedence, so the date is only read from
            # the last cell when the row has at least four cells.
            dateobj = utils.datestr_to_obj(text)
            if dateobj:
                metainfo[self.DATE] = dateobj
            else:
                self.logger.info(u'No date in %s' % (text))

    return metainfo
def get_meta_info(self, tr):
    """Collect case metadata from one table row that links to a document.

    The first ``a`` element in the row with a non-empty ``href`` supplies
    ``metainfo['href']``.  Rows without such a link yield an empty dict.

    The row's ``td`` cells are then read by zero-based position:

    * cell 0 -- stored under ``self.CASENO``;
    * cell 1 -- split by ``utils.get_petitioner_respondent`` into
      ``self.PETITIONER`` / ``self.RESPONDENT`` (each stored only when
      non-empty);
    * cell 2 -- parsed by ``utils.datestr_to_obj`` and stored under
      ``self.DATE`` when parsing succeeds.

    Cells with no text are skipped.

    Returns the metadata dict, or ``{}`` when the row has no usable link.
    """
    metainfo = {}
    tds = tr.findAll('td')

    for link in tr.findAll('a'):
        href = link.get('href')
        if href:
            metainfo['href'] = href
            break

    # ``in`` replaces the deprecated dict.has_key(), which Python 3 removed.
    if 'href' not in metainfo:
        return {}

    for i, td in enumerate(tds):
        value = utils.get_tag_contents(td)
        if not value:
            continue

        if i == 0:
            metainfo[self.CASENO] = value
        elif i == 1:
            pet, res = utils.get_petitioner_respondent(value)
            if pet:
                metainfo[self.PETITIONER] = pet
            if res:
                metainfo[self.RESPONDENT] = res
        elif i == 2:
            dateobj = utils.datestr_to_obj(value)
            if dateobj:
                metainfo[self.DATE] = dateobj

    return metainfo
def get_meta_info(self, tr):
    """Heuristically extract case metadata from one table row.

    A row without any ``a`` element yields an empty dict.  Otherwise the
    link's ``href`` (when non-empty) is stored under ``'href'`` and the
    text of every ``td`` cell is scanned in order:

    * the first of the 2nd/3rd cells from which
      ``utils.get_petitioner_respondent`` extracts a petitioner fills
      ``self.PETITIONER``, ``self.CASENO`` and, when present,
      ``self.RESPONDENT``;
    * once a petitioner is known, every later non-empty cell is tried as a
      date via ``utils.datestr_to_obj``; the last one that parses wins.

    If no cell produced a date, the ``href`` itself is tried as a date
    string.  Whatever is still missing is reported via
    ``self.logger.info``.

    Returns the metadata dict.
    """
    tds = tr.findAll('td')
    metainfo = {}

    link = tr.find('a')
    if not link:
        return metainfo
    href = link.get('href')
    if href:
        metainfo['href'] = href

    values = [utils.get_tag_contents(td) for td in tds]

    # Positions are 1-based to match the "2nd or 3rd column" rule below.
    for i, value in enumerate(values, 1):
        if not value:
            continue
        value = value.strip()

        # ``in`` replaces the deprecated dict.has_key(), removed in Python 3.
        if i in (2, 3) and self.PETITIONER not in metainfo:
            pet, res = utils.get_petitioner_respondent(value)
            if pet:
                metainfo[self.PETITIONER] = pet
                # NOTE(review): with a 1-based ``i`` this is the raw
                # (unstripped) text of the *current* cell, not the cell
                # before it -- confirm that is the intended case number.
                metainfo[self.CASENO] = values[i - 1]
            if res:
                metainfo[self.RESPONDENT] = res
        elif self.PETITIONER in metainfo:
            dateobj = utils.datestr_to_obj(value)
            if dateobj:
                metainfo[self.DATE] = dateobj

    # One more heuristic: try to read the date out of the link target.
    if self.DATE not in metainfo and 'href' in metainfo:
        dateobj = utils.datestr_to_obj(metainfo['href'])
        if dateobj:
            metainfo[self.DATE] = dateobj

    has_date = self.DATE in metainfo
    has_petitioner = self.PETITIONER in metainfo
    if not has_date and not has_petitioner:
        self.logger.info(u'No petitioner/date found: %s %s' %
                         (metainfo, values))
    elif not has_petitioner:
        self.logger.info(u'No petitioner found: %s %s' %
                         (metainfo, values))
    elif not has_date:
        self.logger.info(u'No date found: %s %s' %
                         (metainfo, values))

    return metainfo
def get_meta_info(self, tr):
    """Heuristically extract case metadata from one table row.

    Rows that contain no ``a`` element produce an empty dict.  For other
    rows the link's ``href`` is recorded under ``'href'`` when non-empty,
    and the text of each ``td`` cell is examined in order:

    * while no petitioner is known, the 2nd and 3rd cells are passed to
      ``utils.get_petitioner_respondent``; a hit fills
      ``self.PETITIONER``, ``self.CASENO`` and, when present,
      ``self.RESPONDENT``;
    * after a petitioner is known, each further non-empty cell is offered
      to ``utils.datestr_to_obj``; the last successful parse is kept under
      ``self.DATE``.

    When the cells gave no date, the ``href`` string is tried as a date.
    Missing petitioner and/or date are reported via ``self.logger.info``.

    Returns the metadata dict.
    """
    tds = tr.findAll('td')
    metainfo = {}

    link = tr.find('a')
    if not link:
        return metainfo
    href = link.get('href')
    if href:
        metainfo['href'] = href

    values = [utils.get_tag_contents(td) for td in tds]

    # Positions are 1-based to match the "2nd or 3rd column" rule below.
    for i, value in enumerate(values, 1):
        if not value:
            continue
        value = value.strip()

        # ``in`` replaces the deprecated dict.has_key(), removed in Python 3.
        if i in (2, 3) and self.PETITIONER not in metainfo:
            pet, res = utils.get_petitioner_respondent(value)
            if pet:
                metainfo[self.PETITIONER] = pet
                # NOTE(review): with a 1-based ``i`` this is the raw
                # (unstripped) text of the *current* cell, not the cell
                # before it -- confirm that is the intended case number.
                metainfo[self.CASENO] = values[i - 1]
            if res:
                metainfo[self.RESPONDENT] = res
        elif self.PETITIONER in metainfo:
            dateobj = utils.datestr_to_obj(value)
            if dateobj:
                metainfo[self.DATE] = dateobj

    # One more heuristic: try to read the date out of the link target.
    if self.DATE not in metainfo and 'href' in metainfo:
        dateobj = utils.datestr_to_obj(metainfo['href'])
        if dateobj:
            metainfo[self.DATE] = dateobj

    has_date = self.DATE in metainfo
    has_petitioner = self.PETITIONER in metainfo
    if not has_date and not has_petitioner:
        self.logger.info(u'No petitioner/date found: %s %s' %
                         (metainfo, values))
    elif not has_petitioner:
        self.logger.info(u'No petitioner found: %s %s' %
                         (metainfo, values))
    elif not has_date:
        self.logger.info(u'No date found: %s %s' %
                         (metainfo, values))

    return metainfo
def get_meta_info(self, tds, dateobj):
    """Build the metadata dict for one case from its cells and date.

    ``"date"`` is always set, to ``utils.date_to_xml(dateobj)``.  When the
    first cell has at least three child nodes, child 0 becomes ``"caseno"``
    and child 2 is treated as the case title: both are encoded to ASCII
    (dropping characters that cannot be encoded) and the title is split by
    ``utils.get_petitioner_respondent`` into ``"petitioner"`` /
    ``"respondent"``, each stored only when non-empty.

    Returns the metadata dict.
    """
    metainfo = {"date": utils.date_to_xml(dateobj)}

    if len(tds) == 0:
        return metainfo

    children = tds[0].contents
    if len(children) < 3:
        return metainfo

    metainfo["caseno"] = children[0].encode("ascii", "ignore")
    title = children[2].encode("ascii", "ignore")

    petitioner, respondent = utils.get_petitioner_respondent(title)
    if petitioner:
        metainfo["petitioner"] = petitioner
    if respondent:
        metainfo["respondent"] = respondent

    return metainfo
def get_meta_info(self, tds, dateobj):
    """Assemble case metadata from a row's cells plus a known date.

    The result always carries ``'date'`` (``utils.date_to_xml(dateobj)``).
    If the first cell exists and holds three or more child nodes, its
    child 0 is stored as ``'caseno'`` and its child 2 is used as the title;
    both are ASCII-encoded with unencodable characters ignored.  The title
    is split with ``utils.get_petitioner_respondent`` and the non-empty
    parts are stored as ``'petitioner'`` and ``'respondent'``.

    Returns the metadata dict.
    """
    metainfo = {}
    metainfo['date'] = utils.date_to_xml(dateobj)

    first_cell = tds[0] if len(tds) > 0 else None
    if first_cell is not None and len(first_cell.contents) >= 3:
        metainfo['caseno'] = first_cell.contents[0].encode('ascii', 'ignore')
        title = first_cell.contents[2].encode('ascii', 'ignore')

        petitioner, respondent = utils.get_petitioner_respondent(title)
        parties = (('petitioner', petitioner), ('respondent', respondent))
        for key, name in parties:
            if name:
                metainfo[key] = name

    return metainfo
def get_meta_info(self, tds, dateobj):
    """Return metadata for a case described by ``tds`` and ``dateobj``.

    ``'date'`` is set unconditionally from ``utils.date_to_xml(dateobj)``.
    The remaining keys are filled only when ``tds`` is non-empty and its
    first cell has at least three child nodes: child 0 gives ``'caseno'``,
    child 2 gives the title (both ASCII-encoded, ignoring characters that
    cannot be encoded), and ``utils.get_petitioner_respondent(title)``
    gives ``'petitioner'`` / ``'respondent'``, stored when non-empty.
    """
    date_xml = utils.date_to_xml(dateobj)
    metainfo = {'date': date_xml}

    has_title_cell = len(tds) > 0 and len(tds[0].contents) >= 3
    if not has_title_cell:
        return metainfo

    caseno_node = tds[0].contents[0]
    title_node = tds[0].contents[2]

    metainfo['caseno'] = caseno_node.encode('ascii', 'ignore')
    petitioner, respondent = utils.get_petitioner_respondent(
        title_node.encode('ascii', 'ignore'))

    if petitioner:
        metainfo['petitioner'] = petitioner
    if respondent:
        metainfo['respondent'] = respondent
    return metainfo
def get_meta_info(self, tr, baseurl):
    """Extract case metadata from a table row, one record per link.

    The row's ``td`` cells are read by 1-based position:

    * cell 1 -- stored under ``self.CASENO``;
    * cell 2 -- split by ``utils.get_petitioner_respondent``; when no
      petitioner is extracted the whole cell text is stored under
      ``self.PETITIONER`` instead; ``self.RESPONDENT`` is stored when
      non-empty;
    * cells 3 and 4 -- each tried as a date via ``utils.datestr_to_obj``;
      the last one that parses is stored under ``self.DATE``.

    Cells with no text are skipped.  A missing date is reported through
    ``self.logger.info``.

    If any metadata was found, every ``a`` element in the row with a
    non-empty ``href`` produces its own copy of the metadata, extended with
    ``'href'`` and an absolute ``'url'`` resolved against ``baseurl``.

    Returns a list of metadata dicts (empty when nothing was extracted or
    the row has no usable links).
    """
    metainfo = {}

    for i, td in enumerate(tr.findAll('td'), 1):
        value = utils.get_tag_contents(td)
        if not value:
            continue

        if i == 1:
            metainfo[self.CASENO] = value
        elif i == 2:
            pet, res = utils.get_petitioner_respondent(value)
            # Fall back to the raw cell text when no petitioner is found.
            metainfo[self.PETITIONER] = pet if pet else value
            if res:
                metainfo[self.RESPONDENT] = res
        elif i in (3, 4):
            dateobj = utils.datestr_to_obj(value)
            if dateobj:
                metainfo[self.DATE] = dateobj

    # ``in`` replaces the deprecated dict.has_key(), removed in Python 3.
    if self.DATE not in metainfo:
        self.logger.info(u'No date found %s' % metainfo)

    ms = []
    if metainfo:
        self.logger.debug(u'metainfo: %s' % metainfo)
        for link in tr.findAll('a'):
            href = link.get('href')
            if href:
                m = metainfo.copy()
                m['href'] = href
                # NOTE: urllib.basejoin is the Python 2 spelling of urljoin.
                m['url'] = urllib.basejoin(baseurl, href)
                ms.append(m)

    return ms