Example #1
0
    def get_meta_info(self, tr):
        """Collect case metadata from the cells of one table row.

        Only cells for which ``utils.get_tag_contents`` returns a truthy
        value are examined, and the running position advances only past
        such cells.  Position 1 supplies the case number (runs of
        whitespace collapsed to single spaces), position 2 the
        petitioner/respondent pair, and the position equal to
        ``len(tds) - 1`` the date.

        Returns a dict that may hold 'caseno', 'petitioner', 'respondent'
        and ``self.DATE``; a key is left out when its value could not be
        extracted.
        """
        cells = tr.findAll('td')
        date_position = len(cells) - 1
        info = {}

        position = 0
        for cell in cells:
            text = utils.get_tag_contents(cell)
            if not text:
                # Empty cells are skipped without advancing the position.
                continue

            if position == 1:
                info['caseno'] = u' '.join(text.split())
            elif position == 2:
                parties = utils.get_petitioner_respondent(text)
                petitioner, respondent = parties
                if not petitioner:
                    self.logger.info(u'Petitioner not found in %s' %
                                     text)
                else:
                    info['petitioner'] = petitioner
                if respondent:
                    info['respondent'] = respondent
            elif position == date_position:
                parsed = utils.datestr_to_obj(text)
                if not parsed:
                    self.logger.info(u'No date in %s' % (text))
                else:
                    info[self.DATE] = parsed
            position += 1

        return info
Example #2
0
 def get_meta_info(self, tr):
     """Extract link and case metadata from one table row.

     The first anchor in ``tr`` with a truthy ``href`` is recorded under
     'href'; when the row has no such anchor an empty dict is returned.
     Otherwise the non-empty cells are read in order: the first gives
     ``self.CASENO``, the second the petitioner/respondent pair and the
     third the date.  A key is left out when its value could not be
     extracted.
     """
     metainfo = {}
     tds = tr.findAll('td')

     for link in tr.findAll('a'):
         href = link.get('href')
         if href:
             metainfo['href'] = href
             break
     # ``in`` replaces the deprecated dict.has_key(), which no longer
     # exists on Python 3 dicts.
     if 'href' not in metainfo:
         return {}

     # ``i`` counts only cells with content, so empty cells do not shift
     # the positions tested below.
     i = 0
     for td in tds:
         value = utils.get_tag_contents(td)
         if not value:
             continue
         if i == 0:
             metainfo[self.CASENO] = value
         elif i == 1:
             pet, res = utils.get_petitioner_respondent(value)
             if pet:
                 metainfo[self.PETITIONER] = pet
             if res:
                 metainfo[self.RESPONDENT] = res
         elif i == 2:
             dateobj = utils.datestr_to_obj(value)
             if dateobj:
                 metainfo[self.DATE] = dateobj
         i += 1
     return metainfo
    def get_meta_info(self, tr):
        """Extract link and case metadata from one table row.

        The first anchor in ``tr`` with a truthy ``href`` is recorded under
        'href'; when the row has no such anchor an empty dict is returned.
        Otherwise the non-empty cells are read in order: the first gives
        ``self.CASENO``, the second the petitioner/respondent pair and the
        third the date.  A key is left out when its value could not be
        extracted.
        """
        metainfo = {}
        tds = tr.findAll('td')

        for link in tr.findAll('a'):
            href = link.get('href')
            if href:
                metainfo['href'] = href
                break
        # ``in`` replaces the deprecated dict.has_key(), which no longer
        # exists on Python 3 dicts.
        if 'href' not in metainfo:
            return {}

        # ``i`` counts only cells with content, so empty cells do not shift
        # the positions tested below.
        i = 0
        for td in tds:
            value = utils.get_tag_contents(td)
            if not value:
                continue
            if i == 0:
                metainfo[self.CASENO] = value
            elif i == 1:
                pet, res = utils.get_petitioner_respondent(value)
                if pet:
                    metainfo[self.PETITIONER] = pet
                if res:
                    metainfo[self.RESPONDENT] = res
            elif i == 2:
                dateobj = utils.datestr_to_obj(value)
                if dateobj:
                    metainfo[self.DATE] = dateobj
            i += 1
        return metainfo
Example #4
0
    def get_meta_info(self, tr):
        """Build a metadata dict from the ``td`` cells of a table row.

        Cells whose ``utils.get_tag_contents`` result is falsy are ignored
        and do not count towards the position.  Among the remaining cells,
        position 1 is stored as 'caseno' after collapsing whitespace,
        position 2 is split into 'petitioner' and 'respondent', and the
        position equal to ``len(tds) - 1`` is parsed into ``self.DATE``.

        Missing petitioners and unparseable dates are reported through
        ``self.logger.info``; the corresponding keys are simply omitted
        from the returned dict.
        """
        row_cells = tr.findAll('td')
        last_index = len(row_cells) - 1
        result = {}

        seen = 0
        for row_cell in row_cells:
            content = utils.get_tag_contents(row_cell)
            if not content:
                # Nothing to read here; the position counter stays put.
                continue

            if seen == 1:
                result['caseno'] = u' '.join(content.split())
            elif seen == 2:
                pair = utils.get_petitioner_respondent(content)
                petitioner, respondent = pair
                if not petitioner:
                    self.logger.info(u'Petitioner not found in %s' % content)
                else:
                    result['petitioner'] = petitioner
                if respondent:
                    result['respondent'] = respondent
            elif seen == last_index:
                when = utils.datestr_to_obj(content)
                if not when:
                    self.logger.info(u'No date in %s' % (content))
                else:
                    result[self.DATE] = when
            seen += 1

        return result
Example #5
0
    def get_meta_info(self, tr):
        """Extract case metadata from a table row that contains a link.

        Returns an empty dict when ``tr`` has no anchor.  Otherwise the
        anchor's ``href`` (if truthy) is stored under 'href' and the cells
        are scanned with a 1-based index: the second or third cell is
        tried as the petitioner/respondent title until a petitioner has
        been found, and every non-empty cell visited after that is tried
        as a date.  If no cell produced a date, the ``href`` string itself
        is tried as a date.  Missing petitioner and/or date are reported
        through ``self.logger.info``.
        """
        tds = tr.findAll('td')
        metainfo = {}
        link = tr.find('a')
        if not link:
            return metainfo

        href = link.get('href')
        if href:
            metainfo['href'] = href

        valueList = [utils.get_tag_contents(td) for td in tds]

        for i, value in enumerate(valueList, 1):
            if not value:
                continue
            value = value.strip()
            # ``in`` replaces the deprecated dict.has_key(), which no
            # longer exists on Python 3 dicts.
            if i in (2, 3) and self.PETITIONER not in metainfo:
                pet, res = utils.get_petitioner_respondent(value)
                if pet:
                    metainfo[self.PETITIONER] = pet
                    # NOTE(review): i is 1-based, so valueList[i - 1] is
                    # the current cell (unstripped), not the preceding
                    # one - confirm this is the intended case number.
                    metainfo[self.CASENO] = valueList[i - 1]
                if res:
                    metainfo[self.RESPONDENT] = res
            elif self.PETITIONER in metainfo:
                dateobj = utils.datestr_to_obj(value)
                if dateobj:
                    metainfo[self.DATE] = dateobj

        # Last resort: look for a date inside the link target itself.
        if self.DATE not in metainfo and 'href' in metainfo:
            dateobj = utils.datestr_to_obj(metainfo['href'])
            if dateobj:
                metainfo[self.DATE] = dateobj

        has_date = self.DATE in metainfo
        has_petitioner = self.PETITIONER in metainfo
        if not has_date and not has_petitioner:
            self.logger.info(u'No petitioner/date found: %s %s' %
                             (metainfo, valueList))
        elif not has_petitioner:
            self.logger.info(u'No petitioner found: %s %s' %
                             (metainfo, valueList))
        elif not has_date:
            self.logger.info(u'No date found: %s %s' %
                             (metainfo, valueList))

        return metainfo
Example #6
0
    def get_meta_info(self, tr):
        """Extract case metadata from a table row that contains a link.

        Returns an empty dict when ``tr`` has no anchor.  Otherwise the
        anchor's ``href`` (if truthy) is stored under 'href' and the cells
        are scanned with a 1-based index: the second or third cell is
        tried as the petitioner/respondent title until a petitioner has
        been found, and every non-empty cell visited after that is tried
        as a date.  If no cell produced a date, the ``href`` string itself
        is tried as a date.  Missing petitioner and/or date are reported
        through ``self.logger.info``.
        """
        tds = tr.findAll('td')
        metainfo = {}
        link = tr.find('a')
        if not link:
            return metainfo

        href = link.get('href')
        if href:
            metainfo['href'] = href

        valueList = [utils.get_tag_contents(td) for td in tds]

        for i, value in enumerate(valueList, 1):
            if not value:
                continue
            value = value.strip()
            # ``in`` replaces the deprecated dict.has_key(), which no
            # longer exists on Python 3 dicts.
            if i in (2, 3) and self.PETITIONER not in metainfo:
                pet, res = utils.get_petitioner_respondent(value)
                if pet:
                    metainfo[self.PETITIONER] = pet
                    # NOTE(review): i is 1-based, so valueList[i - 1] is
                    # the current cell (unstripped), not the preceding
                    # one - confirm this is the intended case number.
                    metainfo[self.CASENO] = valueList[i - 1]
                if res:
                    metainfo[self.RESPONDENT] = res
            elif self.PETITIONER in metainfo:
                dateobj = utils.datestr_to_obj(value)
                if dateobj:
                    metainfo[self.DATE] = dateobj

        # Last resort: look for a date inside the link target itself.
        if self.DATE not in metainfo and 'href' in metainfo:
            dateobj = utils.datestr_to_obj(metainfo['href'])
            if dateobj:
                metainfo[self.DATE] = dateobj

        has_date = self.DATE in metainfo
        has_petitioner = self.PETITIONER in metainfo
        if not has_date and not has_petitioner:
            self.logger.info(u'No petitioner/date found: %s %s' %
                             (metainfo, valueList))
        elif not has_petitioner:
            self.logger.info(u'No petitioner found: %s %s' %
                             (metainfo, valueList))
        elif not has_date:
            self.logger.info(u'No date found: %s %s' %
                             (metainfo, valueList))

        return metainfo
Example #7
0
    def get_meta_info(self, tds, dateobj):
        """Build a metadata dict from the first cell of a row and a date.

        The result always carries 'date', set to
        ``utils.date_to_xml(dateobj)``.  When ``tds`` is non-empty and its
        first cell has at least three content nodes, the first node
        becomes 'caseno' and the third is split into 'petitioner' and
        'respondent'; both nodes are ASCII-encoded with unencodable
        characters dropped.
        """
        info = {"date": utils.date_to_xml(dateobj)}

        # Nothing more to extract without a first cell of three nodes.
        if len(tds) == 0 or len(tds[0].contents) < 3:
            return info

        nodes = tds[0].contents
        info["caseno"] = nodes[0].encode("ascii", "ignore")

        parties = utils.get_petitioner_respondent(
            nodes[2].encode("ascii", "ignore"))
        petitioner, respondent = parties
        if petitioner:
            info["petitioner"] = petitioner
        if respondent:
            info["respondent"] = respondent

        return info
Example #8
0
    def get_meta_info(self, tds, dateobj):
        """Assemble row metadata from ``tds[0]`` plus the supplied date.

        'date' is always present and holds ``utils.date_to_xml(dateobj)``.
        If ``tds`` has a first cell with three or more content nodes, node
        0 is stored as 'caseno' and node 2 is passed to
        ``utils.get_petitioner_respondent`` to fill 'petitioner' and
        'respondent'.  Both nodes are encoded to ASCII, ignoring
        characters that cannot be encoded.
        """
        result = {'date': utils.date_to_xml(dateobj)}

        # Bail out early when the first cell lacks the expected nodes.
        if len(tds) == 0 or len(tds[0].contents) < 3:
            return result

        first_cell = tds[0].contents
        result['caseno'] = first_cell[0].encode('ascii', 'ignore')

        heading = first_cell[2].encode('ascii', 'ignore')
        names = utils.get_petitioner_respondent(heading)
        petitioner, respondent = names
        if petitioner:
            result['petitioner'] = petitioner
        if respondent:
            result['respondent'] = respondent

        return result
Example #9
0
    def get_meta_info(self, tds, dateobj):
        """Return metadata for one row given its cells and its date.

        The returned dict always contains 'date'
        (``utils.date_to_xml(dateobj)``).  'caseno', 'petitioner' and
        'respondent' are added only when the first cell of ``tds`` has at
        least three content nodes: the first node is the case number and
        the third is the title handed to
        ``utils.get_petitioner_respondent``.  Both are ASCII-encoded with
        the 'ignore' error handler.
        """
        collected = {'date': utils.date_to_xml(dateobj)}

        # Only the date is available when the first cell is too short.
        if len(tds) == 0 or len(tds[0].contents) < 3:
            return collected

        parts = tds[0].contents
        collected['caseno'] = parts[0].encode('ascii', 'ignore')

        caption = parts[2].encode('ascii', 'ignore')
        sides = utils.get_petitioner_respondent(caption)
        petitioner, respondent = sides
        if petitioner:
            collected['petitioner'] = petitioner
        if respondent:
            collected['respondent'] = respondent

        return collected
Example #10
0
    def get_meta_info(self, tr, baseurl):
        """Return one metadata dict per link found in a table row.

        Cells are numbered from 1.  Cell 1 gives ``self.CASENO``, cell 2
        the petitioner/respondent pair (the whole cell text is used as
        the petitioner when none could be split out), and cells 3 and 4
        are each tried as the date.  Cells without content are skipped
        but still counted.

        For every anchor in ``tr`` with a truthy ``href`` a copy of the
        collected metadata is produced with 'href' and 'url' added, where
        'url' is ``href`` joined onto ``baseurl``.  The list is empty when
        no metadata was collected or the row has no such anchor.
        """
        metainfo = {}
        for i, td in enumerate(tr.findAll('td'), 1):
            value = utils.get_tag_contents(td)
            if not value:
                continue
            if i == 1:
                metainfo[self.CASENO] = value
            elif i == 2:
                pet, res = utils.get_petitioner_respondent(value)
                # Fall back to the raw cell text when no petitioner
                # could be separated out.
                metainfo[self.PETITIONER] = pet or value
                if res:
                    metainfo[self.RESPONDENT] = res
            elif i in (3, 4):
                dateobj = utils.datestr_to_obj(value)
                if dateobj:
                    metainfo[self.DATE] = dateobj

        # ``in`` replaces the deprecated dict.has_key(), which no longer
        # exists on Python 3 dicts.
        if self.DATE not in metainfo:
            self.logger.info(u'No date found %s' % metainfo)

        ms = []
        if metainfo:
            self.logger.debug(u'metainfo: %s' % metainfo)
            for link in tr.findAll('a'):
                href = link.get('href')
                if href:
                    m = metainfo.copy()
                    m['href'] = href
                    m['url'] = urllib.basejoin(baseurl, href)
                    ms.append(m)
        return ms
Example #11
0
    def get_meta_info(self, tr, baseurl):
        """Return one metadata dict per link found in a table row.

        Cells are numbered from 1.  Cell 1 gives ``self.CASENO``, cell 2
        the petitioner/respondent pair (the whole cell text is used as
        the petitioner when none could be split out), and cells 3 and 4
        are each tried as the date.  Cells without content are skipped
        but still counted.

        For every anchor in ``tr`` with a truthy ``href`` a copy of the
        collected metadata is produced with 'href' and 'url' added, where
        'url' is ``href`` joined onto ``baseurl``.  The list is empty when
        no metadata was collected or the row has no such anchor.
        """
        metainfo = {}
        for i, td in enumerate(tr.findAll('td'), 1):
            value = utils.get_tag_contents(td)
            if not value:
                continue
            if i == 1:
                metainfo[self.CASENO] = value
            elif i == 2:
                pet, res = utils.get_petitioner_respondent(value)
                # Fall back to the raw cell text when no petitioner
                # could be separated out.
                metainfo[self.PETITIONER] = pet or value
                if res:
                    metainfo[self.RESPONDENT] = res
            elif i in (3, 4):
                dateobj = utils.datestr_to_obj(value)
                if dateobj:
                    metainfo[self.DATE] = dateobj

        # ``in`` replaces the deprecated dict.has_key(), which no longer
        # exists on Python 3 dicts.
        if self.DATE not in metainfo:
            self.logger.info(u'No date found %s' % metainfo)

        ms = []
        if metainfo:
            self.logger.debug(u'metainfo: %s' % metainfo)
            for link in tr.findAll('a'):
                href = link.get('href')
                if href:
                    m = metainfo.copy()
                    m['href'] = href
                    m['url'] = urllib.basejoin(baseurl, href)
                    ms.append(m)
        return ms