Example #1
0
    def get_meta_info(self, tr, dateobj):
        """Extract case metadata from the cells of a result-table row.

        The first ``td`` whose text contains ``vs`` surrounded by whitespace
        (case-insensitive) is split into petitioner and respondent. When a
        ``<num> of <year>`` pattern ends before that ``vs``, it is stored as
        ``caseno`` in ``num/year`` form and the petitioner starts after it.

        Args:
            tr: row element providing ``findAll('td')``.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date`` and, when found, ``caseno``, ``petitioner``
            and ``respondent``.
        """
        metainfo = {'date': utils.date_to_xml(dateobj)}

        for td in tr.findAll('td'):
            text = utils.get_tag_contents(td)
            if not text:
                continue

            # Raw strings keep the regex escapes out of Python's own
            # string-escape processing.
            vs_match = re.search(r'\s+vs\s+', text, re.IGNORECASE)
            if not vs_match:
                continue

            case_match = re.search(r'(?P<num>\d+)\s+of\s+(?P<year>\d+)',
                                   text, re.IGNORECASE)
            # Only a case number that ends before the "vs" is used.
            if case_match and case_match.end() < vs_match.start():
                metainfo['caseno'] = u'%s/%s' % (case_match.group('num'),
                                                 case_match.group('year'))
                petitioner = text[case_match.end():vs_match.start()]
            else:
                petitioner = text[:vs_match.start()]

            if vs_match.end() < len(text):
                metainfo['respondent'] = text[vs_match.end():].strip()

            metainfo['petitioner'] = petitioner.strip()
            # Only the first matching cell is considered.
            break

        return metainfo
Example #2
0
    def save_meta_tags(self, metapath, judgedict, dateobj):
        """Build the tag dictionary for a judgment and write it to metapath.

        Args:
            metapath: destination passed to ``utils.print_tag_file``.
            judgedict: mapping that may contain ``title`` and ``bench``
                (``bench`` is split on commas).
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` tag.

        The title is copied as-is and, when it contains `` vs `` or `` vs.``
        (case-insensitive), split into ``petitioner`` and ``respondent``.
        """
        tagdict = {}

        if 'title' in judgedict:
            title = judgedict['title']
            tagdict['title'] = title

            reobj = re.search(r'( vs | vs\.)', title, re.IGNORECASE)
            if reobj:
                # NOTE(review): this guard skips a one-character petitioner;
                # kept from the original -- confirm whether that is intended.
                if reobj.start() > 1:
                    tagdict['petitioner'] = title[:reobj.start()]

                # Slice right after the separator and drop leading whitespace
                # instead of skipping one extra character: ' vs ' already
                # consumes the space, so the extra skip used to cut off the
                # respondent's first letter.
                respondent = title[reobj.end():].lstrip()
                if respondent:
                    tagdict['respondent'] = respondent

        if 'bench' in judgedict:
            # str.split always returns at least one item, so no length check.
            tagdict['bench'] = {'name': judgedict['bench'].split(',')}

        tagdict['date'] = utils.date_to_xml(dateobj)

        utils.print_tag_file(metapath, tagdict)
Example #3
0
    def save_meta_tags(self, metapath, judgedict, dateobj):
        """Assemble judgment tags and hand them to ``utils.print_tag_file``.

        Args:
            metapath: destination passed to ``utils.print_tag_file``.
            judgedict: mapping that may contain ``title`` and ``bench``
                (``bench`` is split on commas).
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` tag.

        When the title contains `` vs `` or `` vs.`` (case-insensitive) the
        text on either side is stored as ``petitioner`` / ``respondent``.
        """
        tagdict = {}

        if 'title' in judgedict:
            title = judgedict['title']
            tagdict['title'] = title

            separator = re.search(r'( vs | vs\.)', title, re.IGNORECASE)
            if separator:
                # NOTE(review): a petitioner of a single character is skipped
                # by this guard (unchanged from the original) -- confirm.
                if separator.start() > 1:
                    tagdict['petitioner'] = title[:separator.start()]

                # The old code sliced at end() + 1, which is right for
                # ' vs.' followed by a space but swallowed the first letter
                # of the respondent after ' vs '. Slicing at end() and
                # left-stripping handles both separators.
                respondent = title[separator.end():].lstrip()
                if respondent:
                    tagdict['respondent'] = respondent

        if 'bench' in judgedict:
            # split() never returns an empty list, so the list is used directly.
            tagdict['bench'] = {'name': judgedict['bench'].split(',')}

        tagdict['date'] = utils.date_to_xml(dateobj)

        utils.print_tag_file(metapath, tagdict)
Example #4
0
    def parse_meta_info(self, tr, dateobj):
        """Collect bench, category and case number from a table row.

        Only cells with non-empty text are counted. Among those, the first
        supplies judge names (the text following ``JUSTICE `` in each piece
        returned by ``utils.tag_contents_without_recurse``), the second the
        category and the fourth the case number.

        Args:
            tr: row element providing ``findAll('td')``.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date`` and, when found, ``bench``, ``category``
            and ``caseno``.
        """
        metainfo = {'date': utils.date_to_xml(dateobj)}

        col = 0  # index among non-empty cells only
        for cell in tr.findAll('td'):
            text = utils.get_tag_contents(cell)
            if not text:
                continue

            if col == 0:
                judges = []
                for piece in utils.tag_contents_without_recurse(cell):
                    found = re.search('JUSTICE ', piece)
                    if found:
                        judges.append(piece[found.end():])

                if judges:
                    metainfo['bench'] = {'name': judges}
            elif col == 1:
                metainfo['category'] = text
            elif col == 3:
                metainfo['caseno'] = text

            col += 1

        return metainfo
Example #5
0
    def get_meta_info(self, tr, dateobj):
        """Pull case number, petitioner and respondent out of a table row.

        The first cell whose text contains ``vs`` between whitespace
        (case-insensitive) is used. Text after the ``vs`` becomes the
        respondent. Text before it becomes the petitioner, starting after a
        ``<num> of <year>`` pattern if one ends before the ``vs``; that
        pattern is stored as ``caseno`` in ``num/year`` form.

        Args:
            tr: row element providing ``findAll('td')``.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date`` and, when found, ``caseno``, ``petitioner``
            and ``respondent``.
        """
        metainfo = {'date': utils.date_to_xml(dateobj)}

        for td in tr.findAll('td'):
            text = utils.get_tag_contents(td)
            if not text:
                continue

            # Patterns are raw strings so that \s and \d reach the regex
            # engine without relying on unrecognised string escapes.
            reobj = re.search(r'\s+vs\s+', text, re.IGNORECASE)
            if reobj is None:
                continue

            caseReobj = re.search(r'(?P<num>\d+)\s+of\s+(?P<year>\d+)',
                                  text, re.IGNORECASE)
            if caseReobj and caseReobj.end() < reobj.start():
                groupdict = caseReobj.groupdict()
                metainfo['caseno'] = u'%s/%s' % (groupdict['num'],
                                                 groupdict['year'])
                start = caseReobj.end()
            else:
                start = 0
            petitioner = text[start:reobj.start()]

            if reobj.end() < len(text):
                metainfo['respondent'] = text[reobj.end():].strip()

            metainfo['petitioner'] = petitioner.strip()
            # Stop at the first cell that contains a "vs".
            break

        return metainfo
Example #6
0
    def get_meta_info(self, title, dateobj):
        """Return metadata with the date and a case number taken from title.

        The case number is ``title`` cut off at the first occurrence of
        ``Dated`` (the whole title when that word is absent); it is not
        stripped.

        Args:
            title: text to take the case number from.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date`` and ``caseno``.
        """
        metainfo = {'date': utils.date_to_xml(dateobj)}

        dated = re.search('Dated', title)
        metainfo['caseno'] = title[:dated.start()] if dated else title

        return metainfo
Example #7
0
    def download_doc(self, baseurl, info, relpath):
        """Save the document referenced by ``info['href']``.

        The file name is the last ``/``-separated component of the href,
        with whitespace split and re-joined by single spaces and each of
        ``/``, ``&``, ``(`` and ``)`` replaced by ``-``. ``info['date']`` is
        overwritten in place with ``utils.date_to_xml(info['date'])``.

        Args:
            baseurl: URL that ``info['href']`` is joined onto.
            info: dict with ``href`` and ``date`` keys; mutated as described.
            relpath: directory part of the relative path of the saved file.

        Returns:
            Whatever ``self.save_judgment`` returns.
        """
        judgeurl = urllib.basejoin(baseurl, info['href'])

        filename = info['href'].split('/')[-1]
        filename = u' '.join(filename.split())
        # Raw-string character class: same four characters as before, without
        # the unrecognised \( and \) escapes of a plain string literal.
        filename = re.sub(r'[/&()]', '-', filename)

        relurl = os.path.join(relpath, filename)
        info['date'] = utils.date_to_xml(info['date'])
        return self.save_judgment(relurl, judgeurl, info)
Example #8
0
 def download_doc(self, baseurl, info, relpath):
     """Save the document that ``info['href']`` points to.

     The target file name is the last path component of the href with
     whitespace normalised to single spaces and ``/``, ``&``, ``(``, ``)``
     replaced by ``-``. ``info['date']`` is replaced in place by
     ``utils.date_to_xml(info['date'])`` before saving.

     Args:
         baseurl: URL that ``info['href']`` is joined onto.
         info: dict with ``href`` and ``date`` keys; mutated as described.
         relpath: directory part of the relative path of the saved file.

     Returns:
         Whatever ``self.save_judgment`` returns.
     """
     judgeurl = urllib.basejoin(baseurl, info['href'])

     filename = info['href'].split('/')[-1]
     filename = u' '.join(filename.split())
     # A raw-string character class matches the same characters as the old
     # alternation but avoids unrecognised \( and \) string escapes.
     filename = re.sub(r'[/&()]', '-', filename)

     relurl = os.path.join(relpath, filename)
     info['date'] = utils.date_to_xml(info['date'])
     return self.save_judgment(relurl, judgeurl, info)
Example #9
0
    def get_meta_info(self, link, tr, dateobj):
        """Build metadata from a case link and the row that contains it.

        ``caseno`` is the text of ``link``. ``author`` is whatever follows
        ``JUSTICE `` in a cell's text; if several cells match, the last one
        wins.

        Args:
            link: element whose text is used as the case number.
            tr: row element providing ``findAll('td')``.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date``, ``caseno`` and, when found, ``author``.
        """
        metainfo = {
            'date': utils.date_to_xml(dateobj),
            'caseno': utils.get_tag_contents(link),
        }

        for cell in tr.findAll('td'):
            text = utils.get_tag_contents(cell)
            judge = re.search('JUSTICE ', text)
            if judge is None:
                continue
            metainfo['author'] = text[judge.end():]

        return metainfo
Example #10
0
    def process_order_tr(self, ccin, relpath, dateobj, tr, fieldOrder):
        """Process one order row: gather its metadata and download the order.

        Args:
            ccin: value stored under ``ccin`` in the metadata.
            relpath: passed through to ``self.download_order``.
            dateobj: only used in warning messages.
            tr: row element providing ``findAll('td')``.
            fieldOrder: mapping of field name to column index; ``view`` and
                ``date`` are required, ``bench`` and ``judgment`` optional.

        Returns:
            The value returned by ``self.download_order``, or None when the
            row lacks the needed columns, a parseable date or an ``onclick``
            attribute on the view cell.
        """
        tds = tr.findAll('td')
        ncols = len(tds)
        viewIndex = fieldOrder['view']
        dateIndex = fieldOrder['date']
        if not (viewIndex < ncols and dateIndex < ncols):
            self.logger.warning(u'Could not get date or view in tr: %s' % tr)
            return None

        viewTd = tds[viewIndex]
        datestr = utils.get_tag_contents(tds[dateIndex])
        if not datestr:
            self.logger.warning(u'Date: %s Could not get date in %s' % (dateobj, tr))
            return None

        parsed = utils.datestr_to_obj(datestr)
        if not parsed:
            self.logger.warning(u'Date: %s Could not get date in %s tr: %s' % (dateobj, datestr, tr))
            return None

        subdateobj = parsed.date()
        metainfo = {'date': utils.date_to_xml(subdateobj), 'ccin': ccin}

        # Bench: one entry per "JUSTICE " occurrence in the bench cell.
        # NOTE(review): each entry runs to the end of the cell text, so an
        # earlier name also contains the later ones -- confirm this is wanted.
        if 'bench' in fieldOrder and fieldOrder['bench'] < ncols:
            benchText = utils.get_tag_contents(tds[fieldOrder['bench']])
            if benchText:
                names = [benchText[hit.end():]
                         for hit in re.finditer('JUSTICE ', benchText)]
                if names:
                    metainfo['bench'] = {'name': names}

        # Judgment flag: the text of the judgment cell, when present.
        if 'judgment' in fieldOrder and fieldOrder['judgment'] < ncols:
            judgmentText = utils.get_tag_contents(tds[fieldOrder['judgment']])
            if judgmentText:
                metainfo['judgment'] = judgmentText

        onclick = viewTd.get('onclick')
        if not onclick:
            self.logger.warning(u'No onclick attribute in viewTd: %s' % viewTd)
            return None

        return self.download_order(relpath, subdateobj, metainfo, onclick)
Example #11
0
    def get_meta_info(self, tds, dateobj):
        """Derive case number and parties from the first cell of a row.

        Nothing beyond ``date`` is set unless the first cell has at least
        three child nodes. Child 0 (ASCII-encoded, unencodable characters
        dropped) is the case number; child 2 is the title handed to
        ``utils.get_petitioner_respondent``.

        Args:
            tds: sequence of cell elements exposing ``.contents``.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date`` and, when available, ``caseno``,
            ``petitioner`` and ``respondent``.
        """
        metainfo = {'date': utils.date_to_xml(dateobj)}

        if len(tds) == 0 or len(tds[0].contents) < 3:
            return metainfo

        nodes = tds[0].contents
        metainfo['caseno'] = nodes[0].encode('ascii', 'ignore')

        title = nodes[2].encode('ascii', 'ignore')
        petitioner, respondent = utils.get_petitioner_respondent(title)

        # Empty parties are left out of the result.
        if petitioner:
            metainfo['petitioner'] = petitioner
        if respondent:
            metainfo['respondent'] = respondent

        return metainfo
Example #12
0
    def get_meta_info(self, tr, dateobj):
        """Read case number and parties from fixed columns of a row.

        Columns 2, 3 and 4 (zero-based) supply ``caseno``, ``petitioner``
        and ``respondent``; a key is only set when the row has that column.

        Args:
            tr: row element providing ``findAll('td')``.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date`` plus whichever of the three keys were present.
        """
        metainfo = {'date': utils.date_to_xml(dateobj)}

        cells = tr.findAll('td')
        columns = ((2, 'caseno'), (3, 'petitioner'), (4, 'respondent'))
        for index, key in columns:
            if index < len(cells):
                metainfo[key] = utils.get_tag_contents(cells[index])

        return metainfo
Example #13
0
    def get_meta_info(self, tds, dateobj):
        """Build metadata from the first cell of ``tds``.

        When the first cell has three or more child nodes, child 0 becomes
        ``caseno`` and child 2 is treated as the title and split by
        ``utils.get_petitioner_respondent``. Both are ASCII-encoded with
        unencodable characters dropped.

        Args:
            tds: sequence of cell elements exposing ``.contents``.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date`` and, when available, ``caseno``,
            ``petitioner`` and ``respondent``.
        """
        metainfo = {"date": utils.date_to_xml(dateobj)}

        has_title_cell = len(tds) > 0 and len(tds[0].contents) >= 3
        if has_title_cell:
            first_cell = tds[0]
            metainfo["caseno"] = first_cell.contents[0].encode("ascii", "ignore")

            raw_title = first_cell.contents[2].encode("ascii", "ignore")
            petitioner, respondent = utils.get_petitioner_respondent(raw_title)

            # Store only the parties that were actually found.
            parties = (("petitioner", petitioner), ("respondent", respondent))
            for key, value in parties:
                if value:
                    metainfo[key] = value

        return metainfo
Example #14
0
    def get_meta_info(self, tds, dateobj):
        """Return date, case number and parties for a result row.

        The first cell must have at least three child nodes for anything
        other than ``date`` to be set: child 0 is the case number and
        child 2 the title given to ``utils.get_petitioner_respondent``
        (both ASCII-encoded, unencodable characters dropped).

        Args:
            tds: sequence of cell elements exposing ``.contents``.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date`` and, when available, ``caseno``,
            ``petitioner`` and ``respondent``.
        """
        metainfo = {'date': utils.date_to_xml(dateobj)}

        if len(tds) > 0:
            children = tds[0].contents
            if len(children) >= 3:
                caseno = children[0].encode('ascii', 'ignore')
                metainfo['caseno'] = caseno

                encoded_title = children[2].encode('ascii', 'ignore')
                pet, resp = utils.get_petitioner_respondent(encoded_title)

                if pet:
                    metainfo['petitioner'] = pet
                if resp:
                    metainfo['respondent'] = resp

        return metainfo
Example #15
0
    def parse_result_page(self, posturl, webpage, dateobj):
        """Parse a result page into judgment entries and next-page entries.

        Each table row with a ``WebDownloadJudgmentDocument.do`` link yields
        ``{'link': <joined url>, 'metainfo': {...}}``. ``metainfo`` always has
        ``date``; ``title`` (and ``petitioner`` / ``respondent`` when the
        title contains `` vs. ``) come from a ``WebShowJudgment.do`` link in
        the same row. Every link whose text contains ``Next`` and that has an
        href yields ``{'link': <joined url>, 'next': True}``.

        Args:
            posturl: base URL that hrefs are joined onto.
            webpage: page content handed to ``utils.parse_webpage``.
            dateobj: date handed to ``utils.date_to_xml``; also logged.

        Returns:
            list of dicts as described; empty if the page cannot be parsed.
        """
        judgments = []
        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.error(u'Could not parse result page %s' % dateobj)
            return judgments

        # get judgments
        for tr in d.findAll('tr'):
            judgment = {}
            metainfo = {'date': utils.date_to_xml(dateobj)}

            for link in tr.findAll('a'):
                href = link.get('href')
                if not href:
                    continue

                if re.search('WebShowJudgment.do', href):
                    t = utils.get_tag_contents(link)
                    # str.find returns -1 when there is no colon, so the
                    # slice keeps the whole text in that case. Testing the
                    # index for truthiness wrongly skipped a colon at 0.
                    title = t[t.find(':') + 1:].strip()
                    metainfo['title'] = title
                    reobj = re.search(r' vs\. ', title, re.IGNORECASE)
                    if reobj:
                        metainfo['petitioner'] = title[:reobj.start()]
                        metainfo['respondent'] = title[reobj.end():]

                if re.search('WebDownloadJudgmentDocument.do', href):
                    judgment['link'] = urllib.basejoin(posturl, href)

            # Rows without a download link produce no entry.
            if judgment:
                judgment['metainfo'] = metainfo
                judgments.append(judgment)

        # next link
        for link in d.findAll('a'):
            t = utils.get_tag_contents(link)
            if re.search('Next', t):
                href = link.get('href')
                # Append only when there is a target; previously a "Next"
                # link without href re-appended the last row's dict (or hit
                # an unbound name when the page had no rows).
                if href:
                    judgments.append({'link': urllib.basejoin(posturl, href),
                                      'next': True})

        return judgments
Example #16
0
    def parse_result_page(self, posturl, webpage, dateobj):
        """Turn a result page into a list of judgment and next-page dicts.

        A row contributes ``{'link': ..., 'metainfo': ...}`` when it has a
        ``WebDownloadJudgmentDocument.do`` link. The ``metainfo`` holds
        ``date`` and, taken from a ``WebShowJudgment.do`` link in the row,
        ``title`` plus ``petitioner`` / ``respondent`` when the title
        contains `` vs. ``. Links whose text contains ``Next`` and that carry
        an href contribute ``{'link': ..., 'next': True}``.

        Args:
            posturl: base URL that hrefs are joined onto.
            webpage: page content handed to ``utils.parse_webpage``.
            dateobj: date handed to ``utils.date_to_xml``; also logged.

        Returns:
            list of dicts as described; empty if the page cannot be parsed.
        """
        judgments = []
        d = utils.parse_webpage(webpage)
        if not d:
            self.logger.error(u'Could not parse result page %s' % dateobj)
            return judgments

        # get judgments
        for tr in d.findAll('tr'):
            judgment = {}
            metainfo = {'date': utils.date_to_xml(dateobj)}

            for link in tr.findAll('a'):
                href = link.get('href')
                if not href:
                    continue

                if re.search('WebShowJudgment.do', href):
                    t = utils.get_tag_contents(link)
                    # find() gives -1 on a miss, so "+ 1" yields 0 and the
                    # whole text is kept. The old truthiness test on the
                    # index ignored a colon in first position.
                    colon = t.find(':')
                    title = t[colon + 1:].strip()
                    metainfo['title'] = title
                    reobj = re.search(r' vs\. ', title, re.IGNORECASE)
                    if reobj:
                        metainfo['petitioner'] = title[:reobj.start()]
                        metainfo['respondent'] = title[reobj.end():]

                if re.search('WebDownloadJudgmentDocument.do', href):
                    judgment['link'] = urllib.basejoin(posturl, href)

            # Only rows with a download link are reported.
            if judgment:
                judgment['metainfo'] = metainfo
                judgments.append(judgment)

        # next link
        for link in d.findAll('a'):
            t = utils.get_tag_contents(link)
            if not re.search('Next', t):
                continue

            href = link.get('href')
            # The append now sits inside the href check: without it a
            # href-less "Next" link appended a leftover row dict, or raised
            # NameError on a page with no rows.
            if href:
                judgments.append({'link': urllib.basejoin(posturl, href),
                                  'next': True})

        return judgments
Example #17
0
    def get_meta_info(self, tr, dateobj):
        """Extract title, parties and author from the cells of a row.

        A cell containing `` vs `` or `` vs. `` (case-insensitive) supplies
        ``title`` (the whole cell text), ``petitioner`` and ``respondent``.
        A cell containing ``justice `` (case-insensitive) supplies ``author``
        (the text after it). Later matching cells overwrite earlier ones.

        Args:
            tr: row element providing ``findAll('td')``.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date`` and whichever other keys were found.
        """
        metainfo = {'date': utils.date_to_xml(dateobj)}

        for td in tr.findAll('td'):
            content = utils.get_tag_contents(td)

            # Raw string: "\." is a regex escape, not a string escape.
            parties = re.search(r' vs\.? ', content, re.IGNORECASE)
            if parties:
                metainfo['title'] = content
                metainfo['petitioner'] = content[:parties.start()]
                metainfo['respondent'] = content[parties.end():]

            judge = re.search('justice ', content, re.IGNORECASE)
            if judge:
                metainfo['author'] = content[judge.end():]

        return metainfo
Example #18
0
    def get_meta_info(self, tr, dateobj):
        """Collect title, petitioner, respondent and author from a row.

        Every cell is checked for a `` vs `` / `` vs. `` separator
        (case-insensitive), which splits its text into petitioner and
        respondent and makes the full text the title. Every cell is also
        checked for ``justice `` (case-insensitive); the text after it
        becomes the author. The last matching cell wins in both cases.

        Args:
            tr: row element providing ``findAll('td')``.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date`` and whichever other keys were found.
        """
        metainfo = {'date': utils.date_to_xml(dateobj)}
        tds = tr.findAll('td')
        for td in tds:
            content = utils.get_tag_contents(td)

            # The pattern is a raw string so the backslash belongs to the
            # regex rather than being an unrecognised string escape.
            reobj = re.search(r' vs\.? ', content, re.IGNORECASE)
            if reobj:
                metainfo['title'] = content
                metainfo['petitioner'] = content[:reobj.start()]
                metainfo['respondent'] = content[reobj.end():]

            reobj = re.search('justice ', content, re.IGNORECASE)
            if reobj:
                metainfo['author'] = content[reobj.end():]

        return metainfo
Example #19
0
    def get_meta_info(self, tr, dateobj):
        """Gather title, parties and author from the cells of a row.

        Cell text containing `` vs `` or `` vs. `` (case-insensitive) is
        stored as ``title`` and split into ``petitioner`` / ``respondent``.
        Text following ``justice `` (case-insensitive) is stored as
        ``author``. When several cells match, the last one is kept.

        Args:
            tr: row element providing ``findAll("td")``.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date`` and whichever other keys were found.
        """
        metainfo = {"date": utils.date_to_xml(dateobj)}

        for td in tr.findAll("td"):
            content = utils.get_tag_contents(td)

            # Raw string avoids the unrecognised "\." string escape.
            vs_match = re.search(r" vs\.? ", content, re.IGNORECASE)
            if vs_match:
                metainfo["title"] = content
                metainfo["petitioner"] = content[: vs_match.start()]
                metainfo["respondent"] = content[vs_match.end() :]

            judge_match = re.search("justice ", content, re.IGNORECASE)
            if judge_match:
                metainfo["author"] = content[judge_match.end() :]

        return metainfo
Example #20
0
    def get_meta_info(self, tr, dateobj):
        """Extract parties and case number from the non-empty cells of a row.

        A cell containing `` vs `` (case-insensitive) is split into
        petitioner and respondent, each stripped of spaces, CR/LF and
        hyphens. Otherwise, the third non-empty cell is the case number.

        Args:
            tr: row element providing ``findAll('td')``.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date`` and, when found, ``petitioner``,
            ``respondent`` and ``caseno``.
        """
        metainfo = {'date': utils.date_to_xml(dateobj)}

        i = 0  # counts non-empty cells only
        for td in tr.findAll('td'):
            txt = utils.get_tag_contents(td)
            if not txt:
                continue

            reobj = re.search(' vs ', txt, re.IGNORECASE)
            if reobj:
                # str.strip replaces the deprecated string.strip() function,
                # which no longer exists in Python 3.
                petitioner = txt[:reobj.start()].strip(' \r\n-')
                respondent = txt[reobj.end():].strip(' \r\n-')
                if petitioner:
                    metainfo['petitioner'] = petitioner
                if respondent:
                    metainfo['respondent'] = respondent
            elif i == 2:
                metainfo['caseno'] = txt
            i += 1

        return metainfo
Example #21
0
    def parse_meta_info(self, tr, dateobj):
        """Read case number, parties and author from fixed columns of a row.

        Column 1 (zero-based) is the case number. Column 3 is split on
        `` vs `` / `` vs. `` (case-insensitive) into petitioner and
        respondent. In column 4 the text after ``JUSTICE `` is the author.

        Args:
            tr: row element providing ``findAll("td")``.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date`` and whichever other keys were found.
        """
        metainfo = {"date": utils.date_to_xml(dateobj)}

        for i, td in enumerate(tr.findAll("td")):
            contents = utils.get_tag_contents(td)
            if i == 1:
                metainfo["caseno"] = contents
            elif i == 3:
                # Raw string: "\." is meant for the regex engine.
                reobj = re.search(r" vs\.? ", contents, re.IGNORECASE)
                if reobj:
                    metainfo["petitioner"] = contents[: reobj.start()]
                    metainfo["respondent"] = contents[reobj.end() :]
            elif i == 4:
                reobj = re.search("JUSTICE ", contents)
                if reobj:
                    metainfo["author"] = contents[reobj.end() :]

        return metainfo
Example #22
0
    def parse_meta_info(self, tr, dateobj):
        """Build metadata from columns 1, 3 and 4 (zero-based) of a row.

        Column 1 gives ``caseno``. Column 3 gives ``petitioner`` and
        ``respondent`` when it contains `` vs `` or `` vs. ``
        (case-insensitive). Column 4 gives ``author``, the text after
        ``JUSTICE ``.

        Args:
            tr: row element providing ``findAll('td')``.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date`` and whichever other keys were found.
        """
        metainfo = {'date': utils.date_to_xml(dateobj)}

        for column, td in enumerate(tr.findAll('td')):
            contents = utils.get_tag_contents(td)
            if column == 1:
                metainfo['caseno'] = contents
            elif column == 3:
                # Raw string so the backslash is a regex escape rather than
                # an unrecognised string escape.
                parties = re.search(r' vs\.? ', contents, re.IGNORECASE)
                if parties:
                    metainfo['petitioner'] = contents[:parties.start()]
                    metainfo['respondent'] = contents[parties.end():]
            elif column == 4:
                judge = re.search('JUSTICE ', contents)
                if judge:
                    metainfo['author'] = contents[judge.end():]

        return metainfo
Example #23
0
    def parse_meta_info(self, tr, dateobj):
        """Extract case number, parties and author by column position.

        Zero-based column 1 is stored as ``caseno``. Column 3 is split
        around `` vs `` / `` vs. `` (case-insensitive) into ``petitioner``
        and ``respondent``. Column 4 contributes ``author``, the text after
        ``JUSTICE ``.

        Args:
            tr: row element providing ``findAll('td')``.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date`` and whichever other keys were found.
        """
        metainfo = {'date': utils.date_to_xml(dateobj)}

        for i, td in enumerate(tr.findAll('td')):
            contents = utils.get_tag_contents(td)
            if i == 1:
                metainfo['caseno'] = contents
            elif i == 3:
                # Raw string keeps "\." from being read as a string escape.
                reobj = re.search(r' vs\.? ', contents, re.IGNORECASE)
                if reobj:
                    metainfo['petitioner'] = contents[:reobj.start()]
                    metainfo['respondent'] = contents[reobj.end():]
            elif i == 4:
                reobj = re.search('JUSTICE ', contents)
                if reobj:
                    metainfo['author'] = contents[reobj.end():]

        return metainfo
Example #24
0
    def get_meta_info(self, tr, dateobj):
        """Pull parties and case number out of a row's non-empty cells.

        A cell containing `` vs `` (case-insensitive) provides the
        petitioner and respondent, each stripped of spaces, CR/LF and
        hyphens. Otherwise, the third non-empty cell is the case number.

        Args:
            tr: row element providing ``findAll('td')``.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date`` and, when found, ``petitioner``,
            ``respondent`` and ``caseno``.
        """
        metainfo = {'date': utils.date_to_xml(dateobj)}
        strip_chars = ' \r\n-'

        i = 0  # index among non-empty cells
        for td in tr.findAll('td'):
            txt = utils.get_tag_contents(td)
            if not txt:
                continue

            reobj = re.search(' vs ', txt, re.IGNORECASE)
            if reobj:
                # The str.strip method stands in for string.strip(), which
                # is deprecated and absent from Python 3.
                petitioner = txt[:reobj.start()].strip(strip_chars)
                respondent = txt[reobj.end():].strip(strip_chars)
                if petitioner:
                    metainfo['petitioner'] = petitioner
                if respondent:
                    metainfo['respondent'] = respondent
            elif i == 2:
                metainfo['caseno'] = txt
            i += 1

        return metainfo
Example #25
0
    def get_meta_info(self, d, dateobj):
        """Read labelled values from the rows of a details table.

        In each row, the first cell (the last cell excluded) whose text
        contains a known label decides the key, and the text of the cell
        right after it is stored under that key. Labels are tried in the
        order listed below.

        Args:
            d: parsed document providing ``findAll('tr')``.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date`` and any of ``caseno``, ``author``,
            ``petitioner``, ``respondent`` and ``location`` that were found.
        """
        labels = (
            ('Case Number', 'caseno'),
            ('Judge', 'author'),
            ('Petitioner', 'petitioner'),
            ('Respondent', 'respondent'),
            ('Location', 'location'),
        )

        metainfo = {'date': utils.date_to_xml(dateobj)}
        for tr in d.findAll('tr'):
            tds = tr.findAll('td')

            # The last cell is skipped as a label so that a labelled cell
            # always has a following value cell.
            for i, td in enumerate(tds[:-1]):
                content = utils.get_tag_contents(td)

                tdtype = None
                for label, key in labels:
                    if re.search(label, content):
                        tdtype = key
                        break

                if tdtype:
                    metainfo[tdtype] = utils.get_tag_contents(tds[i + 1])
                    # Only the first labelled cell of a row is used.
                    break

        return metainfo
Example #26
0
    def get_meta_info(self, d, dateobj):
        """Collect label/value pairs from the rows of a details table.

        For every row, cells other than the last are scanned in order; the
        first one whose text contains a known label selects the key, and the
        following cell's text becomes the value.

        Args:
            d: parsed document providing ``findAll('tr')``.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date`` and any of ``caseno``, ``author``,
            ``petitioner``, ``respondent`` and ``location`` that were found.
        """
        # Order matters: within a cell the first label that matches wins.
        label_to_key = (
            ('Case Number', 'caseno'),
            ('Judge', 'author'),
            ('Petitioner', 'petitioner'),
            ('Respondent', 'respondent'),
            ('Location', 'location'),
        )

        metainfo = {'date': utils.date_to_xml(dateobj)}
        for tr in d.findAll('tr'):
            tds = tr.findAll('td')

            # Stopping before the last cell guarantees tds[pos + 1] exists.
            for pos, td in enumerate(tds[:-1]):
                content = utils.get_tag_contents(td)

                matched = [key for label, key in label_to_key
                           if re.search(label, content)]
                if matched:
                    metainfo[matched[0]] = utils.get_tag_contents(tds[pos + 1])
                    # Remaining cells of this row are not inspected.
                    break

        return metainfo
Example #27
0
    def get_meta_info(self, tr, dateobj):
        """Read case number, parties and title from fixed columns of a row.

        Zero-based column 1 is ``caseno``, column 2 ``petitioner`` and
        column 3 ``respondent``. ``title`` is the petitioner text, followed
        by a space and the respondent text when column 3 exists.

        Args:
            tr: row element providing ``findAll("td")``.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date`` and whichever other keys were available.
        """
        metainfo = {"date": utils.date_to_xml(dateobj)}

        for i, td in enumerate(tr.findAll("td")):
            content = utils.get_tag_contents(td)

            if i == 1:
                metainfo["caseno"] = content
            elif i == 2:
                metainfo["petitioner"] = content
                metainfo["title"] = content
            elif i == 3:
                metainfo["respondent"] = content
                # Column 2 is always handled before column 3, so "title" is
                # already set; the former has_key() fallback was unreachable.
                metainfo["title"] += " " + content

        return metainfo
Example #28
0
    def get_meta_info(self, tr, dateobj):
        """Build metadata from columns 1-3 (zero-based) of a row.

        Column 1 is stored as ``caseno``, column 2 as ``petitioner`` and
        column 3 as ``respondent``. ``title`` starts as the petitioner text
        and gets a space plus the respondent text appended when column 3 is
        present.

        Args:
            tr: row element providing ``findAll('td')``.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date`` and whichever other keys were available.
        """
        metainfo = {'date': utils.date_to_xml(dateobj)}

        for column, td in enumerate(tr.findAll('td')):
            content = utils.get_tag_contents(td)

            if column == 1:
                metainfo['caseno'] = content
            elif column == 2:
                metainfo['petitioner'] = content
                metainfo['title'] = content
            elif column == 3:
                metainfo['respondent'] = content
                # "title" was set while handling column 2, so the old
                # has_key() check and its else branch could never be needed.
                metainfo['title'] += ' ' + content

        return metainfo
Example #29
0
    def get_meta_info(self, d, dateobj):
        """Extract labelled fields from the rows of a details table.

        Each row is scanned cell by cell (the last cell excluded). The first
        cell whose text contains one of the known labels selects the
        metadata key, and the text of the next cell is stored as its value.

        Args:
            d: parsed document providing ``findAll("tr")``.
            dateobj: date handed to ``utils.date_to_xml`` for the ``date`` key.

        Returns:
            dict with ``date`` and any of ``caseno``, ``author``,
            ``petitioner``, ``respondent`` and ``location`` that were found.
        """
        # Tried in this order for every cell; the first hit wins.
        labels = (
            ("Case Number", "caseno"),
            ("Judge", "author"),
            ("Petitioner", "petitioner"),
            ("Respondent", "respondent"),
            ("Location", "location"),
        )

        metainfo = {"date": utils.date_to_xml(dateobj)}
        for tr in d.findAll("tr"):
            tds = tr.findAll("td")

            # Excluding the last cell means a value cell always follows.
            for i, td in enumerate(tds[:-1]):
                content = utils.get_tag_contents(td)

                tdtype = None
                for label, key in labels:
                    if re.search(label, content):
                        tdtype = key
                        break

                if tdtype:
                    metainfo[tdtype] = utils.get_tag_contents(tds[i + 1])
                    # Stop at the first labelled cell of the row.
                    break

        return metainfo
Example #30
0
 def save_meta_tags(self, metapath, debatedict, dateobj):
     """Write the debate's metadata, minus ``href``, to ``metapath``.

     The tag dictionary starts with ``date`` (from ``utils.date_to_xml``)
     and then receives every entry of ``debatedict`` except ``href``; a
     ``date`` entry in ``debatedict`` therefore replaces the computed one.
     """
     tagdict = {'date': utils.date_to_xml(dateobj)}
     for key, value in debatedict.items():
         if key == 'href':
             continue
         tagdict[key] = value
     utils.print_tag_file(metapath, tagdict)
Example #31
0
 def save_meta_tags(self, metapath, judgedict, dateobj):
     """Write the judgment's metadata, minus the href entry, to ``metapath``.

     Starts from ``date`` (via ``utils.date_to_xml``) and copies every
     entry of ``judgedict`` whose key is not ``self.HREF``; copied entries
     override ``date`` if they use the same key.
     """
     tagdict = {'date': utils.date_to_xml(dateobj)}
     for key, value in judgedict.items():
         if key == self.HREF:
             continue
         tagdict[key] = value
     utils.print_tag_file(metapath, tagdict)
Example #32
0
 def save_meta_tags(self, metapath, judgedict, dateobj):
     """Save all of ``judgedict`` except its href entry as tags.

     The tags begin with ``date`` produced by ``utils.date_to_xml``; every
     key of ``judgedict`` other than ``self.HREF`` is then copied over
     (replacing ``date`` if present) and the result is passed to
     ``utils.print_tag_file``.
     """
     tagdict = {'date': utils.date_to_xml(dateobj)}
     kept = [name for name in judgedict.keys() if name != self.HREF]
     for name in kept:
         tagdict[name] = judgedict[name]
     utils.print_tag_file(metapath, tagdict)