def _split_parties(case):
    """Split the children of the case-title element around the " v " tag.

    Each child is serialised with ``unicode()``. Children before the
    separator tag go to ``'left'`` and children after it go to ``'right'``.
    If no separator tag is present, every child is reported on the left and
    the right side is empty.
    """
    children = list(case.children)
    separator = u" v "
    split = None
    for i, child in enumerate(children):
        # Deliberately no ``break``: if several separators occur, the last
        # one is used as the split point.
        if isinstance(child, bs4.Tag) and child.text == separator:
            split = i
    if split is None:
        # A -1 sentinel must not be used as a slice index here: it would
        # drop the last child from the left and copy everything to the right.
        return {
            'left': [unicode(x) for x in children],
            'right': [],
        }
    return {
        'left': [unicode(x) for x in children[:split]],
        'right': [unicode(x) for x in children[split + 1:]],
    }


def _parse_cite_as(citation_text):
    """Split a ';'-separated citation string into neutral and ICLR parts.

    Raises ``Exception`` if the last entry does not start with a
    ``[dddd] WLR (D)`` reference.
    """
    citations = [c.strip() for c in citation_text.split(";")]
    pattern = r"^\[\d\d\d\d\]\sWLR\s\(D\)"
    # ``split`` always yields at least one entry, so only the format of the
    # last entry needs checking.
    if not re.match(pattern, citations[-1], re.UNICODE):
        raise Exception(u'Unable to parse citation for this document')
    return {
        'neutral': citations[:-1],  # Can be more than one
        'iclr': citations[-1],  # Only ever one for iclr publications
    }


def _parse_reporters(reporter_text):
    """Turn the text of the reporter element into name/role dictionaries."""
    prefix = u'Reported by:'
    if reporter_text.startswith(prefix):
        reporter_text = reporter_text[len(prefix):]
    if reporter_text.endswith('.'):
        reporter_text = reporter_text[:-1]

    reporters = []
    # Split on the standalone word "and" only, so that names which merely
    # contain those letters (e.g. "Sandra") are left intact.
    for entry in re.split(r'\s+and\s+', reporter_text, flags=re.UNICODE):
        # Sometimes only the name is given
        fields = [f.strip() for f in entry.strip().split(',')]
        reporters.append({
            'name': fields[0],
            'role': fields[1] if len(fields) > 1 else None,
        })
    return reporters


def _parse_appearances(hnotes):
    """Return the 'Appearances:' text that follows the last headnote.

    Returns an empty string when there is no headnote, no following ``<p>``
    element, or that paragraph does not start with ``Appearances:``.
    """
    if not hnotes:
        return ''
    paragraph = hnotes[-1].find_next_sibling('p')
    if paragraph is None:
        return ''
    text = paragraph.text
    prefix = u'Appearances:'
    if not text.startswith(prefix):
        return ''
    return text[len(prefix):].strip()


def get_meta(s):
    """Extract metadata from the markup of a single law report.

    Args:
        s: Markup of the report; it is parsed with
            ``bs4.BeautifulSoup(s, 'lxml')``.

    Returns:
        A dict with the keys ``case``, ``parties``, ``keywords``,
        ``cite_as``, ``hearing``, ``reporters``, ``appearances``, ``body``,
        ``citations``, ``candidate_citations``, ``legislation`` and
        ``candidate_legislation``.

    Raises:
        Exception: if the document does not contain exactly one element with
            class ``ncit``, if the last citation is not a ``[dddd] WLR (D)``
            reference, or if the hearing paragraph does not consist of
            exactly three newline-separated fields.
        AttributeError: if no element with class ``case`` exists or no
            ``<p>`` follows the citation element (bs4 returns ``None`` for
            both lookups).
    """
    soup = bs4.BeautifulSoup(s, 'lxml')
    meta = {}

    case = soup.find(class_=u'case')
    meta['case'] = case.text
    meta['parties'] = _split_parties(case)

    meta['keywords'] = {}
    for kw in soup.find_all(class_=u'kw'):
        words = kw.text.split(_EM_DASH)
        category = words[0].strip()
        # The loop variable is not called ``s``: under Python 2 a list
        # comprehension variable would overwrite the ``s`` parameter.
        meta['keywords'][category] = [w.strip() for w in words[1:]]

    citations = soup.find_all(class_=u'ncit')
    if len(citations) != 1:
        raise Exception(u'Could not parse any citations for this document')
    citation = citations[0]
    meta['cite_as'] = _parse_cite_as(citation.text)

    judges_and_date = citation.find_next_sibling('p').text
    # Fields look reliably separated by newlines
    fields = [f.strip() for f in judges_and_date.split('\n')]
    # Colons carry no meaning and are commonly misspelled twice
    fields = [f[:-2] if f.endswith(u'::') else f for f in fields]
    fields = [f[:-1] if f.endswith(u':') else f for f in fields]
    if len(fields) != 3:
        raise Exception('Could not parse hearing details')
    court_abbr, judges, date = fields
    meta['hearing'] = {
        'court_abbr': court_abbr,
        'judges': [j.strip() for j in judges.split(',')],
        'date': _date_to_js(_parse_date(date)),
    }

    # TODO: Reporter
    reporter_el = soup.find(class_=u'reporter')
    if reporter_el is not None:
        meta['reporters'] = _parse_reporters(reporter_el.text)
    else:
        meta['reporters'] = []

    # TODO: Appearances in court, these need proper parsing. They are tricky!
    paras = soup.find_all(class_=u'hnote')
    meta['appearances'] = _parse_appearances(paras)

    # TODO: Body of report
    body = u'\n\n'.join(p.text for p in paras)
    meta['body'] = body

    # TODO: Cited cases/legislation
    # TODO: Check len(citations) == len(candidates)
    # A common inconsistency is a reference to an act in square
    # brackets, like this: "Under the [2004] Act". Normally square
    # brackets indicate a citation of case-law.
    tokenized = tokenization.tokenize(body)
    tokenized_titles = ttls.find_titles(tokenized)
    citations = ctns.find_citations(body)
    candidate_citations = ctns.weak_find_citations(body)
    legislation = lgs.find_legislation(tokenized_titles, body)
    candidate_legislation = lgs.find_candidate_legislation(body)
    meta['citations'] = list(set(citations))
    meta['candidate_citations'] = candidate_citations
    meta['legislation'] = list(set(legislation))
    meta['candidate_legislation'] = candidate_legislation

    # TODO: Which party won?
    return meta
def test_find_citations(text, expected):
    """Assert that ``ctns.find_citations(text)`` equals ``expected``.

    NOTE(review): ``text`` and ``expected`` are presumably supplied by a
    parametrize decorator or fixtures that are not visible here -- confirm.
    """
    assert expected == ctns.find_citations(text)