def find_judgment(soup):
    """Return the text attached to the judgment label, if there is one.

    The first element matching ``judgment_reg`` is located with
    ``find_reg_el``.  That element plus everything ``find_until`` returns for
    it (called with a word-then-colon pattern and ``use_left=False``) is
    joined with single spaces, and every ``judgment_reg`` match is removed
    from the joined text.

    Returns a one-item list holding that string, or ``None`` when no
    matching element exists.
    """
    pattern = judgment_reg
    label_el = find_reg_el(soup, pattern)
    if not label_el:
        return None

    # NOTE(review): the word-then-colon pattern presumably marks the next
    # labelled field as the stopping point -- confirm against find_until.
    elements = [label_el] + find_until(label_el, re.compile(r'\w+:'), use_left=False)
    joined = ' '.join(el.text for el in elements)
    return [re.sub(pattern, '', joined)]
def find_judgment(soup):
    """Return the text attached to the judgment label, if there is one.

    ``find_reg_el`` supplies the first element matching ``judgment_reg``.
    Its text and the text of the elements returned by ``find_until`` (called
    with a word-then-colon pattern and ``use_left=False``) are joined with
    single spaces; all ``judgment_reg`` matches are then removed.

    Returns a one-item list with the cleaned string, or ``None`` if no
    element matched.
    """
    pattern = judgment_reg
    first_el = find_reg_el(soup, pattern)
    if not first_el:
        return None

    stop_pattern = re.compile(r'\w+:')
    pieces = [first_el] + find_until(first_el, stop_pattern, use_left=False)
    combined = ' '.join(piece.text for piece in pieces)
    return [re.sub(pattern, '', combined)]
def find_solicitors(soup):
    """Collect the solicitor lines that follow a "Solicitors" label.

    ``find_reg_el`` is called with ``field=["footer-field",
    "intituling-field"]`` and a case-insensitive pattern matching a leading
    "Solicitor(s)" / "Solicitors/Counsel" label.  Whatever text remains on
    the matched element once the label is removed comes first, followed by
    the text of every element returned by
    ``find_until(start, None, use_left=False)``.

    Returns a list of strings with empty entries and entries starting with
    "(" removed; the list is empty when no label element is found.
    """
    label_pattern = re.compile(r'^solicitors?(/counsel)?s?[:;]?\W*', flags=re.IGNORECASE)
    label_el = find_reg_el(soup, label_pattern, field=["footer-field", "intituling-field"])
    if not label_el:
        return []

    collected = []
    same_line_text = re.sub(label_pattern, '', label_el.text)
    if same_line_text:
        collected.append(same_line_text)
    collected.extend(el.text for el in find_until(label_el, None, use_left=False))

    # Same two filters as before, applied in the same order per entry.
    return [line for line in collected if not line.startswith('(') and line]
def find_solicitors(soup):
    """Collect the solicitor lines that follow a "Solicitors" label.

    The label element is looked up with ``find_reg_el`` (passing
    ``field=["footer-field", "intituling-field"]``) using a case-insensitive
    pattern for a leading "Solicitor(s)" / "Solicitors/Counsel" label.  Text
    left on that element after stripping the label is kept, then the text of
    each element from ``find_until(start, None, use_left=False)`` is added.

    Returns a list of strings without empty entries or entries that start
    with "("; an empty list when there is no label element.
    """
    label_pattern = re.compile(r'^solicitors?(/counsel)?s?[:;]?\W*', flags=re.IGNORECASE)
    anchor = find_reg_el(soup, label_pattern, field=["footer-field", "intituling-field"])
    if not anchor:
        return []

    lines = []
    remainder = re.sub(label_pattern, '', anchor.text)
    if remainder:
        lines.append(remainder)
    for following in find_until(anchor, None, use_left=False):
        lines.append(following.text)

    # Drop parenthesised notes first, then empty strings.
    return [line for line in lines if not line.startswith('(') and line]
def find_hearing(soup):
    """Return the text attached to the hearing label, if there is one.

    The first element matching ``hearing_reg`` is located with
    ``find_reg_el``.  If that element still has text after the label is
    removed, collection starts from the element itself and ``find_until`` is
    called with ``use_left=False, more_left=True``.  Otherwise collection
    starts from its next sibling with ``use_left=True, more_left=False``.

    Returns a one-item list with the space-joined text of the collected
    elements, ``hearing_reg`` matches removed, or ``None`` when no label
    element is found.
    """
    pattern = hearing_reg
    start = find_reg_el(soup, pattern)
    if not start:
        return None

    # True when the value shares a line with the label.
    value_on_label_line = bool(re.sub(pattern, '', start.text).strip())
    if not value_on_label_line:
        start = start.next_sibling

    elements = [start] + find_until(
        start,
        re.compile(r'\w+:'),
        use_left=not value_on_label_line,
        more_left=value_on_label_line)
    return [re.sub(pattern, '', ' '.join(el.text for el in elements))]
def matter_loop(soup, next_qualifier, results, courtfile=False):
    """Consume one "matter" entry starting at ``next_qualifier``.

    Appends to ``results`` (a list of ``matters`` tags, created on demand) a
    ``matter`` tag holding a ``qualifier`` and a ``value`` child, or, for a
    bare joining line, a ``matter-join`` tag.

    Args:
        soup: object used to create new tags via ``soup.new_tag``.
        next_qualifier: element whose text carries the matter qualifier.
        results: list of ``matters`` tags; mutated in place.
        courtfile: when truthy, the element before ``next_qualifier`` is
            tested against ``courtfile_num`` and, on a match, a fresh
            ``matters`` tag containing a ``court-file`` child is started.

    Returns:
        The element from which the caller should continue (may be ``None``
        when the siblings are exhausted).
    """
    if len(results) == 0:
        results.append(soup.new_tag('matters'))

    matter_tag = soup.new_tag('matter')
    qualifier_tag = soup.new_tag('qualifier')
    value_tag = soup.new_tag('value')

    without_join = re.sub(join_pattern, '', next_qualifier.text)
    remainder_text = re.sub(matter_pattern, '', without_join).strip()
    has_remainder = bool(len(remainder_text))
    segments = []

    # Look before the qualifier and see if there is a court file number.
    if courtfile and courtfile_num.match(next_qualifier.previous_sibling.text):
        results.append(soup.new_tag('matters'))
        court_file_tag = soup.new_tag('court-file')
        court_file_tag.string = next_qualifier.previous_sibling.text
        results[-1].append(court_file_tag)

    if has_remainder:
        # The value begins on the qualifier's own line.
        qualifier_tag.string = next_qualifier.text.replace(remainder_text, '')
    elif (not (get_left(next_qualifier.next_sibling) > get_left(next_qualifier))
          and join_pattern.match(next_qualifier.text)):
        # A joining line with nothing further right after it: record it and stop.
        join_tag = soup.new_tag('matter-join')
        join_tag.string = next_qualifier.text
        results[-1].append(join_tag)
        return next_qualifier.next_sibling
    else:
        # The value starts on the following element.
        qualifier_tag.string = next_qualifier.text
        next_qualifier = next_qualifier.next_sibling
        segments.append(next_qualifier)

    segments.extend(find_until(next_qualifier, matter_pattern,
                               more_left=has_remainder,
                               use_left=not has_remainder))
    texts = [remainder_text] + [segment.text for segment in segments]
    value_tag.string = u' '.join([text for text in texts if text])

    # Guarantees segments is non-empty before looking at its last element.
    segments.insert(0, next_qualifier)
    next_qualifier = segments[-1].next_sibling
    if next_qualifier and is_left_aligned(next_qualifier):
        next_qualifier = next_qualifier.next_sibling

    matter_tag.append(qualifier_tag)
    matter_tag.append(value_tag)
    results[-1].append(matter_tag)
    return next_qualifier
def find_hearing(soup):
    """Return the text attached to the hearing label, if there is one.

    ``find_reg_el`` supplies the first element matching ``hearing_reg``.
    When text remains on that element after removing the label, the element
    itself is the starting point and ``find_until`` receives
    ``use_left=False, more_left=True``; otherwise the next sibling is the
    starting point and ``find_until`` receives ``use_left=True,
    more_left=False``.

    Returns a one-item list with the collected text joined by single spaces
    and stripped of ``hearing_reg`` matches, or ``None`` if no element
    matched.
    """
    pattern = hearing_reg
    anchor = find_reg_el(soup, pattern)
    if not anchor:
        return None

    if re.sub(pattern, '', anchor.text).strip():
        # Value shares the label's line.
        flags = {'use_left': False, 'more_left': True}
    else:
        # Value starts on the following element.
        anchor = anchor.next_sibling
        flags = {'use_left': True, 'more_left': False}

    elements = [anchor] + find_until(anchor, re.compile(r'\w+:'), **flags)
    joined = ' '.join(el.text for el in elements)
    return [re.sub(pattern, '', joined)]
def waistband(soup):
    """Build a ``waistband`` tag from the title band and the lines after it.

    Titles come from ``get_band`` applied to the parent of the first
    ``hline`` element.  The remaining parts come from ``get_band`` applied
    (with ``reverse=True``) to the last ``intituling-field``, minus anything
    already among the titles; if nothing is left,
    ``find_until(titles[-1], use_left=False)`` is used instead.

    Each non-empty part becomes one of:
      * a ``subtitle`` tag, when the part contains an ``underline`` element;
      * nothing, when its text matches ``separator_reg``;
      * an ``entry`` (``label`` + ``text``) in a ``list`` tag, when its text
        starts with the next expected letter ("A", then "B", ...);
      * otherwise text appended to the most recent ``text`` tag, which is
        created if none exists yet.

    Returns the new ``waistband`` tag.
    """
    titles = get_band(soup.find('hline').parent)
    parts = get_band(soup.find_all('intituling-field')[-1], reverse=True)
    parts = [part for part in parts if part not in titles]
    if not parts:
        parts = find_until(titles[-1], use_left=False)
    parts = [part for part in parts if part.text]

    band = soup.new_tag('waistband')
    for title_el in titles:
        title_tag = soup.new_tag('title')
        title_tag.string = title_el.text
        band.append(title_tag)

    counter = 'A'  # label expected on the next list entry
    for part in parts:
        text = part.text.strip()
        if not text:
            continue

        if part.find('underline'):
            subtitle_tag = soup.new_tag('subtitle')
            subtitle_tag.string = text
            band.append(subtitle_tag)
            continue

        if separator_reg.match(text):
            continue

        if re.match(r'%s\.?($|\s)' % counter, text):
            # Open a new list unless the previous child already is one.
            if band.contents[-1].name != 'list':
                band.append(soup.new_tag('list'))
            entry_tag = soup.new_tag('entry')
            label_tag = soup.new_tag('label')
            label_tag.string = counter
            entry_tag.append(label_tag)
            body_tag = soup.new_tag('text')
            # Skip the first two characters (the label and the one after it).
            body_tag.string = text[2:].strip()
            entry_tag.append(body_tag)
            band.contents[-1].append(entry_tag)
            counter = chr(ord(counter) + 1)
            continue

        # Plain continuation text goes onto the latest ``text`` tag.
        if not band.find('text'):
            band.append(soup.new_tag('text'))
        last_text = band.find_all('text')[-1]
        if len(last_text.contents) and last_text.contents[-1]:
            last_text.append(' ')
        last_text.append(text)

    return band
def waistband(soup):
    """Build a ``waistband`` tag from the title band and the lines after it.

    Titles are whatever ``get_band`` returns for the parent of the first
    ``hline`` element.  Candidate parts are what ``get_band`` returns (with
    ``reverse=True``) for the last ``intituling-field``, excluding the
    titles; when that leaves nothing,
    ``find_until(titles[-1], use_left=False)`` provides them instead.

    For every part with non-blank text:
      * an ``underline`` descendant makes it a ``subtitle`` tag;
      * text matching ``separator_reg`` is skipped;
      * text beginning with the next expected letter ("A", "B", ...) becomes
        an ``entry`` (``label`` + ``text``) inside a ``list`` tag;
      * anything else is appended to the latest ``text`` tag, creating one
        if necessary.

    Returns the new ``waistband`` tag.
    """
    titles = get_band(soup.find('hline').parent)
    candidates = get_band(soup.find_all('intituling-field')[-1], reverse=True)
    parts = [item for item in candidates if item not in titles]
    if not parts:
        parts = find_until(titles[-1], use_left=False)
    parts = [item for item in parts if item.text]

    band = soup.new_tag('waistband')
    for title_el in titles:
        title_tag = soup.new_tag('title')
        title_tag.string = title_el.text
        band.append(title_tag)

    expected_label = 'A'
    for part in parts:
        text = part.text.strip()
        if not text:
            continue

        if part.find('underline'):
            subtitle_tag = soup.new_tag('subtitle')
            subtitle_tag.string = text
            band.append(subtitle_tag)
            continue

        if separator_reg.match(text):
            continue

        if re.match(r'%s\.?($|\s)' % expected_label, text):
            # Reuse the list when it is the most recent child.
            if band.contents[-1].name != 'list':
                band.append(soup.new_tag('list'))
            entry_tag = soup.new_tag('entry')
            label_tag = soup.new_tag('label')
            label_tag.string = expected_label
            entry_tag.append(label_tag)
            body_tag = soup.new_tag('text')
            # Drop the first two characters (the label and the one after it).
            body_tag.string = text[2:].strip()
            entry_tag.append(body_tag)
            band.contents[-1].append(entry_tag)
            expected_label = chr(ord(expected_label) + 1)
            continue

        # Running text: extend the most recent ``text`` tag.
        if not band.find('text'):
            band.append(soup.new_tag('text'))
        last_text = band.find_all('text')[-1]
        if len(last_text.contents) and last_text.contents[-1]:
            last_text.append(' ')
        last_text.append(text)

    return band
def find_versus(soup, start):
    """Fallback party extraction built around a standalone "v" line.

    Intended for use when ``find_parties`` fails.  ``find_intituling`` is
    asked for an element whose whole text is "v" or "V" (optionally padded
    with whitespace).  The element before it supplies the plaintiff, and the
    element after it plus whatever ``find_until(..., use_left=False)``
    returns supply the defendant text.

    Returns:
        ``([], <falsy result>)`` when no "v" element is found; otherwise
        ``([parties], next_element)`` where ``parties`` has the keys
        ``'plantiffs'``, ``'court-file'`` and ``'defendants'`` and
        ``next_element`` is the sibling after the last defendant element.
    """
    versus_el = find_intituling(start, re.compile(r'^\s*[vV]\s*$'))
    if not versus_el:
        return [], versus_el

    plaintiff_el = versus_el.previous_sibling
    plaintiffs = [{'value': plaintiff_el.string, 'versus': True}]
    court_file = court_file_before(soup, plaintiff_el)

    first_defendant = versus_el.next_sibling
    defendant_els = [first_defendant] + find_until(first_defendant, use_left=False)
    defendant_text = ' '.join(el.text for el in defendant_els)

    parties = {
        'plantiffs': plaintiffs,
        'court-file': court_file,
        'defendants': [{'value': defendant_text, 'versus': True}],
    }
    return [parties], defendant_els[-1].next_sibling
def matter_loop(soup, next_qualifier, results, courtfile=False):
    """Consume one "matter" entry starting at ``next_qualifier``.

    A ``matter`` tag with ``qualifier`` and ``value`` children is appended
    to the last ``matters`` tag in ``results`` (one is created if the list
    is empty).  A bare joining line produces a ``matter-join`` tag instead.

    Args:
        soup: object used to create new tags via ``soup.new_tag``.
        next_qualifier: element whose text carries the matter qualifier.
        results: list of ``matters`` tags; mutated in place.
        courtfile: when truthy, the element preceding ``next_qualifier`` is
            checked against ``courtfile_num``; on a match a new ``matters``
            tag with a ``court-file`` child is appended to ``results``.

    Returns:
        The element the caller should look at next (possibly ``None``).
    """
    if len(results) == 0:
        results.append(soup.new_tag('matters'))

    matter_tag = soup.new_tag('matter')
    qualifier_tag = soup.new_tag('qualifier')
    value_tag = soup.new_tag('value')

    stripped_of_join = re.sub(join_pattern, '', next_qualifier.text)
    remainder_text = re.sub(matter_pattern, '', stripped_of_join).strip()
    value_on_same_line = bool(len(remainder_text))
    segments = []

    # Look before the qualifier and see if there is a court file number.
    if courtfile and courtfile_num.match(next_qualifier.previous_sibling.text):
        results.append(soup.new_tag('matters'))
        file_tag = soup.new_tag('court-file')
        file_tag.string = next_qualifier.previous_sibling.text
        results[-1].append(file_tag)

    if value_on_same_line:
        qualifier_tag.string = next_qualifier.text.replace(remainder_text, '')
    elif (not (get_left(next_qualifier.next_sibling) > get_left(next_qualifier))
          and join_pattern.match(next_qualifier.text)):
        # Joining line with nothing further right after it: emit it and return.
        join_tag = soup.new_tag('matter-join')
        join_tag.string = next_qualifier.text
        results[-1].append(join_tag)
        return next_qualifier.next_sibling
    else:
        # The value lives on the element(s) after the qualifier.
        qualifier_tag.string = next_qualifier.text
        next_qualifier = next_qualifier.next_sibling
        segments.append(next_qualifier)

    segments.extend(find_until(next_qualifier, matter_pattern,
                               more_left=value_on_same_line,
                               use_left=not value_on_same_line))
    pieces = [remainder_text] + [segment.text for segment in segments]
    value_tag.string = u' '.join([piece for piece in pieces if piece])

    # Ensures there is always a last segment to step past.
    segments.insert(0, next_qualifier)
    next_qualifier = segments[-1].next_sibling
    if next_qualifier and is_left_aligned(next_qualifier):
        next_qualifier = next_qualifier.next_sibling

    matter_tag.append(qualifier_tag)
    matter_tag.append(value_tag)
    results[-1].append(matter_tag)
    return next_qualifier
def find_versus(soup, start):
    """Fallback party extraction built around a standalone "v" line.

    Meant for the case where ``find_parties`` fails.  ``find_intituling``
    looks for an element whose text is only "v" or "V" (surrounding
    whitespace allowed).  Its previous sibling is taken as the plaintiff;
    its next sibling, together with the result of
    ``find_until(..., use_left=False)``, is joined into the defendant text.

    Returns:
        ``([], <falsy result>)`` if there is no "v" element; otherwise
        ``([parties], next_element)``, where ``parties`` holds the keys
        ``'plantiffs'``, ``'court-file'`` and ``'defendants'`` and
        ``next_element`` follows the last defendant element.
    """
    separator = find_intituling(start, re.compile(r'^\s*[vV]\s*$'))
    if not separator:
        return [], separator

    before = separator.previous_sibling
    parties = {'plantiffs': [{'value': before.string, 'versus': True}]}
    parties['court-file'] = court_file_before(soup, before)

    after = separator.next_sibling
    defendant_els = [after] + find_until(after, use_left=False)
    parties['defendants'] = [{
        'value': ' '.join(el.text for el in defendant_els),
        'versus': True,
    }]
    return [parties], defendant_els[-1].next_sibling
def find_registry(soup):
    """Return the registry text that follows the court heading.

    The element matching ``court_reg`` is located with ``find_reg_el`` and
    handed to ``find_until(start, None, use_left=True)``.

    Returns a one-item list holding the text of the last element
    ``find_until`` produced, or ``None`` when it produced nothing.
    """
    court_el = find_reg_el(soup, court_reg)
    following = find_until(court_el, None, use_left=True)
    if not following:
        return None
    return [following[-1].text]
def find_parties(soup, start):
    """Parse the parties of the intituling that begins at ``start``.

    Starting from the element ``find_intituling`` returns for
    ``start_qualifier_pattern``, the function walks the siblings for as long
    as their text matches ``qualifier_pattern``.  Each qualifier's name and
    descriptor lines are gathered with ``find_until`` and filed under
    ``'plantiffs'``, ``'thirdparties'`` or ``'defendants'`` according to the
    descriptor text.  A run of court-file numbers between qualifiers starts
    a new party group.

    Args:
        soup: parsed document, passed on to ``court_file_before``.
        start: element from which ``find_intituling`` begins its search.

    Returns:
        A tuple ``(parties, next_element)``.  ``parties`` is a list of dicts
        with the keys ``'plantiffs'``, ``'defendants'``, ``'thirdparties'``
        and ``'court-file'`` (an empty list if no party was recorded), and
        ``next_element`` is the first sibling that was not consumed (may be
        ``None``).

    Raises:
        AttributeError: if no starting qualifier element is found.
        StopIteration: if a group of name/descriptor lines contains no line
            with lower-case characters (behaviour kept from the original).
    """
    # One-item list so the nested helpers can set the flag.
    found = [False]
    party_dict = {
        'plantiffs': [],
        'defendants': [],
        'thirdparties': [],
        'court-file': None
    }
    parties = [copy.deepcopy(party_dict)]

    plantiff_pattern = re.compile('.*(Plaintiff|Applicant|Appellant|Insolvent)s?')
    thirdparty_pattern = re.compile('.*(Third [Pp]arty|Third [Pp]arties|interested party)')

    def get_group(descriptor):
        """Map a descriptor string to the party-dict key it belongs under."""
        if plantiff_pattern.match(descriptor):
            return 'plantiffs'
        if thirdparty_pattern.match(descriptor):
            return 'thirdparties'
        return 'defendants'

    def split_by_capital(column):
        """Split ``column`` at the first entry whose text is not all upper-case."""
        # No default on purpose: an all upper-case column still raises
        # StopIteration, as the original ``.next()`` call did.
        i = next(i for i, v in enumerate(column) if v.text.upper() != v.text)
        return (column[0:i], column[i:])

    def add_persons(qualifier, column):
        """Record one party from an upper-case name plus its descriptor lines."""
        name, descriptor = split_by_capital(column)
        name = ' '.join([n.text for n in name])
        descriptor = ' '.join([d.text for d in descriptor])
        group = get_group(descriptor)
        found[0] = True
        parties[-1][group].append({
            'qualifier': qualifier,
            'value': name,
            'descriptor': descriptor
        })

    def split_courtfiles(qualifier, descriptor, column):
        """Record parties whose names are interleaved with court-file numbers."""
        i = 0
        group = get_group(descriptor)
        appended = False
        while i < len(column):
            name = []
            courtfiles = []
            # Name lines run until the first line carrying a court-file number.
            while i < len(column) and not courtfile_num_std_embed.search(column[i].text):
                name.append(column[i].text)
                i += 1
            # Bounds check must come before column[i] is read; it used to
            # follow that access and so could never prevent the IndexError.
            if not i < len(column):
                break
            remainder_name = re.sub(courtfile_num_std_embed, '', column[i].text).strip()
            if remainder_name:
                name.append(remainder_name)
            courtfiles.append(column[i].text.replace(remainder_name, ''))
            i += 1
            while i < len(column) and courtfile_num_std_embed.match(column[i].text):
                courtfiles.append(column[i].text)
                i += 1
            parties[-1][group].append({
                'qualifier': qualifier,
                'value': ' '.join(name),
                'courtfile': courtfiles
            })
            # Only the first party of the run carries the qualifier.
            qualifier = None
            found[0] = True
            appended = True
        # The descriptor goes on the last party added by this call; skip it
        # when nothing was added so an unrelated earlier entry is not touched.
        if appended:
            parties[-1][group][-1]['descriptor'] = descriptor

    next_qualifier = find_intituling(start, start_qualifier_pattern)
    if not next_qualifier:
        raise AttributeError
    parties[-1]['court-file'] = court_file_before(soup, next_qualifier)

    while next_qualifier and qualifier_pattern.match(next_qualifier.text):
        remainder_text = re.sub(qualifier_pattern, '', next_qualifier.text).strip()
        segments = []
        if len(remainder_text):
            # The name starts on the qualifier's own line: wrap the leftover
            # text in a throwaway object exposing a ``text`` attribute.
            qualifier_text = next_qualifier.text.replace(remainder_text, '')
            first = lambda: None
            first.text = remainder_text
            segments += [first]
        else:
            qualifier_text = next_qualifier.text
            next_qualifier = next_qualifier.next_sibling
            segments += [next_qualifier]
        more_left = bool(len(remainder_text))
        use_left = not more_left
        segments += find_until(next_qualifier, more_left=more_left, use_left=use_left)

        # see ALESCO NEW ZEALAND LIMITED V COMMISSIONER OF INLAND REVENUE
        # HC AK CIV 2009-404- 2145 15 April 2011
        # has name running onto next line
        next_qualifier = segments[-1].next_sibling
        if (next_qualifier and courtfile_num.match(next_qualifier.text)
                and is_bold(next_qualifier) == is_bold(segments[-1])):
            parties[-1]['court-file'] = []
            segments = [segments[0]] + find_until(
                segments[0], use_left=False, more_equal_left=True)
            next_qualifier = segments[-1].next_sibling
            names, descriptor = split_by_capital(segments)
            descriptor = ' '.join([d.text for d in descriptor])
            split_courtfiles(qualifier_text, descriptor, names)
        else:
            # Must also split on lines that aren't all caps.
            splits = case_change_indices(segments)
            for seg in indexsplit(segments, *splits):
                add_persons(qualifier_text, seg)

        if (next_qualifier and not qualifier_pattern.match(next_qualifier.text)
                and courtfile_num.match(next_qualifier.text)):
            # Court-file numbers between qualifiers open a new party group.
            parties += [copy.deepcopy(party_dict)]
            court_files = []
            # Guard against running off the last sibling, as the outer loop does.
            while next_qualifier and courtfile_num.match(next_qualifier.text):
                court_files += [next_qualifier.text]
                next_qualifier = next_qualifier.next_sibling
            parties[-1]['court-file'] = court_files

    return parties if found[0] else [], next_qualifier
def find_counsel(soup):
    """Return the counsel lines that follow the counsel label.

    The element matching ``counsel_reg`` is located with ``find_reg_el``.
    It and the elements returned by ``find_until`` (called with a
    word-then-colon pattern and ``use_left=False``) each have their text
    stripped and every ``counsel_reg`` match removed.

    Returns the resulting strings with empty ones dropped.
    """
    pattern = counsel_reg
    label_el = find_reg_el(soup, pattern)
    elements = [label_el] + find_until(label_el, re.compile(r'\w+:'), use_left=False)
    cleaned = [re.sub(pattern, '', el.text.strip()) for el in elements]
    return [entry for entry in cleaned if entry]
def find_counsel(soup):
    """Return the counsel lines that follow the counsel label.

    ``find_reg_el`` supplies the element matching ``counsel_reg``.  For that
    element and each element ``find_until`` returns (called with a
    word-then-colon pattern and ``use_left=False``), the text is stripped
    and all ``counsel_reg`` matches are removed.

    Returns the non-empty results as a list.
    """
    pattern = counsel_reg
    anchor = find_reg_el(soup, pattern)
    stop_pattern = re.compile(r'\w+:')
    lines = []
    for el in [anchor] + find_until(anchor, stop_pattern, use_left=False):
        cleaned = re.sub(pattern, '', el.text.strip())
        if cleaned:
            lines.append(cleaned)
    return lines
def find_parties(soup, start):
    """Parse the parties of the intituling that begins at ``start``.

    From the element ``find_intituling`` returns for
    ``start_qualifier_pattern``, siblings are consumed while their text
    matches ``qualifier_pattern``.  The name and descriptor lines of each
    qualifier are collected with ``find_until`` and stored under
    ``'plantiffs'``, ``'thirdparties'`` or ``'defendants'`` depending on the
    descriptor text.  Court-file numbers found between qualifiers begin a
    new party group.

    Args:
        soup: parsed document, passed on to ``court_file_before``.
        start: element from which ``find_intituling`` begins its search.

    Returns:
        A tuple ``(parties, next_element)``.  ``parties`` is a list of dicts
        keyed by ``'plantiffs'``, ``'defendants'``, ``'thirdparties'`` and
        ``'court-file'`` (an empty list if no party was recorded), and
        ``next_element`` is the first sibling left unconsumed (may be
        ``None``).

    Raises:
        AttributeError: if no starting qualifier element is found.
        StopIteration: if a group of name/descriptor lines has no line
            containing lower-case characters (behaviour kept from the
            original).
    """
    # One-item list so the nested helpers can set the flag.
    found = [False]
    party_dict = {
        'plantiffs': [],
        'defendants': [],
        'thirdparties': [],
        'court-file': None
    }
    parties = [copy.deepcopy(party_dict)]

    plantiff_pattern = re.compile(
        '.*(Plaintiff|Applicant|Appellant|Insolvent)s?')
    thirdparty_pattern = re.compile(
        '.*(Third [Pp]arty|Third [Pp]arties|interested party)')

    def get_group(descriptor):
        """Map a descriptor string to the party-dict key it belongs under."""
        if plantiff_pattern.match(descriptor):
            return 'plantiffs'
        if thirdparty_pattern.match(descriptor):
            return 'thirdparties'
        return 'defendants'

    def split_by_capital(column):
        """Split ``column`` at the first entry whose text is not all upper-case."""
        # No default on purpose: an all upper-case column still raises
        # StopIteration, as the original ``.next()`` call did.
        i = next(i for i, v in enumerate(column) if v.text.upper() != v.text)
        return (column[0:i], column[i:])

    def add_persons(qualifier, column):
        """Record one party from an upper-case name plus its descriptor lines."""
        name, descriptor = split_by_capital(column)
        name = ' '.join([n.text for n in name])
        descriptor = ' '.join([d.text for d in descriptor])
        group = get_group(descriptor)
        found[0] = True
        parties[-1][group].append({
            'qualifier': qualifier,
            'value': name,
            'descriptor': descriptor
        })

    def split_courtfiles(qualifier, descriptor, column):
        """Record parties whose names are interleaved with court-file numbers."""
        i = 0
        group = get_group(descriptor)
        appended = False
        while i < len(column):
            name = []
            courtfiles = []
            # Name lines run until the first line carrying a court-file number.
            while i < len(column) and not courtfile_num_std_embed.search(
                    column[i].text):
                name.append(column[i].text)
                i += 1
            # Bounds check must come before column[i] is read; it used to
            # follow that access and so could never prevent the IndexError.
            if not i < len(column):
                break
            remainder_name = re.sub(courtfile_num_std_embed, '',
                                    column[i].text).strip()
            if remainder_name:
                name.append(remainder_name)
            courtfiles.append(column[i].text.replace(remainder_name, ''))
            i += 1
            while i < len(column) and courtfile_num_std_embed.match(
                    column[i].text):
                courtfiles.append(column[i].text)
                i += 1
            parties[-1][group].append({
                'qualifier': qualifier,
                'value': ' '.join(name),
                'courtfile': courtfiles
            })
            # Only the first party of the run carries the qualifier.
            qualifier = None
            found[0] = True
            appended = True
        # The descriptor goes on the last party added by this call; skip it
        # when nothing was added so an unrelated earlier entry is not touched.
        if appended:
            parties[-1][group][-1]['descriptor'] = descriptor

    next_qualifier = find_intituling(start, start_qualifier_pattern)
    if not next_qualifier:
        raise AttributeError
    parties[-1]['court-file'] = court_file_before(soup, next_qualifier)

    while next_qualifier and qualifier_pattern.match(next_qualifier.text):
        remainder_text = re.sub(qualifier_pattern, '',
                                next_qualifier.text).strip()
        segments = []
        if len(remainder_text):
            # The name starts on the qualifier's own line: wrap the leftover
            # text in a throwaway object exposing a ``text`` attribute.
            qualifier_text = next_qualifier.text.replace(remainder_text, '')
            first = lambda: None
            first.text = remainder_text
            segments += [first]
        else:
            qualifier_text = next_qualifier.text
            next_qualifier = next_qualifier.next_sibling
            segments += [next_qualifier]
        more_left = bool(len(remainder_text))
        use_left = not more_left
        segments += find_until(next_qualifier, more_left=more_left,
                               use_left=use_left)

        # see ALESCO NEW ZEALAND LIMITED V COMMISSIONER OF INLAND REVENUE
        # HC AK CIV 2009-404- 2145 15 April 2011
        # has name running onto next line
        next_qualifier = segments[-1].next_sibling
        if (next_qualifier and courtfile_num.match(next_qualifier.text)
                and is_bold(next_qualifier) == is_bold(segments[-1])):
            parties[-1]['court-file'] = []
            segments = [segments[0]] + find_until(
                segments[0], use_left=False, more_equal_left=True)
            next_qualifier = segments[-1].next_sibling
            names, descriptor = split_by_capital(segments)
            descriptor = ' '.join([d.text for d in descriptor])
            split_courtfiles(qualifier_text, descriptor, names)
        else:
            # Must also split on lines that aren't all caps.
            splits = case_change_indices(segments)
            for seg in indexsplit(segments, *splits):
                add_persons(qualifier_text, seg)

        if (next_qualifier
                and not qualifier_pattern.match(next_qualifier.text)
                and courtfile_num.match(next_qualifier.text)):
            # Court-file numbers between qualifiers open a new party group.
            parties += [copy.deepcopy(party_dict)]
            court_files = []
            # Guard against running off the last sibling, as the outer loop does.
            while next_qualifier and courtfile_num.match(next_qualifier.text):
                court_files += [next_qualifier.text]
                next_qualifier = next_qualifier.next_sibling
            parties[-1]['court-file'] = court_files

    return parties if found[0] else [], next_qualifier
def find_registry(soup):
    """Return the registry text that follows the court heading.

    ``find_reg_el`` locates the element matching ``court_reg``; that element
    is passed to ``find_until(start, None, use_left=True)``.

    Returns a one-item list with the text of the last element ``find_until``
    returned, or ``None`` if it returned nothing.
    """
    heading = find_reg_el(soup, court_reg)
    trailing = find_until(heading, None, use_left=True)
    return [trailing[-1].text] if trailing else None