def tokenize(data):
    """Collect lower-cased word tokens from the selected judgments.

    Every entry of ``data['items']`` accepted by ``reader.in_2008``
    (presumably a filter for judgments from 2008 -- confirm against
    ``reader``) has its ``'textContent'`` cleaned and split into words.
    The resulting tokens are added to the module-level ``words`` list.

    Args:
        data: Mapping with an ``'items'`` iterable of judgment mappings.
    """
    for judgment in data['items']:
        if reader.in_2008(judgment):
            # Drop anything that looks like a markup tag, then glue together
            # words that were hyphenated across a line break.
            plain = re.sub(
                r'-\n', '', re.sub(r'<[^>]*>', '', judgment['textContent']))
            # Only runs of ASCII/Polish letters count as words.
            words.extend(
                token.lower()
                for token in re.findall(
                    r'\b[a-zA-ZąęłćżźóńśŚĄĘŁĆŻŹÓŃ]+\b', plain)
            )
def tag_judgment(data):
    """Send selected judgments to the tagging service and store the replies.

    For every entry of ``data['items']`` accepted by ``reader.in_2008``
    the module-level ``count`` is incremented, the UTF-8 encoded
    ``'textContent'`` is POSTed to the module-level ``url``, and the
    response body is appended to ``tagged.txt``.

    Args:
        data: Mapping with an ``'items'`` iterable of judgment mappings.
    """
    global count
    for judgment in data['items']:
        if not reader.in_2008(judgment):
            continue
        count += 1
        response = requests.post(url, judgment['textContent'].encode('utf-8'))
        # The request body is UTF-8, so write the reply as UTF-8 as well
        # instead of relying on the platform's default locale encoding,
        # which may not be able to represent Polish characters.
        with open('tagged.txt', 'a', encoding='utf-8') as out:
            # The response body is a single string; write it in one call.
            out.write(response.text)
def find_references(data):
    """Find references to art. 445 of the Civil Code in selected judgments.

    For every entry of ``data['items']`` accepted by ``reader.in_2008``,
    the ``'text'`` of each of its ``'referencedRegulations'`` is searched
    for the act title "Ustawa z dnia 23 kwietnia 1964 r. - Kodeks cywilny"
    followed later by "art. 445". Each hit is a 2-tuple
    ``(act title, article fragment)`` and is added to the module-level
    list ``f``.

    Args:
        data: Mapping with an ``'items'`` iterable of judgment mappings.
    """
    # Compiled once, outside the loops. The dot after "art" is escaped so it
    # matches only a literal full stop, consistent with the "r\." earlier in
    # the same pattern.
    pattern = re.compile(
        r'(\bUstawa\b\s+\bz\b\s+\bdnia\b\s+\b23\b\s+\bkwietnia\b\s+\b1964\b\s+r\.\s+-\s+\bKodeks\b\s+\bcywilny\b)'
        r'[\S\s]+(\bart\.\s+\b445\b)')
    for item in data['items']:
        if not reader.in_2008(item):
            continue
        for reference in item['referencedRegulations']:
            f.extend(pattern.findall(reference['text']))
def find_szkoda(data):
    """Record one "szkoda"-like word per selected judgment.

    For every entry of ``data['items']`` accepted by ``reader.in_2008``,
    the ``'textContent'`` is scanned for words starting with
    ``szkod``/``szkód`` (either case of the first letter) followed by up to
    three word characters. The first candidate accepted by ``proper`` is
    appended to the module-level ``words`` list; later candidates in the
    same judgment are not examined.

    Args:
        data: Mapping with an ``'items'`` iterable of judgment mappings.
    """
    for judgment in data['items']:
        if reader.in_2008(judgment):
            candidates = re.findall(
                r'\b[Ss]zk[oó]d\w{0,3}\b', judgment['textContent'])
            # Stop at the first accepted form: at most one word per judgment.
            accepted = next(
                (form for form in candidates if proper(form)), None)
            if accepted is not None:
                words.append(accepted)
def find_values(data):
    """Extract amounts expressed in zloty from the selected judgments.

    For every entry of ``data['items']`` accepted by ``reader.in_2008``,
    the ``'textContent'`` is searched for monetary expressions. Each match
    (a tuple of the pattern's capture groups) is passed to ``parse_value``
    and the result is appended to the module-level ``money`` list.

    Args:
        data: Mapping with an ``'items'`` iterable of judgment mappings.
    """
    # Number such as '1203120', '123,1541.123,11321' or '123 2100 3210 3210'.
    number = (
        r'((\b\d+[\d,]*\b)|(\b\d+\s?[\d.]*,?\d{2}?\b)|(\b\d+[\d\s]+,?\d{2}?\b))\s?'
    )
    # Unit 'mld' | 'mln' | 'tys', or the amount spelled out in parentheses,
    # or the word "stare"/"starych" with or without parentheses; these
    # qualifiers may occur at most 3 times.
    qualifiers = (
        r'((\bmld\b\s?)|(\bmln\b\s?)|(\btys\b\s?)|(\b\(?star\w{1,3}\)?\b\s?)|(\([\w\s]+\)\s?)){0,3}'
    )
    # The currency word (a "złot..." form or the abbreviation "zł") comes last.
    currency = r'((\bzłot\w{1,3}\b)|(\bzł\.?\b))'
    pattern = number + qualifiers + currency

    for judgment in data['items']:
        if reader.in_2008(judgment):
            for groups in re.findall(pattern, judgment['textContent']):
                money.append(parse_value(groups))
def save_in_es(data):
    """Index the selected judgments through the module-level ``es`` client.

    For every entry of ``data['items']`` accepted by ``reader.in_2008`` a
    document is built with the judgment text, its parsed date, the case
    number of the first court case and the list of judge names. The
    document is stored in index ``'lab'`` under doc type ``'judgment'``.
    The module-level counter ``id`` supplies the document id and is
    incremented after each successful call. The judge names and the
    client's response are printed.

    Args:
        data: Mapping with an ``'items'`` iterable of judgment mappings.
    """
    # NOTE: ``id`` here is the module-level counter, which shadows the builtin.
    global id
    for judgment in data['items']:
        if reader.in_2008(judgment):
            judge_names = [entry['name'] for entry in judgment['judges']]
            document = {
                'text': judgment['textContent'],
                'judgmentDate': parse(judgment['judgmentDate']),
                # Only the first listed court case contributes its number.
                'caseNumber': judgment['courtCases'][0]['caseNumber'],
                'judges.name': judge_names,
            }
            print(judge_names)
            outcome = es.index(
                index='lab', doc_type='judgment', body=document, id=id)
            print(outcome)
            id += 1
def load_judgments(data):
    """Add the selected judgments to the module-level ``all_jugdments`` list.

    Entries of ``data['items']`` are kept, in their original order, only
    when ``reader.in_2008`` accepts them.

    Args:
        data: Mapping with an ``'items'`` iterable of judgment mappings.
    """
    all_jugdments.extend(
        document for document in data['items'] if reader.in_2008(document)
    )