def get_internal_references(self):
    references = []
    sections = splt.split_section(self.statue, len(self.statue))
    for section_id, section_span in enumerate(sections, 1):
        section_text = self.statue[section_span[0]:section_span[1]]
        paragraphs = splt.split_paragraph(section_text, len(section_text))
        for paragraph_id, paragraph_span in enumerate(paragraphs, 1):
            paragraph_text = section_text[paragraph_span[0]:paragraph_span[1]]
            # Patterns are ordered from most to least specific; matched
            # text is stripped after each pass so the broader patterns
            # below do not re-match the same reference.
            for i, p in enumerate(pts.internal_pattern_order()):
                matches = re.findall(p, paragraph_text)
                if matches:
                    tmp = []
                    for m in matches:
                        # Fill missing paragraph/point fields from the
                        # current context, or '0' when absent; ids are
                        # cast to str so references compare consistently.
                        if i == 0:
                            tmp.append(shp.internal_ref(m[0], m[1], m[2]))
                        elif i == 1:
                            tmp.append(
                                shp.internal_ref(m[0], str(paragraph_id), m[1]))
                        elif i == 2:
                            tmp.append(shp.internal_ref(m[0], m[1], '0'))
                        elif i == 3:
                            tmp.append(
                                shp.internal_ref(str(section_id), m[0], '0'))
                    references.append(tmp)
                paragraph_text = re.sub(p, '', paragraph_text)
    return Counter(shp.flatten(references))
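#%% Hypothetical sketch of the `shp` (shapers) helpers the method above
# relies on; the real module may differ in detail. An internal
# reference is assumed to be a (section, paragraph, point) triple with
# '0' marking a missing field, and `flatten` to remove one level of
# nesting before counting.
from collections import namedtuple

internal_ref = namedtuple('internal_ref', ['section', 'paragraph', 'point'])


def flatten(nested):
    # [[a, b], [c]] -> [a, b, c]
    return [item for sub in nested for item in sub]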
def get_external_references(self) -> List[Tuple[str, str, str]]:
    # Strip the journal header first, then repeatedly match and cut
    # external references out of the remaining text.
    _, _, trimmed = splt.match_and_trim(self.statue, pts.journal())
    references = []
    while True:
        is_match, text_match, trimmed = splt.match_and_trim(
            trimmed, pts.external_reference())
        if not is_match:
            is_match, text_match, trimmed = splt.match_and_trim(
                trimmed, pts.external_reference_ketless())
        if not is_match:
            break
        text = splt.clean_external(text_match)
        by_year: List[Tuple[str, str]] = splt.split_year(text)
        for year in by_year:
            by_nr: List[Tuple[str, str]] = splt.split_nr(year[1])
            for nr in by_nr:
                pos: List[str] = splt.split_pos(nr[1])
                y = year[0]
                n = nr[0]
                # Normalize malformed years and numbers to sentinels.
                if not re.match(r'\d{4}', y):
                    y = '0000'
                if not re.match(r'\d+', n):
                    n = '000'
                references.append((y, n, pos))
    return shp.flatten(list(map(shp.flatten_references, references)))
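#%% Assumed contract of splt.match_and_trim (hypothetical sketch; the
# real splitters module may differ): report whether `pattern` occurs in
# `text`, return the matched fragment, and return the text with that
# fragment cut out, so the loop above can keep consuming references
# until none remain.
import re
from typing import Tuple


def match_and_trim(text: str, pattern: str) -> Tuple[bool, str, str]:
    m = re.search(pattern, text)
    if m is None:
        return False, '', text
    return True, m.group(0), text[:m.start()] + text[m.end():]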
def get_relations_from(synset_id):
    return query_wordnet('synsets/' + str(synset_id) + '/relations/from')


def query_wordnet(url):
    resp = requests.get(
        'http://api.slowosiec.clarin-pl.eu/plwordnet-api/' + url)
    return resp.json()


#%% 1. Find the meanings of the word 'szkoda' and display all their synonyms
SenseDesc = namedtuple('sen', ['id', 'desc'])

meanings = get_senses_from_word('szkoda')
id_desc = [SenseDesc(r['id'], r['domain']['description']) for r in meanings]
print(id_desc)

syn_ids = shp.flatten([get_senses_from_synset(syn)
                       for syn in (get_synset_from_sense(sen.id)['id']
                                   for sen in id_desc)])
synonyms = [syn['lemma']['word'] for syn in syn_ids]
synonyms

#%% 2.1 Closure of the hypernymy relation for the first meaning of 'wypadek drogowy'
sen_id = get_senses_from_word('wypadek drogowy')[0]['id']
syn_id = get_synset_from_sense(sen_id)['id']

ids = [syn_id]
# Relation id 10 is treated as hypernymy here.
hypernymy = lambda relation: relation['relation']['id'] == 10
for current_id in ids:  # `ids` grows while we iterate over it
    relation_ids = [rel['synsetTo']['id']
                    for rel in get_relations_from(current_id)
                    if hypernymy(rel)]
    for i in relation_ids:
        if i not in ids:
            ids.append(i)
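#%% The loop above computes a breadth-first transitive closure: `ids`
# grows while it is iterated, so newly discovered hypernyms are visited
# in turn. The same pattern, restated as a reusable helper (a sketch
# built only from the calls used above):


def relation_closure(start_id, relation_pred):
    seen = [start_id]
    for current in seen:  # `seen` grows during iteration
        for rel in get_relations_from(current):
            if relation_pred(rel):
                target = rel['synsetTo']['id']
                if target not in seen:
                    seen.append(target)
    return seen

# e.g. relation_closure(syn_id, hypernymy) reproduces `ids`.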
get_tokens = lambda text: list(
    map(lambda x: x['token'],
        filter(lambda x: x['type'] == '<ALPHANUM>', text)))

#%% 1.1 Compute bigram counts in the corpus
resource_path = 'resources/ustawy'
doc_shingles: List[List[str]] = []
doc_tokens: List[List[str]] = []
for filename in os.listdir(resource_path):
    with open(resource_path + '/' + filename, 'r') as document:
        legislation = document.read()
        doc = analyze(legislation)['tokens']
        doc_shingles.append(get_shingles(doc))
        doc_tokens.append(get_tokens(doc))

shingles = shapers.flatten(doc_shingles)
tokens = shapers.flatten(doc_tokens)

#%%
text_only = lambda x: not re.search(r'\d+', x)
make_tuple = lambda x: tuple(x.split(' '))

#%% 1.2 Filter shingles: keep text-only entries (lowercasing and
# punctuation removal are already handled by the analyzer)
shingle_freq = Counter(map(make_tuple, filter(text_only, shingles)))
tokens_freq = Counter(filter(text_only, tokens))

#%% 2. Pointwise Mutual Information score
# Estimate probabilities from raw counts, dividing by the total number
# of occurrences (not the number of distinct entries) so that each
# distribution sums to 1.
total_tokens = sum(tokens_freq.values())
total_shingles = sum(shingle_freq.values())
p_t = lambda token: tokens_freq[token] / total_tokens
p_s = lambda shingle: shingle_freq[shingle] / total_shingles
pmi = lambda x, y: np.log(p_s((x, y)) / (p_t(x) * p_t(y)))
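#%% Hypothetical usage of the PMI score above: rank bigrams by PMI,
# keeping only those seen at least a few times so that rare
# co-occurrences do not dominate (the threshold of 5 is an arbitrary
# illustration, not part of the original analysis).
frequent = [s for s, c in shingle_freq.items() if c >= 5 and len(s) == 2]
top_pmi = sorted(frequent, key=lambda s: pmi(*s), reverse=True)[:10]
print(top_pmi)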
text_only = lambda x: not re.search(r'\d+', x)
make_tuple = lambda x: tuple(x.split(' '))

#%% Bigram counts in the corpus
resource_path = 'resources/ustawy'
doc_shingles: List[List[str]] = []
doc_tokens: List[List[str]] = []
for filename in os.listdir(resource_path):
    with open(resource_path + '/' + filename, 'r') as document:
        legislation = document.read()
        doc = analyze(legislation)['tokens']
        doc_shingles.append(get_shingles(doc))
        doc_tokens.append(get_tokens(doc))

#%%
shingles = map(make_tuple, filter(text_only, shapers.flatten(doc_shingles)))
tokens = filter(text_only, shapers.flatten(doc_tokens))

#%% Morphosyntactic tagging - krnnt
split = lambda x: x[1].split('\t')[1:3]             # keep lemma and tag fields
twos = lambda x: len(x) == 2                        # drop malformed lines
tuple_split = lambda x: (x[0], x[1].split(':')[0])  # coarse POS: first tag segment
composed = lambda a, b: a[1] == "subst" and b[1] == "subst"


def krnnt(text):
    # Query a local KRNNT tagger instance and keep, for each analysed
    # form, its lemma and coarse part of speech. Only the first
    # (lemma, pos) pair is returned.
    response = requests.post('http://localhost:9200', text.encode("utf-8")) \
        .content.decode("utf-8") \
        .split("\n")
    return list(map(tuple_split,
                    filter(twos, map(split, shapers.pairs(response)))))[0]

#%%
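#%% Hypothetical sketch of shapers.pairs as the krnnt wrapper assumes
# it: KRNNT responds with each analysed form on one line and its
# lemma/tag analysis on the next, so pairing consecutive lines yields
# (form line, analysis line) tuples; the `twos` filter above then
# discards anything that does not split into exactly (lemma, tag).
# The real helper (and the exact KRNNT output layout) may differ.


def pairs(lines):
    return list(zip(lines[::2], lines[1::2]))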