Example #1
    def get_internal_references(self):
        # Collect internal references from every paragraph of every section
        # of the statute text.
        references = []
        sections = splt.split_section(self.statue, len(self.statue))
        for section_id, section_span in enumerate(sections, 1):
            section_text = self.statue[section_span[0]:section_span[1]]
            paragraphs = splt.split_paragraph(section_text, len(section_text))
            for paragraph_id, paragraph_span in enumerate(paragraphs, 1):
                paragraph_text = section_text[
                    paragraph_span[0]:paragraph_span[1]]
                # Patterns are tried in order; each matched span is removed
                # from the text so later, more general patterns cannot
                # re-match it.
                for i, p in enumerate(pts.internal_pattern_order()):
                    matches = re.findall(p, paragraph_text)
                    if matches:
                        tmp = []
                        for m in matches:
                            if i == 0:
                                # Fully qualified reference.
                                tmp.append(shp.internal_ref(m[0], m[1], m[2]))
                            elif i == 1:
                                # Paragraph not given: use the current one.
                                tmp.append(
                                    shp.internal_ref(m[0], paragraph_id, m[1]))
                            elif i == 2:
                                # Third component not given: default to '0'.
                                tmp.append(shp.internal_ref(m[0], m[1], '0'))
                            elif i == 3:
                                # Section not given: use the current one.
                                tmp.append(
                                    shp.internal_ref(section_id, m[0], '0'))
                        references.append(tmp)
                        paragraph_text = re.sub(p, '', paragraph_text)

        return Counter(shp.flatten(references))
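
# A minimal usage sketch; the enclosing class is not shown above, so the name
# below is hypothetical and is assumed to store the raw statute text in
# self.statue:
#
#     act = SomeStatuteClass(raw_text)
#     refs = act.get_internal_references()
#     refs.most_common(3)   # the three most frequently referenced targets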
Example #2
    def get_external_references(self) -> List[Tuple[str, str, str]]:
        # Strip the journal header first; only the trimmed text is reused,
        # so the match flag from this call is deliberately discarded.
        _, _, trimmed = splt.match_and_trim(self.statue, pts.journal())
        references = []
        while True:
            is_match, text_match, trimmed = splt.match_and_trim(
                trimmed, pts.external_reference())
            if not is_match:
                # Fall back to the bracketless reference pattern.
                is_match, text_match, trimmed = splt.match_and_trim(
                    trimmed, pts.external_reference_ketless())
                if not is_match:
                    break

            text = splt.clean_external(text_match)
            by_year: List[Tuple[str, str]] = splt.split_year(text)
            for year in by_year:
                by_nr: List[Tuple[str, str]] = splt.split_nr(year[1])
                for nr in by_nr:
                    pos: List[str] = splt.split_pos(nr[1])
                    y = year[0]
                    n = nr[0]
                    # Replace malformed years/numbers with sentinel values.
                    if not re.match(r'\d{4}', y):
                        y = '0000'
                    if not re.match(r'\d+', n):
                        n = '000'
                    references.append((y, n, pos))

        return shp.flatten(list(map(shp.flatten_references, references)))
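
# The normalization step above can be exercised in isolation; a self-contained
# check using only the stdlib (the sample inputs are illustrative):
#
#     import re
#     for y, n in [('1997', '78'), ('Dz.U.', '78'), ('1997', 'poz.')]:
#         y = y if re.match(r'\d{4}', y) else '0000'
#         n = n if re.match(r'\d+', n) else '000'
#         print((y, n))  # ('1997', '78'), then ('0000', '78'), then ('1997', '000')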
Example #3
import requests
from collections import namedtuple

def get_relations_from(synset_id):
	return query_wordnet('synsets/' + str(synset_id) + '/relations/from')

def query_wordnet(url):
	resp = requests.get('http://api.slowosiec.clarin-pl.eu/plwordnet-api/' + url)
	return resp.json()

# get_senses_from_word, get_synset_from_sense and get_senses_from_synset are
# analogous wrappers over query_wordnet, defined elsewhere in the project.

#%% 1. Meanings of the word 'szkoda' and all their synonyms
SenseDesc = namedtuple('sen', ['id', 'desc'])

meanings = get_senses_from_word('szkoda')
id_desc = [SenseDesc(r['id'], r['domain']['description']) for r in meanings]
print(id_desc)

synset_ids = (get_synset_from_sense(sen.id)['id'] for sen in id_desc)
senses = shp.flatten([get_senses_from_synset(syn_id) for syn_id in synset_ids])
synonyms = [sense['lemma']['word'] for sense in senses]
synonyms

#%% 2.1 Closure of the hypernymy relation for the first meaning of 'wypadek drogowy'
sen_id = get_senses_from_word('wypadek drogowy')[0]['id']
syn_id = get_synset_from_sense(sen_id)['id']
ids = [syn_id]

# Relation id 10 is used here to identify hypernymy.
hypernymy = lambda rel: rel['relation']['id'] == 10

# Newly found hypernyms are appended to `ids` while it is being iterated,
# so they are visited in turn until the closure stops growing.
for id in ids:
	relation_ids = [rel['synsetTo']['id'] for rel in get_relations_from(id) if hypernymy(rel)]
	for i in relation_ids:
		if i not in ids:
			ids.append(i)
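
#%% Aside: growing a list while iterating over it is well-defined in Python;
# the loop above uses this to compute the transitive closure. A self-contained
# toy illustration (the graph is made up):
edges = {1: [2, 3], 2: [4], 3: [], 4: []}
closure = [1]
for node in closure:
	for nxt in edges[node]:
		if nxt not in closure:
			closure.append(nxt)
print(closure)  # [1, 2, 3, 4]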
Example #4
import os
import re
from collections import Counter
from typing import List

import numpy as np

# analyze, get_shingles and the shapers module are defined elsewhere in the
# project; get_tokens keeps only tokens the analyzer typed as alphanumeric.
get_tokens = lambda text: [
    x['token'] for x in text if x['type'] == '<ALPHANUM>']
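
# A quick sanity check with a hand-built token list shaped like the analyzer
# output assumed above (only the two fields that get_tokens reads; the values
# are made up):
sample = [{'token': 'ustawa', 'type': '<ALPHANUM>'},
          {'token': '1997', 'type': '<NUM>'}]
print(get_tokens(sample))  # ['ustawa']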

#%% 1.1 Compute bigram counts over the corpus
resource_path = 'resources/ustawy'
doc_shingles: List[List[str]] = []
doc_tokens: List[List[str]] = []
for filename in os.listdir(resource_path):
    with open(resource_path + '/' + filename, 'r', encoding='utf-8') as document:
        legislation = document.read()
        doc = analyze(legislation)['tokens']
        doc_shingles.append(get_shingles(doc))
        doc_tokens.append(get_tokens(doc))

shingles = shapers.flatten(doc_shingles)
tokens = shapers.flatten(doc_tokens)

#%%
text_only = lambda x: not re.search(r'\d+', x)
make_tuple = lambda x: tuple(x.split(' '))

#%% 1.2 Filter shingles and tokens - text only (entries containing digits are
# dropped; lowercasing and punctuation removal already happen in the analyzer)
shingle_freq = Counter(map(make_tuple, filter(text_only, shingles)))
tokens_freq = Counter(filter(text_only, tokens))

#%% 2. Pointwise Mutual Information score
# Probabilities are relative frequencies: counts divided by the total number
# of occurrences (dividing by the number of distinct types, as a naive
# version might, would not yield valid probabilities).
total_tokens = sum(tokens_freq.values())
total_shingles = sum(shingle_freq.values())
p_t = lambda token: tokens_freq[token] / total_tokens
p_s = lambda shingle: shingle_freq[shingle] / total_shingles
pmi = lambda x, y: np.log(p_s((x, y)) / (p_t(x) * p_t(y)))
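
#%% A usage sketch: rank bigrams by PMI. The minimum-count threshold is an
# added assumption (PMI is unstable for rare bigrams), not part of the
# original computation.
frequent = (s for s, c in shingle_freq.items() if c >= 5)
top = sorted(frequent, key=lambda s: pmi(s[0], s[1]), reverse=True)
print(top[:10])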
Example #5
import os
import re
from typing import List

import requests

text_only = lambda x: not re.search(r'\d+', x)
make_tuple = lambda x: tuple(x.split(' '))

#%% Bigram counts over the corpus
resource_path = 'resources/ustawy'
doc_shingles: List[List[str]] = []
doc_tokens: List[List[str]] = []
for filename in os.listdir(resource_path):
    with open(resource_path + '/' + filename, 'r', encoding='utf-8') as document:
        legislation = document.read()
        doc = analyze(legislation)['tokens']
        doc_shingles.append(get_shingles(doc))
        doc_tokens.append(get_tokens(doc))

#%%
# map/filter return lazy iterators, so shingles and tokens can each be
# consumed only once; wrap them in list() if they are needed more than once.
shingles = map(make_tuple, filter(text_only, shapers.flatten(doc_shingles)))
tokens = filter(text_only, shapers.flatten(doc_tokens))

#%% Morphosyntactic tagging - krnnt
# shapers.pairs groups the response lines in twos; the second line of each
# pair carries tab-separated fields with the lemma and the full tag.
split = lambda x: x[1].split('\t')[1:3]
twos = lambda x: len(x) == 2
# Reduce the full morphosyntactic tag to its coarse part of speech.
tuple_split = lambda x: (x[0], x[1].split(':')[0])
# True when both members of a (lemma, tag) bigram are nouns.
composed = lambda a, b: a[1] == "subst" and b[1] == "subst"

def krnnt(text):
    # Tag `text` with a local KRNNT instance and return the (lemma, coarse
    # tag) pair of the first token in the response.
    response = requests.post('http://localhost:9200', text.encode("utf-8")) \
            .content.decode("utf-8") \
            .split("\n")
    return list(map(tuple_split, filter(twos, map(split, shapers.pairs(response)))))[0]
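
#%% A usage sketch; assumes a KRNNT tagger is listening on localhost:9200 as
# above. The result is the (lemma, coarse tag) pair of the first token, e.g.
# roughly ('szkoda', 'subst') - the exact value depends on the tagger model.
print(krnnt('szkoda'))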

#%%