def add_new_meta(meta: Metadata):
    """
    Appends a new metadata row to data/meta/allmeta.csv.

    Columns, separated by '#', as in the file header:
    Назив прописа#ELI#Напомена издавача#Додатне информације#Врста прописа#Доносилац#Област#Група#Датум усвајања#Гласило и датум објављивања#Датум ступања на снагу основног текста#Датум примене#Правни претходник#Издавач#filename#Верзија на снази од#Почетак примене верзије#Број акта

    :param meta: metadata of the act to store
    :return: None, appends one line to the meta file
    """
    deli = "#"
    fields = [meta.act_name, meta.eli, meta.napomena_izdavaca, meta.dodatne_informacije,
              meta.vrsta_propisa, meta.donosilac, meta.oblast, meta.grupa,
              meta.datum_usvajanja, meta.glasilo_i_datum, meta.datum_stupanja,
              meta.pravni_prethodnik, meta.izdavac, meta.filename]
    with open(utilities.get_root_dir() + "/data/meta/allmeta.csv", mode="a",
              encoding="utf-8") as file_meta:
        file_meta.write(deli.join(fields) + "\n")
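# A minimal usage sketch (illustrative only): assumes Metadata() takes no
# constructor arguments and is populated attribute by attribute; all values
# below are hypothetical placeholders, not real records.
def _example_add_new_meta():
    meta = Metadata()
    meta.act_name = "Закон о раду"        # Назив прописа
    meta.vrsta_propisa = "закон"          # Врста прописа
    meta.donosilac = "Народна скупштина"  # Доносилац
    meta.filename = "1.html"              # 15th column; used for later lookups
    # ... set the remaining attributes (eli, oblast, grupa, ...) the same way;
    # add_new_meta joins every listed attribute, so each one must be a string
    add_new_meta(meta)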
def generate_owl(folder_path, filenames=None):
    result = get_tf_idf_values_document(folder_path, filenames=filenames, return_just_words=False)
    stat_file = open("stats.txt", "w", encoding="utf-8")
    for el in result:
        s_file = el[0]
        f = open(folder_path + "\\" + s_file, "r", encoding="utf-8")
        info = "".join(f.readlines())
        f.close()
        # for q in el[1]:
        #     print(q)
        stat_len = len(re.findall(r'\w+', info))
        clans_data = util.from_content_to_act_list(info)
        clan_info = util.gather_clans(info)
        # Open the file, find the structures, generate the articles, add them
        # print(clans_data)
        meta = utilities.get_meta(check_meta(s_file),
                                  utilities.get_root_dir() + "\\data\\meta\\allmeta.csv")
        if meta is None:
            print("Warn - " + el[0] + " missing meta")
            continue
        latin_name = to_latin(meta.act_name).replace(' ', '_')
        # latin_name = meta.act_name.replace(" ", '_')
        dis = {}
        curr_zakon = owl.add_legal_resource(latin_name)
        add_meta_to_act(curr_zakon, meta)
        i = 0
        for info in clan_info:
            curr_sub = owl.add_legal_sub(latin_name.split(":")[0] + '_' + info.replace(' ', '_'))
            is_about = inside_important(el[1], clans_data, i)
            for new_concept in is_about:
                if new_concept not in dis:
                    dis[new_concept] = owl.add_concept(new_concept)
            if len(is_about) != 0:  # was: is_about.__len__ != 0, which is always truthy
                curr_sub.is_about = [dis[s] for s in is_about]
            curr_sub.is_part_of = [curr_zakon]
            i += 1
        curr_zakon.is_about = [dis[s] for s in dis]
        write_stat_info(curr_zakon, dis, stat_file, stat_len)
    stat_file.close()
    owl.save()
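# For orientation only: a guess at inside_important's contract, inferred from
# the call above. NOT the project's actual implementation. Assumes el[1] holds
# (word, score) tf-idf pairs and clans_data[i] holds the text of article i.
def _inside_important_sketch(tfidf_pairs, clans_data, i):
    article_text = clans_data[i]
    # keep the document-level tf-idf words that actually occur in this article
    return [word for word, score in tfidf_pairs if word in article_text]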
def apply_akn_tags(text: str, meta_name: str, skip_tfidf_ner=False, ner="crf", meta_data=None):
    """
    Applies Akoma Ntoso 3.0 tags to the text of Republic of Serbia regulations.

    :param text: HTML or plain text
    :param meta_name: the name under which the metadata was added (the 15th field,
        filename, in the meta file); use add_new_meta in MetadataBuilder or add the
        entry manually to Akoma/data/meta/allmeta.csv
    :param skip_tfidf_ner: if True, skips adding references (TLCConcept for the
        document and TLC entries for NER), which speeds up execution considerably
    :param ner: chooses the NER model; one of 'crf', 'spacy', 'spacy_default',
        'reldi'; 'crf' is the best so far but the slowest
    :param meta_data: instance of form_akoma/Metadata.py; if passed, meta_name is
        not used for lookup, because allmeta.csv is not searched and all data from
        meta_data is used instead
    :return: labeled XML string
    """
    global ner_list
    akoma_root = init_akoma.init_xml("act")
    repaired = False
    if text.find("<") == -1:
        repaired = True
    else:
        text = regex_patterns.strip_html_tags_exept(text)
    if not repaired:
        try:
            html_root = ET.fromstring("<article>" + text + "</article>")
        except Exception:
            got = BeautifulSoup(text, "lxml")
            text = got.prettify().replace("<html>", "").replace(
                "</html>", "").replace("<body>", "").replace("</body>", "")
            html_root = ET.fromstring("<article>" + text + "</article>")
    metabuilder = MetadataBuilder("data/meta/allmeta.csv")
    metabuilder.build(meta_name, akoma_root, skip_tfidf_ner)
    # print(ETree.prettify(akoma_root))
    builder = AkomaBuilder(akoma_root)
    if not repaired:
        reasoner = BasicReasoner(HTMLTokenizer(html_root), builder)
    else:
        reasoner = BasicReasoner(BasicTokenizer(text), builder)
    reasoner.start(metabuilder)
    if reasoner.current_hierarchy[4] == 0:
        akoma_root = init_akoma.init_xml("act")
        metabuilder = MetadataBuilder("data/meta/allmeta.csv")
        if meta_data is None:
            metabuilder.build(meta_name, akoma_root, skip_tfidf=skip_tfidf_ner)
        else:
            metabuilder.build(meta_name, akoma_root, skip_tfidf=skip_tfidf_ner, passed_meta=meta_data)
        builder = AkomaBuilder(akoma_root)
        if not repaired:
            reasoner = OdlukaReasoner(HTMLTokenizer(html_root), builder)
        else:
            reasoner = OdlukaReasoner(BasicTokenizer(text), builder)
        reasoner.start(metabuilder)
    # Protect entities already present in the builder output before prettifying.
    result_str = builder.result_str().replace("&lt;", "~manje;").replace(
        "&gt;", "~vece;").replace("&quot;", "~navod;")
    if not skip_tfidf_ner:
        send_to_NER(akoma_root)
        if ner == "crf":
            map_ret = do_ner_on_sentences(ner_list)
        elif ner == "spacy":
            map_ret = do_spacy_ner(ner_list, custom=True)
        elif ner == "spacy_default":
            map_ret = do_spacy_ner(ner_list, custom=False)
        elif ner == "reldi":
            map_ret = {}
            print("Waiting for access to reldi NER from devs, TODO for future")
            exit(-1)
        if ner in ("crf", "spacy", "spacy_default", "reldi"):
            fix_dates(map_ret)
            events = utilities.regex_events(
                regex_patterns.strip_html_tags(text))
            utilities.entities_add_date(map_ret, events)  # regex-detected dates
            add_ner_tags(map_ret, akoma_root, metabuilder)
        ner_list.clear()
    try:
        result_stablo = add_refs(akoma_root, result_str, metabuilder.uri_expression)
    except Exception as e:
        file_ref_exeption = open(utilities.get_root_dir() + "/data/" + "za_ninu.txt", mode="a+")
        file_ref_exeption.write(meta_name + ":" + str(e) + "\n")
        file_ref_exeption.close()
        return result_str
    result_str = ETree.prettify(result_stablo).replace("&lt;", "<") \
        .replace("&gt;", ">").replace("&quot;", "\"").replace('<references source="#somebody"/>', "")
    result_str = result_str.replace("~vece;", "&gt;").replace(
        "~manje;", "&lt;").replace("~navod;", "&quot;")
    return result_str
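# A hypothetical end-to-end call (illustrative only): assumes "1.html" exists
# under data/acts and has a matching row in data/meta/allmeta.csv.
def _example_apply_akn_tags():
    with open(utilities.get_root_dir() + "/data/acts/1.html", "r", encoding="utf-8") as f:
        raw_text = f.read()
    labeled_xml = apply_akn_tags(raw_text, "1.html", skip_tfidf_ner=True, ner="crf")
    print(labeled_xml[:500])  # preview of the Akoma Ntoso output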
if __name__ == "__main__":
    nastavi = "1.html"      # resume processing from this file
    only_annotated = True   # just do annotated files
    idemo = False
    stani = [
        "1005.html", "980.html", "986.html", "981.html", "210.html",
        "1033.html",  # problematic, CHECK 176
        "180.html"
    ]  # large files
    location_source = utilities.get_root_dir() + "/data/acts"
    annotated_source = utilities.get_root_dir() + "/data/annotated"
    fajls = utilities.sort_file_names(os.listdir(location_source))
    if only_annotated is True:
        fajls = utilities.sort_file_names(os.listdir(annotated_source))
        fajls = [el.replace(".xml", ".html") for el in fajls]
        idemo = True
    for fajl in fajls:
        if fajl == nastavi:
            idemo = True
        if not idemo:
            continue
        if fajl in stani:  # skip known problematic/large files
            continue
        # if fajl != "2.html":
        except FileNotFoundError:
            print(">Error tf-idf FileNotFoundError:" + check)
            continue
        all_lines = "".join(file.readlines())
        list_words = get_tf_idf_values_from_text(all_lines, return_just_words=return_just_words,
                                                 threshold=threshold, max_elements=max_elements,
                                                 latin=latin, debug=debug)
        if with_file_names:
            results.append([filename, list_words])
        else:
            results.append(list_words)
        if debug:
            print(results[-1])
    return results


if __name__ == '__main__':
    # filenames, folderPath = get_file_names("data", "aktovi_raw_lat")
    filenames = ["1.html", "2.html"]
    path_folder = utilities.get_root_dir().replace("\\", "/") + "/data/acts"
    tf_idf_values = get_tf_idf_values_document(path_folder, filenames=filenames,
                                               return_just_words=False,
                                               with_file_names=True, latin=False)
    got_file = open(path_folder + "/" + filenames[0], mode="r", encoding="utf-8")
    text = "".join(got_file.readlines())
    got_file.close()
    # tf_idf_val2 = get_tf_idf_values_from_text(text, return_just_words=True, latin=False)
    # print(tf_idf_val2)
    print(tf_idf_values)
    for el in tf_idf_values:
        print(el[0])  # file name, present because with_file_names=True
        print(el[1])  # tf-idf words/values for that file
from os import path

from owlready2 import *  # provides get_ontology and ontology access

try:
    from Akoma.utilities.utilities import get_root_dir
except ModuleNotFoundError:
    try:
        from utilities.utilities import get_root_dir
    except ModuleNotFoundError:
        print("Error in modules")
        exit(-1)

cls_legal_resource = "LegalResource"
cls_legal_resource_sub = "LegalResourceSubdivision"
p_is_about = "is_about"
pather = path.dirname(__file__)
onto_path = get_root_dir() + "\\semanticki\\"
onto = get_ontology(onto_path + "eli.rdf")
onto.load()
skos = onto.get_namespace("http://www.w3.org/2004/02/skos/core")
concept_class = [s for s in onto.Language.ancestors() if s.name == "Concept"][0]


def save():
    onto.save("output.rdf")


def add_instance(class_name, instance_name):
    # Look the class up on the ontology by name instead of eval-ing a string.
    return getattr(onto, class_name)(instance_name)
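# A short usage sketch with hypothetical names: creates one LegalResource
# individual, one skos Concept, links them via the ELI is_about property,
# and persists the ontology to output.rdf.
def _example_usage():
    zakon = add_instance(cls_legal_resource, "Zakon_o_radu")
    pojam = concept_class("rad")  # skos:Concept individual
    zakon.is_about = [pojam]
    save()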
if __name__ == '__main__':
    from os import listdir
    from os.path import isfile, join

    base_path = utilities.get_root_dir()
    folder_path = base_path + "/data/lat_acts"
    only_files = [f for f in listdir(folder_path) if isfile(join(folder_path, f))]
    ordered = [str(el) + ".txt" for el in range(1, 200)]
    generate_owl(folder_path, filenames=ordered)
    # generate_owl(folder_path, filenames=only_files[:10])
    # generate_owl(folder_path, filenames=["86.txt", "200.txt"])