def reload(self):
    """Reload the SCRDR rule tree and the NER CRF model from disk.

    Mirrors the loading performed in ``__init__``.

    Raises:
        FileNotFoundError: if the CRF model file does not exist.
    """
    cdr_file = get_data_file("named.rdr", folder="vietnamese")
    self.__root = SCRDRTree()
    self.__root.constructSCRDRtreeFromRDRfile(cdr_file)
    crf_file = get_data_file("ner.crf.bin", folder="models")
    if not path.isfile(crf_file):
        # Fix: the original logged the error and then opened the missing
        # file anyway; fail fast with an explicit, catchable exception.
        logging.error("Model %s not found " % crf_file)
        raise FileNotFoundError(crf_file)
    self.__crf: pycrfsuite.Tagger = pycrfsuite.Tagger()
    self.__crf.open(crf_file)
    if self.__debug:
        logging.info("Labels in model : %s" % str(self.__crf.labels()))
def __init__(self, debug=False):
    """Build the SCRDR rule tree and load the NER CRF model.

    Args:
        debug: kept on the instance for debug-conditional logging elsewhere.

    Raises:
        FileNotFoundError: if the CRF model file is missing.
    """
    cdr_file = get_data_file("named.rdr", folder="vietnamese")
    self.__root = SCRDRTree()
    self.__root.constructSCRDRtreeFromRDRfile(cdr_file)
    crf_file = get_data_file("ner.crf.bin", folder="models")
    if not path.isfile(crf_file):
        # Fix: replaced print() (duplicate of the log line) + exit()
        # (which kills the host process from library code) with a
        # catchable exception.
        logging.error("Model %s not found " % crf_file)
        raise FileNotFoundError("Model %s not found " % crf_file)
    self.__crf: pycrfsuite.Tagger = pycrfsuite.Tagger()
    self.__crf.open(crf_file)
    self.__nlp = None
    self.__debug = debug
    self.__adapter: DocFeatures = DocFeatures()
    logging.info("Labels in model(Semi_Supervised_Doc_Ner) : %s" % str(self.__crf.labels()))
def get_middle_names(self) -> set:
    """Lazily load and cache the set of normalized Vietnamese middle names.

    Returns:
        set: normalized middle names read from middle_names.txt.
    """
    if self.__middle_names is None:
        filename = get_data_file("middle_names.txt", folder="vietnamese")
        logging.info("%s loaded" % filename)
        # Consistency fix: skip blank lines, as the sibling loaders
        # (get_short_word, get_family_names, ...) already do; also drop
        # the redundant set(list(dict.fromkeys(...))) — set() dedupes.
        arr = [normalize(x) for x in readlines(filename) if len(x.strip()) > 0]
        self.__middle_names = set(arr)
    return self.__middle_names
def add_given_name(self, word):
    """Add *word* to the given-name vocabulary (in memory and names.txt).

    Args:
        word: raw given name; normalized before the membership test.
    """
    # add to vocabulary_build:
    x = normalize(word)
    # Bug fix: the original called get_vn_dict(), which populates
    # self.__vn_dict but leaves self.__given_names as None, so the
    # membership test below raised TypeError on first use.  Ensure the
    # given-name set itself is loaded.
    self.get_given_name()
    if x not in self.__given_names:
        filename = get_data_file("names.txt", folder="vocals")
        self.write_append(word=word, filename=filename)
        self.__given_names.add(x)
def add_hard_dict(self, word):
    """Add *word* to the hard dictionary (in memory and hard_dict.txt).

    Args:
        word: raw phrase; normalized before the membership test.
    """
    # add to vocabulary_build:
    x = normalize(word)
    self.get_hard_dict()
    # Bug fix: get_hard_dict() builds a dict keyed by the first two
    # syllables, mapping to {"max_len": int, "data": set}.  The original
    # tested the full phrase against those prefix keys and then called
    # .add() on the dict itself (AttributeError).  Update the grouped
    # structure the same way get_hard_dict() builds it.
    sl = x.split()
    key = " ".join(sl[0:2])
    entry = self.__hard_dict.get(key)
    if entry is None or x not in entry["data"]:
        filename = get_data_file("hard_dict.txt", folder="vietnamese")
        self.write_append(word=word, filename=filename)
        if entry is None:
            entry = dict(max_len=len(sl), data=set())
            self.__hard_dict[key] = entry
        entry["data"].add(x)
        if len(sl) > entry["max_len"]:
            entry["max_len"] = len(sl)
def get_short_word(self) -> set:
    """Return the cached set of normalized short words, loading it on first use."""
    if self.__short_word is None:
        filename = get_data_file("short_words.txt", folder="vietnamese")
        logging.info("%s loaded" % filename)
        words = (normalize(line) for line in readlines(filename) if len(line.strip()) > 0)
        self.__short_word = set(words)
    return self.__short_word
def add_custom_dict_vn(self, word):
    """Append *word* to the custom VN vocabulary if it is not already known."""
    normalized = normalize(word)
    self.get_vn_dict()  # make sure the vocabulary set is populated
    if normalized in self.__vn_dict:
        return
    target = get_data_file("vocabulary_build.txt", folder="vocals")
    self.write_append(word=word, filename=target)
    self.__vn_dict.add(normalized)
def get_family_names(self) -> set:
    """Lazily load and cache the set of normalized Vietnamese family names.

    Returns:
        set: normalized family names read from family_names.txt.
    """
    if self.__family_names is None:
        filename = get_data_file("family_names.txt", folder="vietnamese")
        # Consistency fix: every sibling loader logs the file it loads;
        # this one silently didn't.
        logging.info("%s loaded" % filename)
        arr = [normalize(x) for x in readlines(filename) if len(x.strip()) > 0]
        self.__family_names = set(arr)
    return self.__family_names
def _load(self):
    """Load the serialized entity-name dictionary and unpack its fields.

    Raises:
        FileNotFoundError: if the binary dictionary file is missing.
    """
    logging.info("{}.load()".format(self.__class__.__name__))
    filename = get_data_file("entity_named.dic.bin", folder="vocabulary")
    if not path.isfile(filename):
        # Fix: exit() from library code kills the host process; raise a
        # catchable exception instead.
        logging.error("Please check {}".format(filename))
        raise FileNotFoundError(filename)
    dct = load_from_file(filename)
    # logging.info("DictionaryLoader load (path = %s)" % filename)
    self.__root = dct.get("root")
    self.__start_sylls = dct.get("start_sylls")
    self.__length = dct.get("length")
    pros_map = dct.get("pros_map")
    # Invert pros_map: value -> key (assumes values are unique — TODO confirm).
    self.__pros_map = {pros_map.get(K): K for K in pros_map}
    logging.info("{}.done(length={},map={})".format(self.__class__.__name__, self.__length, self.__pros_map))
def get_given_name(self) -> set:
    """Lazily load given names from names.txt and company.txt into one cached set."""
    if self.__given_names is None:
        collected = []
        for source in ("names.txt", "company.txt"):
            filename = get_data_file(source, folder="vocals")
            logging.info("%s loaded" % filename)
            collected.extend(
                normalize(line)
                for line in readlines(filename)
                if len(line.strip()) > 0
            )
        self.__given_names = set(collected)
    return self.__given_names
def __load(self):
    """Reset every cached lexicon attribute, then repopulate them from the
    serialized word-tokenizer dictionary (word_tokenlizer.bin).

    Calls exit() if the binary file is missing.
    """
    # logging.info(">> set(initialize)")
    # reset — clear all lazily-loaded lexicons before repopulating
    self.__vn_dict = None
    self.__short_word = None
    self.__location = None
    self.__location_lv_3 = None
    self.__vn_dict_ugram = None
    self.__first_sent_word = None
    self.__family_names = None
    self.__middle_names = None
    self.__max_ugram = 4
    self.__given_names = None
    self.__hard_dict = None
    #
    filename = get_data_file("word_tokenlizer.bin", folder="vocabulary")
    if not path.isfile(filename):
        # NOTE(review): exit() here kills the whole process; consider raising instead.
        logging.error("%s not found. please check data..." % filename)
        exit()
    dct = load_from_file(filename)
    #
    # self.__hard_dict = dct.get("hard_dict")
    # logging.info("Size of (self.__hard_dict): %s" % len(self.__hard_dict))
    #
    # "vn_dict_ugram" is stored as a pair: (max n-gram length, n-gram dict)
    max_ugram, vn_dict_ugram = dct.get("vn_dict_ugram")
    self.__max_ugram = max_ugram
    self.__vn_dict_ugram = vn_dict_ugram
    logging.info("Size of (self.__vn_dict_ugram): %s" % len(self.__vn_dict_ugram))
    #
    self.__vn_dict = dct.get("vn_dict")
    logging.info("Size of (self.__vn_dict): %s" % len(self.__vn_dict))
    #
    self.__given_names = dct.get("given_names")
    logging.info("Size of (self.__given_names): %s" % len(self.__given_names))
    #
    self.__geo = dct.get("geo")
    logging.info("Size of (self.__geo): %s" % len(self.__geo))
    #
    self.__location = dct.get("location")
    logging.info("Size of (self.__location): %s" % len(self.__location))
    #
    # "tên riêng" = proper names; stored under the "name_vn" key
    self.__tên_riêng = dct.get("name_vn")
    logging.info("Size of (self.__tên_riêng): %s" % len(self.__tên_riêng))
def get_vn_dict(self) -> set:
    """Lazily load the main Vietnamese vocabulary from several word lists."""
    if self.__vn_dict is None:
        sources = (
            "vocabulary_standard.txt",
            "vocabulary_build.txt",
            "vocabulary.txt",
            "animal.txt",
        )
        vocab = set()
        for name in sources:
            filename = get_data_file(name, folder="vocals")
            logging.info("%s loaded" % filename)
            vocab.update(
                normalize(line)
                for line in readlines(filename)
                if len(line.strip()) > 0
            )
        self.__vn_dict = vocab
    return self.__vn_dict
def get_hard_dict(self):
    """Lazily build the "hard" dictionary: phrases grouped by their first
    two syllables, each group recording the longest phrase length seen.

    Returns:
        dict: prefix -> {"max_len": int, "data": set of full phrases}.
    """
    if self.__hard_dict is None:
        filename = get_data_file("hard_dict.txt", folder="vietnamese")
        logging.info("%s loaded" % filename)
        groups = {}   # two-syllable prefix -> set of full phrases
        longest = {}  # two-syllable prefix -> max syllable count
        for line in readlines(filename):
            if len(line.strip()) == 0:
                continue
            phrase = normalize(line)
            syllables = phrase.split()
            prefix = " ".join(syllables[:2])
            if prefix not in groups:
                groups[prefix] = set()
                longest[prefix] = 0
            groups[prefix].add(phrase)
            if len(syllables) > longest[prefix]:
                longest[prefix] = len(syllables)
        self.__hard_dict = {
            prefix: dict(max_len=longest[prefix], data=groups[prefix])
            for prefix in groups
        }
    return self.__hard_dict
def get_phó_từ():
    """Load the adverb ("phó từ") dictionary."""
    return data_loader(get_data_file("photu.txt", folder="vietnamese/dictionary"))
def get_tu_tieng_nuoc_ngoai_thong_dung():
    """Load the list of common foreign-origin words."""
    source = get_data_file("tu_tieng_nuoc_ngoai_thong_dung.txt", folder="vietnamese")
    return data_loader(source)
def get_tu_don_chi_nam_trong_tu_ghep():
    """Load single syllables that only occur inside compound words."""
    source = get_data_file("tu_don_chi_nam_trong_tu_ghep.txt", folder="vietnamese")
    return data_loader(source)
def get_english_words():
    """Return the English word list, each entry stripped and lower-cased."""
    filename = get_data_file("english.txt")
    return [line.strip().lower() for line in readlines(filename)]
def get_stop_words():
    """Load the Vietnamese stop-word list."""
    return data_loader(get_data_file("stopwords.txt", folder="vietnamese"))
def get_âm_tiết_việt_nam():
    """Return all Vietnamese syllables, each stripped and lower-cased."""
    filename = get_data_file("vietnamese-syllables.txt", folder="vietnamese")
    return [line.strip().lower() for line in readlines(filename)]
def reload_rdr(self):
    """Rebuild the SCRDR tree from the named-entity RDR rule file."""
    rule_file = get_data_file("named.rdr", folder="vietnamese")
    tree = SCRDRTree()
    tree.constructSCRDRtreeFromRDRfile(rule_file)
    self.__root = tree
def get_tính_từ():
    """Load the adjective ("tính từ") dictionary."""
    return data_loader(get_data_file("tinhtu.txt", folder="vietnamese/dictionary"))
def get_âm_tiết_đặt_tên_thông_dụng():
    """Load syllables commonly used in Vietnamese personal names."""
    return data_loader(get_data_file("syll_names.txt", folder="vietnamese"))
def get_đại_từ_nhân_xưng():
    """Load the personal-pronoun dictionary."""
    return data_loader(get_data_file("danhtunhanxung.txt", folder="vietnamese/dictionary"))
def get_giới_từ():
    """Load the preposition ("giới từ") dictionary."""
    return data_loader(get_data_file("gioi_tu.txt", folder="vietnamese/dictionary"))
def get_liên_từ():
    """Load the conjunction ("liên từ") dictionary."""
    return data_loader(get_data_file("lientu.txt", folder="vietnamese/dictionary"))
def reload(self):
    """Rebuild the SCRDR tree from the word-tokenizer RDR rule file."""
    rule_file = get_data_file("word_tokenlize.rdr", folder="vietnamese")
    tree = SCRDRTree()
    tree.constructSCRDRtreeFromRDRfile(rule_file)
    self.__root = tree
def get_động_từ():
    """Load the verb ("động từ") dictionary."""
    return data_loader(get_data_file("dongtu.txt", folder="vietnamese/dictionary"))