def calculate_acc_on_testfile(file_path):
    model = AnalysisScorerModel.create_from_existed_model(
        model_name="lookup_disambiguator_wo_suffix")
    test_data = data_generator(file_path, add_gold_labels=True, case_sensitive=True)
    corrects = 0
    total = 0
    with open("data/incorrect_analyzes.csv", "w", encoding="UTF-8") as f:
        f.write("Surface\tGold\tPredicted\n")
        for sentence in test_data:
            predicted_indexes = model.predict_indices(sentence)
            for word, selected_index in zip(sentence, predicted_indexes):
                gold_analysis = word.roots[0] + "+" + "+".join(word.tags[0])
                gold_analysis = gold_analysis.replace("+DB", "^DB")
                selected_analysis = word.roots[selected_index] + "+" + "+".join(word.tags[selected_index])
                selected_analysis = selected_analysis.replace("+DB", "^DB")
                if to_lower(selected_analysis) == to_lower(gold_analysis):
                    corrects += 1
                else:
                    f.write("{}\t{}\t{}\n".format(word.surface_word, gold_analysis,
                                                  selected_analysis))
                total += 1
    print("Accuracy: {}".format(corrects * 1.0 / total))
def parse_answers(self, offset: int, lst: list, cnt: int):
    for _ in range(cnt):
        a_name, offset = self.get_name(self.data, offset)
        a_type, a_class, a_ttl, a_data_len = unpack(
            "!HHIH", self.data[offset: offset + 10])
        a_data, offset = self.parse_a_data(
            a_data_len, a_type, self.data, offset + 10)
        if a_type == DnsMessage.NS:
            a_data = to_lower(a_data)
        answer = DnsAnswer(to_lower(a_name), a_type, a_class, a_ttl,
                           a_data_len, a_data)
        lst.append(answer)
    return offset
def __init__(self, data: bytes):
    lst = unpack("!HHHHHH", data[:12])
    # Parenthesize the mask explicitly: in Python ">>" binds tighter than "&",
    # so without parentheses the flags word would be masked with the shifted constant.
    self.isResponse = (lst[1] & int('1000000000000000', 2)) >> 15
    self.recursion = (lst[1] & int('0000001000000000', 2)) >> 9
    self.id = lst[0]
    self.question_count = lst[2]
    self.answer_count = lst[3]
    self.authority_count = lst[4]
    self.additional_count = lst[5]
    self.questions = []
    self.answers = []
    self.authority = []
    self.additional = []
    self.data = copy.copy(data)
    self.questions_offset = 12
    offset = 12
    for _ in range(self.question_count):
        q_name, offset = self.get_name(self.data, offset)
        self.questions_name_end = offset
        q_type, q_class = unpack("!HH", self.data[offset: offset + 4])
        question = DnsQuery(to_lower(q_name), q_type, q_class)
        self.questions.append(question)
        offset += 4
    self.answers_offset = offset
    self.authority_offset = self.parse_answers(
        offset, self.answers, self.answer_count)
    self.additional_offset = self.parse_answers(
        self.authority_offset, self.authority, self.authority_count)
    self.parse_answers(self.additional_offset, self.additional,
                       self.additional_count)
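# Illustrative sketch (not part of the original module): the 12-byte DNS header
# layout and how individual flag bits are pulled out of the flags word, matching
# the masking done in the constructor above. The sample values are made up; only
# struct from the standard library is used.
from struct import pack, unpack

header = pack("!HHHHHH", 0x1234, 0x8180, 1, 1, 0, 0)  # id, flags, qd, an, ns, ar
_id, flags, qd, an, ns, ar = unpack("!HHHHHH", header)
qr = (flags & 0x8000) >> 15    # bit 15: 1 -> this message is a response
bit9 = (flags & 0x0200) >> 9   # the bit the constructor reads as "recursion" (RD itself is usually bit 8, mask 0x0100)
print(qr, bit9, qd, an)        # 1 0 1 1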
def predict(self, tokens):
    sentence = []
    for token in tokens:
        token = to_lower(token)
        candidate_analyzes = self.candidate_generator.get_analysis_candidates(token)
        roots = []
        tags = []
        for analysis in candidate_analyzes:
            roots.append(analysis[0])
            tags.append(analysis[2])
        sentence.append(WordStruct(token, roots, [], tags, 0))
    selected_indices = self.predict_indices(sentence)
    res = []
    for i, j in enumerate(selected_indices):
        if "Prop" in sentence[i].tags[j]:
            sentence[i].roots[j] = capitalize(sentence[i].roots[j])
        if sentence[i].tags[j] == "Unknown":
            selected_analysis = sentence[i].roots[j] + "+" + sentence[i].tags[j]
        else:
            selected_analysis = sentence[i].roots[j] + "+" + "+".join(sentence[i].tags[j])
        selected_analysis = selected_analysis.replace("+DB", "^DB")
        res.append(selected_analysis)
    return res
def pre_process(df_data, args):
    # Drop specific columns from the analysis.
    df_data = utils.reduce_data(df_data)
    # Get the columns in the data frame.
    columns = list(df_data.columns.values)
    list_categories = []
    for column in columns:
        print("processing variable: ", column)
        # Get the levels of the column and the frequency of each level.
        levels = df_data[column].value_counts().index.tolist()
        frecuency = df_data[column].value_counts().values.tolist()
        # Check whether the column contains binary data such as "si"/"no". Returns a boolean.
        binar = utils.check_binar(
            utils.to_lower(df_data[column].value_counts().index.tolist()))
        # Reduce levels by frequency using the tol argument:
        # a level is ignored if frequency / total_rows does not meet the criterion.
        levels, frecuency = utils.reduce_levels(levels, frecuency,
                                                df_data.shape[0], tol=args.tol)
        levels = list(set(levels))
        # Process every column either as binary or as a set of levels.
        clean_column(df_data, str(column), levels, binar)
        print()
    return df_data
def suffix_transform_single(cls, candidate_suffix):
    candidate_suffix = to_lower(candidate_suffix)
    candidate_suffix = cls.SUFFIX_TRANSFORMATION_REGEX1.sub(
        "A", candidate_suffix)
    candidate_suffix = cls.SUFFIX_TRANSFORMATION_REGEX2.sub(
        "H", candidate_suffix)
    return candidate_suffix
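# A minimal sketch of what the two class regexes are assumed to do: collapse
# surface vowels into the usual Turkish metavowels, "A" for the low vowels a/e
# and "H" for the high vowels ı/i/u/ü. The patterns below are assumptions for
# illustration, not the actual SUFFIX_TRANSFORMATION_REGEX1/2 definitions.
import re

SUFFIX_TRANSFORMATION_REGEX1 = re.compile(r"[ae]")    # assumed pattern
SUFFIX_TRANSFORMATION_REGEX2 = re.compile(r"[ıiuü]")  # assumed pattern

print(SUFFIX_TRANSFORMATION_REGEX2.sub(
    "H", SUFFIX_TRANSFORMATION_REGEX1.sub("A", "lerinde")))  # -> "lArHndA"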
def get_stem_suffix_candidates(self, surface_word):
    candidate_roots = []
    candidate_suffixes = []
    for i in range(1, len(surface_word)):
        candidate_root = surface_word[:i]
        candidate_suffix = surface_word[i:]
        if not self.case_sensitive:
            candidate_root = to_lower(candidate_root)
            candidate_suffix = to_lower(candidate_suffix)
            self._add_candidate_stem_suffix(candidate_root, candidate_suffix,
                                            candidate_roots, candidate_suffixes)
        else:
            candidate_suffix = to_lower(candidate_suffix)
            self._add_candidate_stem_suffix(to_lower(candidate_root), candidate_suffix,
                                            candidate_roots, candidate_suffixes)
            if TurkishStemSuffixCandidateGenerator.STARTS_WITH_UPPER.match(candidate_root):
                self._add_candidate_stem_suffix(capitalize(candidate_root), candidate_suffix,
                                                candidate_roots, candidate_suffixes)
    candidate_suffixes.append("")
    candidate_roots.append(to_lower(surface_word))
    if self.case_sensitive and TurkishStemSuffixCandidateGenerator.STARTS_WITH_UPPER.match(surface_word):
        candidate_suffixes.append("")
        candidate_roots.append(capitalize(surface_word))
    assert len(candidate_roots) == len(candidate_suffixes)
    TurkishStemSuffixCandidateGenerator._root_transform(candidate_roots)
    if self.asciification:
        candidate_roots = [asciify(candidate_root) for candidate_root in candidate_roots]
        candidate_suffixes = [asciify(candidate_suffix) for candidate_suffix in candidate_suffixes]
    if self.suffix_normalization:
        TurkishStemSuffixCandidateGenerator.suffix_transform(candidate_suffixes)
    return candidate_roots, candidate_suffixes
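# Sketch of the split enumeration above, stripped of the phonological variants:
# every cut point of the surface word yields one (root, suffix) candidate pair,
# plus the whole word with an empty suffix. The example word is chosen purely
# for illustration.
word = "evlerinde"
pairs = [(word[:i], word[i:]) for i in range(1, len(word))] + [(word, "")]
print(pairs[:3])  # [('e', 'vlerinde'), ('ev', 'lerinde'), ('evl', 'erinde')]
print(pairs[-1])  # ('evlerinde', '')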
def evaluate_candidate_generation(file_path, max_words=0, case_sensitive=True):
    candidate_generator = TurkishStemSuffixCandidateGenerator(
        case_sensitive=case_sensitive)
    result = []
    with open(file_path, "r", encoding="UTF-8") as f:
        for i, line in enumerate(f):
            if i % 1000 == 0:
                print("Line: {}".format(i))
            if 0 < max_words < i:
                break
            trimmed_line = line.strip(" \r\n\t")
            if trimmed_line.startswith("<"):
                continue
            else:
                parses = re.split(r"[\t ]", trimmed_line)
                surface = parses[0]
                candidates = candidate_generator.get_analysis_candidates(surface)
                roots = []
                suffixes = []
                tags = []
                analyzes = parses[1:]
                gold_root = get_root_from_analysis(analyzes[0])
                if not case_sensitive:
                    gold_root = to_lower(gold_root)
                gold_tag = convert_tag_list_to_str(
                    standardize_tags(get_tags_from_analysis(analyzes[0])))
                does_contain = False
                for candidate_root, candidate_suffix, candidate_tag in candidates:
                    roots.append(candidate_root)
                    suffixes.append(candidate_suffix)
                    tags.append(convert_tag_list_to_str(candidate_tag))
                    if candidate_root == gold_root \
                            and convert_tag_list_to_str(candidate_tag) == gold_tag:
                        does_contain = True
                if not does_contain:
                    if gold_root in roots:
                        correct_root_candidate = gold_root
                    else:
                        correct_root_candidate = "Not Exist"
                    if gold_root in roots:
                        found_analyzes = "\n".join(tags)
                    else:
                        found_analyzes = ""
                    result.append({
                        "Surface Word": surface,
                        "Gold root": gold_root,
                        "Gold Tags": gold_tag,
                        "Selected root candidate": correct_root_candidate,
                        "Found Tag Sequences": found_analyzes
                    })
    df = pd.DataFrame(result, index=None, columns=[
        "Surface Word", "Gold root", "Gold Tags",
        "Selected root candidate", "Found Tag Sequences"
    ])
    df.to_excel("Candidate Generation Error Analysis.xlsx")
def read_stem_list(self):
    with open(TurkishStemSuffixCandidateGenerator.STEM_LIST_FILE_PATH,
              "r", encoding="UTF-8") as f:
        for line in f:
            splits = line.strip().split("\t")
            stem = splits[0]
            if not self.case_sensitive:
                stem = to_lower(stem)
            flag = int(splits[1].strip())
            postags = TurkishStemSuffixCandidateGenerator._parse_flag(flag)
            if stem in self.stem_dic:
                self.stem_dic[stem] = list(set(list(postags) + self.stem_dic[stem]))
            else:
                self.stem_dic[stem] = postags
def process_column(list_data, column_name, args):
    categories = set(list_data)
    categories = list(utils.to_lower(list(categories)))
    print("low categories", categories)
    binar = utils.check_binar(categories)
    if binar:
        print("binar category")
        binar_data = simple_binarization(list_data, categories, column_name)
    else:
        print("multivariate category")
        binar_data = binar_column(list_data, categories, column_name)
    return binar_data
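# Hedged sketch of the kind of encoding process_column delegates to: a two-level
# ("binar") column can be mapped to a single 0/1 vector, anything else to one
# column per level. pandas.get_dummies is used here purely for illustration; it
# is not necessarily what simple_binarization / binar_column do internally.
import pandas as pd

values = ["si", "no", "si", "si"]
binary_vector = [1 if v == "si" else 0 for v in values]                   # two-level case
one_hot = pd.get_dummies(pd.Series(["a", "b", "c", "a"]), prefix="col")   # multi-level case
print(binary_vector)           # [1, 0, 1, 1]
print(list(one_hot.columns))   # ['col_a', 'col_b', 'col_c']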
def create_json_categories(df_data, path):
    # Get the columns in the data frame.
    columns = list(df_data.columns.values)
    list_categories = []
    for column in columns:
        # Get the levels and frequencies per column.
        levels = df_data[column].value_counts().index.tolist()
        frecuencies = df_data[column].value_counts().values.tolist()
        # Check whether the column is a binary category.
        binar = utils.check_binar(utils.to_lower(df_data[column].value_counts().index.tolist()))
        # Create a dict of levels and categories.
        dict_levels = create_column_json(levels, frecuencies, column, binar)
        # Add every item in the dict to a list.
        append_data_dict(list_categories, dict_levels)
    # print("list categories", list_categories)
    with open(path + "/Data/categories.json", 'w') as f:
        json.dump(list_categories, f)
def __init__(self, string, opinions):
    self.string = string
    self.opinions = opinions
    self.tokens = utils.filter_symbol(utils.to_lower(tokenize(string)))
    self.ids = None
def get_analysis_candidates(self, surface_word):
    if to_lower(surface_word) in self.exact_lookup_table:
        cur_candidate_analyzes = []
        analyzes = self.exact_lookup_table[to_lower(surface_word)]
        for analysis in analyzes:
            suffix = analysis.split("/")[0]
            root = TurkishStemSuffixCandidateGenerator.TAG_SEPARATOR_REGEX.split(
                analysis.split("/")[1])[0]
            tags = TurkishStemSuffixCandidateGenerator.TAG_SEPARATOR_REGEX.split(
                analysis.split("/")[1])[1:]
            cur_candidate_analyzes.append((root, suffix, tags))
        return cur_candidate_analyzes
    candidate_analyzes = []
    candidate_analyzes_str = []
    candidate_roots, candidate_suffixes = self.get_stem_suffix_candidates(surface_word)
    for candidate_root, candidate_suffix in zip(candidate_roots, candidate_suffixes):
        if TurkishStemSuffixCandidateGenerator.NON_WORD_REGEX.match(candidate_root):
            if TurkishStemSuffixCandidateGenerator.CONTAINS_NUMBER_REGEX.match(candidate_root):
                stem_tags = ["Num", "Noun+Time"]
            else:
                stem_tags = ["Punc"]
        elif len(candidate_suffix) == 0 and candidate_root not in self.stem_dic:
            # stem_tags = ["Noun", "Noun+Prop"]
            continue
        elif candidate_root not in self.stem_dic:
            if "'" in candidate_suffix and candidate_suffix in self.suffix_dic:
                stem_tags = ["Noun+Prop"]
            else:
                continue
        else:
            stem_tags = self.stem_dic[candidate_root]
            if not TurkishStemSuffixCandidateGenerator.STARTS_WITH_UPPER.match(candidate_root) \
                    and "Noun+Prop" in stem_tags:
                stem_tags.remove("Noun+Prop")
            elif TurkishStemSuffixCandidateGenerator.STARTS_WITH_UPPER.match(candidate_root) \
                    and "Noun+Prop" in stem_tags:
                stem_tags = ["Noun+Prop"]
            elif candidate_suffix.startswith("'") and candidate_suffix in self.suffix_dic \
                    and "Noun+Prop" in stem_tags:
                stem_tags = ["Noun+Prop"]
            elif TurkishStemSuffixCandidateGenerator.STARTS_WITH_UPPER.match(candidate_root):
                continue
        candidate_tags = self.get_tags(candidate_suffix, stem_tags)
        cur_candidate_analyzes = []
        for candidate_tag in candidate_tags:
            if to_lower(candidate_root) + "+" + "+".join(candidate_tag).replace("+DB", "^DB") \
                    not in candidate_analyzes_str:
                cur_candidate_analyzes.append(
                    (to_lower(candidate_root), candidate_suffix, candidate_tag))
                candidate_analyzes_str.append(
                    to_lower(candidate_root) + "+" + "+".join(candidate_tag).replace("+DB", "^DB"))
        candidate_analyzes += cur_candidate_analyzes
    if len(candidate_analyzes) == 0:
        candidate_analyzes.append((to_lower(surface_word), "", "Unknown"))
    return candidate_analyzes
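# The return value is a list of (root, suffix, tag_sequence) triples; when nothing
# matches, a single (lowercased word, "", "Unknown") fallback is returned instead.
# The concrete values below are hypothetical, shown only to illustrate the shape
# that predict() and load_data() consume.
candidates = [("ev", "lerinde", ["Noun", "A3pl", "P3sg", "Loc"])]
root, suffix, tags = candidates[0]
print(root + "+" + "+".join(tags))  # ev+Noun+A3pl+P3sg+Loc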
def load_data(file_path, max_sentence=0, add_gold_labels=True, case_sensitive=False):
    sentences = []
    sentence = []
    candidate_generator = TurkishStemSuffixCandidateGenerator(
        case_sensitive=case_sensitive)
    with open(file_path, "r", encoding="UTF-8") as f:
        for i, line in enumerate(f):
            if 0 < max_sentence < i:
                break
            trimmed_line = line.strip(" \r\n\t")
            trimmed_line = trimmed_line.replace("s", "s")
            if trimmed_line.startswith("<S>") or trimmed_line.startswith("<s>"):
                sentence = []
            elif trimmed_line.startswith("</S>") or trimmed_line.startswith("</s>"):
                if len(sentence) > 0:
                    sentences.append(sentence)
            elif len(trimmed_line) == 0 or "<DOC>" in trimmed_line \
                    or trimmed_line.startswith("</DOC>") \
                    or trimmed_line.startswith("<TITLE>") \
                    or trimmed_line.startswith("</TITLE>"):
                pass
            else:
                parses = re.split(r"[\t ]", trimmed_line)
                surface = parses[0]
                candidates = candidate_generator.get_analysis_candidates(surface)
                roots = []
                suffixes = []
                tags = []
                ambiguity_level = 0
                if add_gold_labels:
                    analyzes = parses[1:]
                    ambiguity_level = len(analyzes)
                    gold_root = get_root_from_analysis(analyzes[0])
                    gold_root = to_lower(gold_root)
                    roots.append(gold_root)
                    gold_suffix = surface[len(gold_root):]
                    if not case_sensitive:
                        gold_suffix = to_lower(gold_suffix)
                    suffixes.append(gold_suffix)
                    gold_tag = standardize_tags(get_tags_from_analysis(analyzes[0]))
                    tags.append(gold_tag)
                    for candidate_root, candidate_suffix, candidate_tag in candidates:
                        if to_lower(candidate_root) != to_lower(gold_root) \
                                or "".join(candidate_tag) != "".join(gold_tag):
                            roots.append(to_lower(candidate_root))
                            suffixes.append(candidate_suffix)
                            tags.append(candidate_tag)
                        elif candidate_suffix != gold_suffix and candidate_root == gold_root:
                            suffixes[0] = candidate_suffix
                else:
                    for candidate_root, candidate_suffix, candidate_tag in candidates:
                        roots.append(candidate_root)
                        suffixes.append(candidate_suffix)
                        tags.append(candidate_tag)
                if len(roots) == 0:
                    if TurkishStemSuffixCandidateGenerator.STARTS_WITH_UPPER.match(surface):
                        candidate_tags = candidate_generator.get_tags(
                            "", stem_tags=["Noun", "Noun+Prop"])
                    else:
                        candidate_tags = candidate_generator.get_tags(
                            "", stem_tags=["Noun"])
                    for candidate_tag in candidate_tags:
                        if "Prop" in candidate_tag:
                            roots.append(capitalize(surface))
                            suffixes.append("")
                            tags.append(candidate_tag)
                        else:
                            roots.append(to_lower(surface))
                            suffixes.append("")
                            tags.append(candidate_tag)
                if not case_sensitive:
                    surface = to_lower(surface)
                    roots = [to_lower(root) for root in roots]
                    suffixes = [to_lower(suffix) for suffix in suffixes]
                current_word = WordStruct(surface, roots, suffixes, tags, ambiguity_level)
                sentence.append(current_word)
    return sentences
def to_lower(token):
    """
    Token to lower case.
    """
    return utils.to_lower(token)
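# The wrapped utils.to_lower is not shown here. For Turkish text a plain
# str.lower() is not enough because of the dotted/dotless I pair; the sketch
# below shows a Turkish-aware lowercasing step. It is an assumption for
# illustration, not the project's actual implementation.
def turkish_to_lower(text):
    # Map 'I' -> 'ı' and 'İ' -> 'i' before the generic lowercasing.
    return text.replace("I", "ı").replace("İ", "i").lower()

print(turkish_to_lower("ISPARTA İstanbul"))  # -> "ısparta istanbul"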
def _add_candidate_stem_suffix(stem_candidate, suffix_candidate,
                               candidate_roots, candidate_suffixes):
    if "'" in suffix_candidate:
        candidate_roots.append(stem_candidate)
        candidate_suffixes.append(suffix_candidate)
        return
    # Bana, Sana -> ben, sen
    if stem_candidate == "ban" and suffix_candidate == "a":
        candidate_roots.append("ben")
        candidate_suffixes.append("a")
    elif stem_candidate == "Ban" and suffix_candidate == "a":
        candidate_roots.append("Ben")
        candidate_suffixes.append("a")
    elif stem_candidate == "san" and suffix_candidate == "a":
        candidate_roots.append("sen")
        candidate_suffixes.append("a")
    elif stem_candidate == "San" and suffix_candidate == "a":
        candidate_roots.append("Sen")
        candidate_suffixes.append("a")
    else:
        candidate_roots.append(stem_candidate)
        candidate_suffixes.append(suffix_candidate)
        if len(stem_candidate) > 2 and len(suffix_candidate) > 0 \
                and stem_candidate[-1] == suffix_candidate[0] \
                and stem_candidate[-1] in TurkishStemSuffixCandidateGenerator.CONSONANT_STR:
            # CONSONANT DERIVATION
            # his -i > hissi, hak -ı > hakkı, red -i > reddi
            candidate_roots.append(stem_candidate)
            candidate_suffixes.append(suffix_candidate[1:])
        elif len(stem_candidate) > 1 and \
                TurkishStemSuffixCandidateGenerator.ENDS_NARROW_REGEX.match(stem_candidate) and \
                "yor" in suffix_candidate:
            # bekle -yor > bekliyor, atla -yor > atlıyor
            if stem_candidate.endswith("i") or stem_candidate.endswith("ü"):
                candidate_roots.append(stem_candidate[:-1] + "e")
                candidate_suffixes.append(suffix_candidate)
            elif stem_candidate.endswith("ı") or stem_candidate.endswith("u"):
                candidate_roots.append(stem_candidate[:-1] + "a")
                candidate_suffixes.append(suffix_candidate)
        if len(stem_candidate) > 2 and \
                TurkishStemSuffixCandidateGenerator.ENDS_TWO_CONSONANT_REG.match(stem_candidate) and \
                TurkishStemSuffixCandidateGenerator.STARTS_VOWEL_REGEX.match(suffix_candidate):
            # VOWEL DROP
            # ağız -ım > ağzım, alın -ın -a > alnına
            # burun -u > burnu, bağır -ım > bağrım, beyin -i > beyni
            suffix_start_letter = to_lower(suffix_candidate[0])
            if suffix_start_letter in ["u", "ü", "ı", "i"]:
                candidate_roots.append(stem_candidate[:-1] + suffix_start_letter + stem_candidate[-1])
                candidate_suffixes.append(suffix_candidate)
            elif suffix_start_letter == "e":
                candidate_roots.append(stem_candidate[:-1] + "i" + stem_candidate[-1])
                candidate_suffixes.append(suffix_candidate)
                candidate_roots.append(stem_candidate[:-1] + "ü" + stem_candidate[-1])
                candidate_suffixes.append(suffix_candidate)
            elif suffix_start_letter == "a":
                candidate_roots.append(stem_candidate[:-1] + "ı" + stem_candidate[-1])
                candidate_suffixes.append(suffix_candidate)
                candidate_roots.append(stem_candidate[:-1] + "u" + stem_candidate[-1])
                candidate_suffixes.append(suffix_candidate)
        if len(stem_candidate) > 2 and \
                TurkishStemSuffixCandidateGenerator.ENDS_WITH_SOFT_CONSONANTS_REGEX.match(stem_candidate):
            # Softening of consonants
            candidate_roots.append(
                TurkishStemSuffixCandidateGenerator._transform_soft_consonants(stem_candidate))
            candidate_suffixes.append(suffix_candidate)
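# Quick illustration of two of the phonological cases handled above, using plain
# string operations on made-up example words: consonant doubling ("his" + "si",
# as in "hissi") and vowel drop ("burn" + "u", as in "burnu" -> candidate root "burun").
stem, suffix = "his", "si"
if stem[-1] == suffix[0]:
    print(stem, suffix[1:])                          # his i
stem, suffix = "burn", "u"
print(stem[:-1] + suffix[0] + stem[-1], suffix)      # burun u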
def __clean_firstname(self, token):
    """
    First name cleaning. Remove accents, convert to lower case.
    """
    return utils.to_lower(utils.substitute_accents(token))