Example #1
def calculate_acc_on_testfile(file_path):
    model = AnalysisScorerModel.create_from_existed_model(
        model_name="lookup_disambiguator_wo_suffix")
    test_data = data_generator(file_path,
                               add_gold_labels=True,
                               case_sensitive=True)
    corrects = 0
    total = 0
    with open("data/incorrect_analyzes.csv", "w", encoding="UTF-8") as f:
        f.write("Surface\tGold\tPredicted\n")
        for sentence in test_data:
            predicted_indexes = model.predict_indices(sentence)
            for word, selected_index in zip(sentence, predicted_indexes):
                gold_analysis = word.roots[0] + "+" + "+".join(word.tags[0])
                gold_analysis = gold_analysis.replace("+DB", "^DB")
                selected_analysis = word.roots[selected_index] + "+" + \
                    "+".join(word.tags[selected_index])
                selected_analysis = selected_analysis.replace("+DB", "^DB")
                if to_lower(selected_analysis) == to_lower(gold_analysis):
                    corrects += 1
                else:
                    f.write("{}\t{}\t{}\n".format(word.surface_word,
                                                  gold_analysis,
                                                  selected_analysis))
                total += 1
        print("Accuracy: {}".format(corrects * 1.0 / total))
Example #2
 def parse_answers(self, offset: int, lst: list, cnt: int):
     for _ in range(cnt):
         a_name, offset = self.get_name(self.data, offset)
         a_type, a_class, a_ttl, a_data_len = unpack(
             "!HHIH", self.data[offset: offset + 10])
         a_data, offset = self.parse_a_data(
             a_data_len, a_type, self.data, offset + 10)
         if a_type == DnsMessage.NS:
             a_data = to_lower(a_data)
         answer = DnsAnswer(to_lower(a_name), a_type,
                            a_class, a_ttl, a_data_len, a_data)
         lst.append(answer)
     return offset
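For context, the "!HHIH" format string mirrors the fixed part of a DNS resource record that follows the owner name: TYPE, CLASS, TTL and RDLENGTH (RFC 1035, section 4.1.3), ten bytes in network byte order. A minimal round-trip check:

from struct import pack, unpack

# NS record (type 2), IN class (1), one-hour TTL, 4-byte RDATA.
record = pack("!HHIH", 2, 1, 3600, 4)
assert len(record) == 10  # H + H + I + H = 2 + 2 + 4 + 2 bytes
assert unpack("!HHIH", record) == (2, 1, 3600, 4)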
Example #3
    def __init__(self, data: bytes):
        lst = unpack("!HHHHHH", data[:12])
        # Parenthesize the mask before shifting ('&' binds looser than '>>');
        # RD (recursion desired) is bit 8 of the flags word, not bit 9.
        self.isResponse = (lst[1] & int('1000000000000000', 2)) >> 15
        self.recursion = (lst[1] & int('0000000100000000', 2)) >> 8
        self.id = lst[0]
        self.question_count = lst[2]
        self.answer_count = lst[3]
        self.authority_count = lst[4]
        self.additional_count = lst[5]
        self.questions = []
        self.answers = []
        self.authority = []
        self.additional = []
        self.data = copy.copy(data)
        self.questions_offset = 12
        offset = 12
        for _ in range(self.question_count):
            q_name, offset = self.get_name(self.data, offset)
            self.questions_name_end = offset
            q_type, q_class = unpack("!HH", self.data[offset: offset + 4])
            question = DnsQuery(to_lower(q_name), q_type, q_class)
            self.questions.append(question)
            offset += 4

        self.answers_offset = offset
        self.authority_offset = self.parse_answers(
            offset, self.answers, self.answer_count)
        self.additional_offset = self.parse_answers(
            self.authority_offset, self.authority, self.authority_count)
        self.parse_answers(self.additional_offset,
                           self.additional, self.additional_count)
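A small self-check of the flag extraction above (bit positions per RFC 1035, section 4.1.1: QR is the top bit of the flags word, RD is bit 8):

flags = 0b1000000100000000  # QR=1 (response), RD=1 (recursion desired)
assert (flags & 0b1000000000000000) >> 15 == 1  # isResponse
assert (flags & 0b0000000100000000) >> 8 == 1   # recursion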
Example #4
 def predict(self, tokens):
     sentence = []
     for token in tokens:
         token = to_lower(token)
         candidate_analyzes = self.candidate_generator.get_analysis_candidates(
             token)
         roots = []
         tags = []
         for analysis in candidate_analyzes:
             roots.append(analysis[0])
             tags.append(analysis[2])
         sentence.append(WordStruct(token, roots, [], tags, 0))
     selected_indices = self.predict_indices(sentence)
     res = []
     for i, j in enumerate(selected_indices):
         if "Prop" in sentence[i].tags[j]:
             sentence[i].roots[j] = capitalize(sentence[i].roots[j])
         if sentence[i].tags[j] == "Unknown":
             selected_analysis = sentence[i].roots[j] + "+" + sentence[i].tags[j]
         else:
             selected_analysis = sentence[i].roots[j] + "+" + "+".join(
                 sentence[i].tags[j])
             selected_analysis = selected_analysis.replace("+DB", "^DB")
         res.append(selected_analysis)
     return res
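A hedged usage sketch (the model name is taken from Example #1; the printed analyses are schematic, not actual output):

# Assumes a trained disambiguator saved under this name, as in Example #1.
model = AnalysisScorerModel.create_from_existed_model(
    model_name="lookup_disambiguator_wo_suffix")
for analysis in model.predict(["Ali", "okula", "gitti", "."]):
    print(analysis)  # one analysis string per token, e.g. "Ali+Noun+Prop+..."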
Example #5
def pre_process(df_data, args):
    #drop specific columns from the analysis
    df_data = utils.reduce_data(df_data)

    #get the columns in data frame
    columns = list(df_data.columns.values)

    list_categories = []
    for column in columns:
        print("procesando variable: ", column)
        #get levels and frecuency of each one per column
        levels = df_data[column].value_counts().index.tolist()
        frecuency = df_data[column].value_counts().values.tolist()

        #check if the column contains binary data like "si" or "no"; returns a boolean
        binar = utils.check_binar(
            utils.to_lower(df_data[column].value_counts().index.tolist()))

        #reduce levels by frequency with the argument tol.
        #if a level's frequency/len_total_data doesn't meet the criterion, it is ignored
        levels, frecuency = utils.reduce_levels(levels,
                                                frecuency,
                                                df_data.shape[0],
                                                tol=args.tol)
        levels = list(set(levels))

        #process every column as either binary or with levels
        clean_column(df_data, str(column), levels, binar)
        print()

    return df_data
Example #6
 def suffix_transform_single(cls, candidate_suffix):
     candidate_suffix = to_lower(candidate_suffix)
     candidate_suffix = cls.SUFFIX_TRANSFORMATION_REGEX1.sub(
         "A", candidate_suffix)
     candidate_suffix = cls.SUFFIX_TRANSFORMATION_REGEX2.sub(
         "H", candidate_suffix)
     return candidate_suffix
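The two class regexes are not shown in this excerpt; a plausible reconstruction under the common Turkish meta-phoneme convention (an assumption, not the project's actual patterns: A stands for the low vowels a/e, H for the high vowels ı/i/u/ü):

import re

SUFFIX_TRANSFORMATION_REGEX1 = re.compile("[ae]")    # assumed: low vowels -> A
SUFFIX_TRANSFORMATION_REGEX2 = re.compile("[ıiuü]")  # assumed: high vowels -> H

def suffix_transform_single(candidate_suffix):
    candidate_suffix = candidate_suffix.lower()  # stand-in for to_lower
    candidate_suffix = SUFFIX_TRANSFORMATION_REGEX1.sub("A", candidate_suffix)
    return SUFFIX_TRANSFORMATION_REGEX2.sub("H", candidate_suffix)

assert suffix_transform_single("lar") == "lAr"
assert suffix_transform_single("un") == "Hn"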
Example #7
    def get_stem_suffix_candidates(self, surface_word):
        candidate_roots = []
        candidate_suffixes = []
        for i in range(1, len(surface_word)):
            candidate_root = surface_word[:i]
            candidate_suffix = surface_word[i:]
            if not self.case_sensitive:
                candidate_root = to_lower(candidate_root)
                candidate_suffix = to_lower(candidate_suffix)
                self._add_candidate_stem_suffix(candidate_root,
                                                candidate_suffix,
                                                candidate_roots,
                                                candidate_suffixes)
            else:
                candidate_suffix = to_lower(candidate_suffix)
                self._add_candidate_stem_suffix(to_lower(candidate_root),
                                                candidate_suffix,
                                                candidate_roots,
                                                candidate_suffixes)
                if TurkishStemSuffixCandidateGenerator.STARTS_WITH_UPPER.match(
                        candidate_root):
                    self._add_candidate_stem_suffix(capitalize(candidate_root),
                                                    candidate_suffix,
                                                    candidate_roots,
                                                    candidate_suffixes)

        candidate_suffixes.append("")
        candidate_roots.append(to_lower(surface_word))
        if self.case_sensitive and TurkishStemSuffixCandidateGenerator.STARTS_WITH_UPPER.match(
                surface_word):
            candidate_suffixes.append("")
            candidate_roots.append(capitalize(surface_word))

        assert len(candidate_roots) == len(candidate_suffixes)
        TurkishStemSuffixCandidateGenerator._root_transform(candidate_roots)
        if self.asciification:
            candidate_roots = [
                asciify(candidate_root) for candidate_root in candidate_roots
            ]
            candidate_suffixes = [
                asciify(candidate_suffix)
                for candidate_suffix in candidate_suffixes
            ]
        if self.suffix_normalization:
            TurkishStemSuffixCandidateGenerator.suffix_transform(
                candidate_suffixes)
        return candidate_roots, candidate_suffixes
Example #8
def evaluate_candidate_generation(file_path, max_words=0, case_sensitive=True):
    candidate_generator = TurkishStemSuffixCandidateGenerator(
        case_sensitive=case_sensitive)
    result = []
    with open(file_path, "r", encoding="UTF-8") as f:
        for i, line in enumerate(f):
            if i % 1000 == 0:
                print("Line: {}".format(i))
            if 0 < max_words < i:
                break
            trimmed_line = line.strip(" \r\n\t")
            if trimmed_line.startswith("<"):
                continue
            else:
                parses = re.split(r"[\t ]", trimmed_line)
                surface = parses[0]
                candidates = candidate_generator.get_analysis_candidates(
                    surface)
                roots = []
                suffixes = []
                tags = []
                analyzes = parses[1:]
                gold_root = get_root_from_analysis(analyzes[0])
                if not case_sensitive:
                    gold_root = to_lower(gold_root)
                gold_tag = convert_tag_list_to_str(
                    standardize_tags(get_tags_from_analysis(analyzes[0])))
                does_contain = False
                for candidate_root, candidate_suffix, candidate_tag in candidates:
                    roots.append(candidate_root)
                    suffixes.append(candidate_suffix)
                    tags.append(convert_tag_list_to_str(candidate_tag))
                    if candidate_root == gold_root and convert_tag_list_to_str(
                            candidate_tag) == gold_tag:
                        does_contain = True
                if not does_contain:
                    if gold_root in roots:
                        correct_root_candidate = gold_root
                        found_analyzes = "\n".join(tags)
                    else:
                        correct_root_candidate = "Not Exist"
                        found_analyzes = ""
                    result.append({
                        "Surface Word": surface,
                        "Gold root": gold_root,
                        "Gold Tags": gold_tag,
                        "Selected root candidate": correct_root_candidate,
                        "Found Tag Sequences": found_analyzes
                    })
    df = pd.DataFrame(result,
                      index=None,
                      columns=[
                          "Surface Word", "Gold root", "Gold Tags",
                          "Selected root candidate", "Found Tag Sequences"
                      ])
    df.to_excel("Candidate Generation Error Analysis.xlsx")
Example #9
 def read_stem_list(self):
     with open(TurkishStemSuffixCandidateGenerator.STEM_LIST_FILE_PATH,
               "r",
               encoding="UTF-8") as f:
         for line in f:
             splits = line.strip().split("\t")
             stem = splits[0]
             if not self.case_sensitive:
                 stem = to_lower(stem)
             flag = int(splits[1].strip())
             postags = TurkishStemSuffixCandidateGenerator._parse_flag(flag)
             if stem in self.stem_dic:
                 self.stem_dic[stem] = list(
                     set(list(postags) + self.stem_dic[stem]))
             else:
                 self.stem_dic[stem] = postags
Example #10
def process_column(list_data, column_name, args):
    categories = set(list_data)
    categories = list(utils.to_lower(list(categories)))
    print("low categories", categories)

    binar = utils.check_binar(categories)

    if binar:
        print("binar category")
        binar_data = simple_binarization(list_data, categories, column_name)

    else:
        print("multivariate category")
        binar_data = binar_column(list_data, categories, column_name)

    return binar_data
Example #11
def create_json_categories(df_data, path):
	#get the columns in data frame
	columns = list(df_data.columns.values)

	list_categories = []
	for column in columns:
		#get levels per column
		levels = df_data[column].value_counts().index.tolist()
		frecuencies = df_data[column].value_counts().values.tolist()

	#check if the column is a binary category
		binar = utils.check_binar(utils.to_lower(df_data[column].value_counts().index.tolist()))

		#create dict of levels and categories
		dict_levels = create_column_json(levels, frecuencies, column, binar)
		#add every item in the dict to a list
		append_data_dict(list_categories, dict_levels)

	#print("list categories",list_categories)
	with open(path + "/Data/categories.json", 'w') as f:
		json.dump(list_categories, f)  # dump writes to f; dumps(obj, f) would misuse f as skipkeys
Example #12
 def __init__(self, string, opinions):
     self.string = string
     self.opinions = opinions
     self.tokens = utils.filter_symbol(utils.to_lower(tokenize(string)))
     self.ids = None
Example #13
    def get_analysis_candidates(self, surface_word):
        if to_lower(surface_word) in self.exact_lookup_table:
            cur_candidate_analyzes = []
            analyzes = self.exact_lookup_table[to_lower(surface_word)]
            for analysis in analyzes:
                suffix = analysis.split("/")[0]
                parts = TurkishStemSuffixCandidateGenerator.TAG_SEPARATOR_REGEX.split(
                    analysis.split("/")[1])
                root, tags = parts[0], parts[1:]
                cur_candidate_analyzes.append((root, suffix, tags))
            return cur_candidate_analyzes
        candidate_analyzes = []
        candidate_analyzes_str = []
        candidate_roots, candidate_suffixes = self.get_stem_suffix_candidates(
            surface_word)
        for candidate_root, candidate_suffix in zip(candidate_roots,
                                                    candidate_suffixes):
            if TurkishStemSuffixCandidateGenerator.NON_WORD_REGEX.match(
                    candidate_root):
                if TurkishStemSuffixCandidateGenerator.CONTAINS_NUMBER_REGEX.match(
                        candidate_root):
                    stem_tags = ["Num", "Noun+Time"]
                else:
                    stem_tags = ["Punc"]
            elif len(candidate_suffix) == 0 and candidate_root not in self.stem_dic:
                # stem_tags = ["Noun", "Noun+Prop"]
                continue
            elif candidate_root not in self.stem_dic:
                if "'" in candidate_suffix and candidate_suffix in self.suffix_dic:
                    stem_tags = ["Noun+Prop"]
                else:
                    continue
            else:
                stem_tags = self.stem_dic[candidate_root]
                if not TurkishStemSuffixCandidateGenerator.STARTS_WITH_UPPER.match(candidate_root) \
                        and "Noun+Prop" in stem_tags:
                    stem_tags.remove("Noun+Prop")
                elif TurkishStemSuffixCandidateGenerator.STARTS_WITH_UPPER.match(candidate_root) \
                        and "Noun+Prop" in stem_tags:
                    stem_tags = ["Noun+Prop"]
                elif candidate_suffix.startswith("'") and candidate_suffix in self.suffix_dic \
                        and "Noun+Prop" in stem_tags:
                    stem_tags = ["Noun+Prop"]
                elif TurkishStemSuffixCandidateGenerator.STARTS_WITH_UPPER.match(
                        candidate_root):
                    continue

            candidate_tags = self.get_tags(candidate_suffix, stem_tags)
            cur_candidate_analyzes = []
            for candidate_tag in candidate_tags:
                # Deduplicate candidates by their normalized analysis string.
                analysis_str = to_lower(candidate_root) + "+" + \
                    "+".join(candidate_tag).replace("+DB", "^DB")
                if analysis_str not in candidate_analyzes_str:
                    cur_candidate_analyzes.append(
                        (to_lower(candidate_root), candidate_suffix,
                         candidate_tag))
                    candidate_analyzes_str.append(analysis_str)
            candidate_analyzes += cur_candidate_analyzes
        if len(candidate_analyzes) == 0:
            candidate_analyzes.append((to_lower(surface_word), "", "Unknown"))
        return candidate_analyzes
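An illustrative call (hedged: the actual candidate set depends on the stem and suffix dictionaries the generator loads):

generator = TurkishStemSuffixCandidateGenerator(case_sensitive=True)
for root, suffix, tags in generator.get_analysis_candidates("evler"):
    print(root, suffix, tags)
# Each candidate is a (root, suffix, tag-list) triple, e.g.
# ('ev', 'ler', ['Noun', 'A3pl']); unknown words fall back to
# (surface, '', 'Unknown') as in the last lines above.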
Example #14
def load_data(file_path,
              max_sentence=0,
              add_gold_labels=True,
              case_sensitive=False):
    sentences = []
    sentence = []
    candidate_generator = TurkishStemSuffixCandidateGenerator(
        case_sensitive=case_sensitive)
    with open(file_path, "r", encoding="UTF-8") as f:
        for i, line in enumerate(f):
            if 0 < max_sentence < i:
                break
            trimmed_line = line.strip(" \r\n\t")
            trimmed_line = trimmed_line.replace("s", "s")
            if trimmed_line.startswith("<S>") or trimmed_line.startswith(
                    "<s>"):
                sentence = []
            elif trimmed_line.startswith("</S>") or trimmed_line.startswith(
                    "</s>"):
                if len(sentence) > 0:
                    sentences.append(sentence)
            elif (len(trimmed_line) == 0 or "<DOC>" in trimmed_line
                  or trimmed_line.startswith("</DOC>")
                  or trimmed_line.startswith("<TITLE>")
                  or trimmed_line.startswith("</TITLE>")):
                pass
            else:
                parses = re.split(r"[\t ]", trimmed_line)
                surface = parses[0]
                candidates = candidate_generator.get_analysis_candidates(
                    surface)
                roots = []
                suffixes = []
                tags = []
                ambiguity_level = 0
                if add_gold_labels:
                    analyzes = parses[1:]
                    ambiguity_level = len(analyzes)
                    gold_root = get_root_from_analysis(analyzes[0])
                    gold_root = to_lower(gold_root)
                    roots.append(gold_root)
                    gold_suffix = surface[len(gold_root):]
                    if not case_sensitive:
                        gold_suffix = to_lower(gold_suffix)
                    suffixes.append(gold_suffix)
                    gold_tag = standardize_tags(
                        get_tags_from_analysis(analyzes[0]))
                    tags.append(gold_tag)

                    for candidate_root, candidate_suffix, candidate_tag in candidates:
                        if to_lower(candidate_root) != to_lower(gold_root) \
                                or "".join(candidate_tag) != "".join(gold_tag):
                            roots.append(to_lower(candidate_root))
                            suffixes.append(candidate_suffix)
                            tags.append(candidate_tag)
                        elif candidate_suffix != gold_suffix and candidate_root == gold_root:
                            suffixes[0] = candidate_suffix
                else:
                    for candidate_root, candidate_suffix, candidate_tag in candidates:
                        roots.append(candidate_root)
                        suffixes.append(candidate_suffix)
                        tags.append(candidate_tag)
                    if len(roots) == 0:
                        if TurkishStemSuffixCandidateGenerator.STARTS_WITH_UPPER.match(
                                surface):
                            candidate_tags = candidate_generator.get_tags(
                                "", stem_tags=["Noun", "Noun+Prop"])
                        else:
                            candidate_tags = candidate_generator.get_tags(
                                "", stem_tags=["Noun"])
                        for candidate_tag in candidate_tags:
                            if "Prop" in candidate_tag:
                                roots.append(capitalize(surface))
                                suffixes.append("")
                                tags.append(candidate_tag)
                            else:
                                roots.append(to_lower(surface))
                                suffixes.append("")
                                tags.append(candidate_tag)
                if not case_sensitive:
                    surface = to_lower(surface)
                    roots = [to_lower(root) for root in roots]
                    suffixes = [to_lower(suffix) for suffix in suffixes]
                current_word = WordStruct(surface, roots, suffixes, tags,
                                          ambiguity_level)
                sentence.append(current_word)
    return sentences
Example #15
def to_lower(token):
    """
    Token to lower case.
    """
    return utils.to_lower(token)
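The wrapper exists because the built-in str.lower mishandles Turkish dotted and dotless i ("I".lower() yields "i", not "ı"). A minimal sketch of what utils.to_lower presumably does (an assumption; the project's implementation may differ):

def to_lower_tr(token):
    # Map the two capital i's explicitly, then lower everything else.
    return token.replace("İ", "i").replace("I", "ı").lower()

assert to_lower_tr("ISPARTA") == "ısparta"
assert to_lower_tr("İstanbul") == "istanbul"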
Example #16
    def _add_candidate_stem_suffix(stem_candidate, suffix_candidate,
                                   candidate_roots, candidate_suffixes):
        if "'" in suffix_candidate:
            candidate_roots.append(stem_candidate)
            candidate_suffixes.append(suffix_candidate)
            return

        # Bana, Sana -> ben, sen
        if stem_candidate == "ban" and suffix_candidate == "a":
            candidate_roots.append("ben")
            candidate_suffixes.append("a")
        elif stem_candidate == "Ban" and suffix_candidate == "a":
            candidate_roots.append("Ben")
            candidate_suffixes.append("a")
        elif stem_candidate == "san" and suffix_candidate == "a":
            candidate_roots.append("sen")
            candidate_suffixes.append("a")
        elif stem_candidate == "San" and suffix_candidate == "a":
            candidate_roots.append("Sen")
            candidate_suffixes.append("a")
        else:
            candidate_roots.append(stem_candidate)
            candidate_suffixes.append(suffix_candidate)
            if len(stem_candidate) > 2 and len(suffix_candidate) > 0 \
                    and stem_candidate[-1] == suffix_candidate[0] \
                    and stem_candidate[-1] in TurkishStemSuffixCandidateGenerator.CONSONANT_STR:
                # CONSONANT DERIVATION
                # his -i > hissi, hak -ı > hakkı, red -i > reddi
                candidate_roots.append(stem_candidate)
                candidate_suffixes.append(suffix_candidate[1:])
            elif len(stem_candidate) > 1 and \
                    TurkishStemSuffixCandidateGenerator.ENDS_NARROW_REGEX.match(stem_candidate) and \
                    "yor" in suffix_candidate:
                # bekle -yor > bekliyor, atla -yor > atlıyor
                if stem_candidate.endswith("i") or stem_candidate.endswith(
                        "ü"):
                    candidate_roots.append(stem_candidate[:-1] + "e")
                    candidate_suffixes.append(suffix_candidate)
                elif stem_candidate.endswith("ı") or stem_candidate.endswith(
                        "u"):
                    candidate_roots.append(stem_candidate[:-1] + "a")
                    candidate_suffixes.append(suffix_candidate)
            if len(stem_candidate) > 2 and \
                    TurkishStemSuffixCandidateGenerator.ENDS_TWO_CONSONANT_REG.match(stem_candidate) and \
                    TurkishStemSuffixCandidateGenerator.STARTS_VOWEL_REGEX.match(suffix_candidate):
                # VOWEL DROP
                # ağız -ım > ağzım, alın -ın -a > alnına
                # burun -u > burnu, bağır -ım > bağrım, beyin -i > beyni
                suffix_start_letter = to_lower(suffix_candidate[0])
                if suffix_start_letter in ["u", "ü", "ı", "i"]:
                    candidate_roots.append(stem_candidate[:-1] +
                                           suffix_start_letter +
                                           stem_candidate[-1])
                    candidate_suffixes.append(suffix_candidate)
                elif suffix_start_letter == "e":
                    candidate_roots.append(stem_candidate[:-1] + "i" +
                                           stem_candidate[-1])
                    candidate_suffixes.append(suffix_candidate)
                    candidate_roots.append(stem_candidate[:-1] + "ü" +
                                           stem_candidate[-1])
                    candidate_suffixes.append(suffix_candidate)
                elif suffix_start_letter == "a":
                    candidate_roots.append(stem_candidate[:-1] + "ı" +
                                           stem_candidate[-1])
                    candidate_suffixes.append(suffix_candidate)
                    candidate_roots.append(stem_candidate[:-1] + "u" +
                                           stem_candidate[-1])
                    candidate_suffixes.append(suffix_candidate)
            if len(stem_candidate) > 2 and \
                    TurkishStemSuffixCandidateGenerator.ENDS_WITH_SOFT_CONSONANTS_REGEX.match(
                        stem_candidate):
                # Softening of consonants
                candidate_roots.append(
                    TurkishStemSuffixCandidateGenerator.
                    _transform_soft_consonants(stem_candidate))
                candidate_suffixes.append(suffix_candidate)
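A schematic walk-through of the vowel-drop branch (hedged: assumes _add_candidate_stem_suffix is a static method, as its four-argument call sites suggest). Splitting the surface form "burnu" as "burn" + "u" also proposes the dropped-vowel root "burun":

candidate_roots, candidate_suffixes = [], []
TurkishStemSuffixCandidateGenerator._add_candidate_stem_suffix(
    "burn", "u", candidate_roots, candidate_suffixes)
print(list(zip(candidate_roots, candidate_suffixes)))
# Expected to include both ('burn', 'u') and the vowel-drop candidate
# ('burun', 'u'), i.e. "burn"[:-1] + "u" + "burn"[-1].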
Example #17
 def __clean_firstname(self, token):
     """
     First name cleaning.
     Remove accents, convert to lower case.
     """
     return utils.to_lower(utils.substitute_accents(token))