Beispiel #1
0
def assistanttashkeel(text):
    """
    Run the assistant tashkeel on a text.

    A ``TashkeelClass`` vocalizer is created with its cache path set to the
    ``../tmp/`` directory next to this module, and the result of its
    ``assistanttashkeel`` method for ``text`` is returned unchanged.
    """
    cache_dir = os.path.join(os.path.dirname(__file__), '../tmp/')
    engine = ArabicVocalizer.TashkeelClass(mycache_path=cache_dir)
    return engine.assistanttashkeel(text)
Beispiel #2
0
def tashkeel2(text, lastmark):
    """
    Vocalize a text and return the output of the suggestion API.

    The last mark is disabled on the vocalizer when ``lastmark`` is falsy or
    equal to the string ``"0"``. The return value is whatever
    ``tashkeel_ouput_html_suggest`` produces for ``text``.
    """
    cache_dir = os.path.join(os.path.dirname(__file__), '../tmp/')
    engine = ArabicVocalizer.TashkeelClass(mycache_path=cache_dir)
    last_mark_disabled = lastmark == "0" or not lastmark
    if last_mark_disabled:
        engine.disable_last_mark()
    return engine.tashkeel_ouput_html_suggest(text)
Beispiel #3
0
def tashkeel_text(text, lastmark=True):
    """
    Tashkeel text without suggestions

    The last mark is disabled on the vocalizer when ``lastmark`` is the
    string ``"0"`` or any falsy value (``False``, ``None``, ``""``, ``0``),
    which is the same rule ``tashkeel2`` applies. The default ``True`` keeps
    the last mark enabled.

    Returns the result of the vocalizer's ``tashkeel`` method for ``text``.
    """
    cpath = os.path.join(os.path.dirname(__file__), '../tmp/')
    vocalizer = ArabicVocalizer.TashkeelClass(mycache_path=cpath)
    # The default is the boolean True, so a boolean False (or a missing
    # value) must be honoured as well as the "0" string flag.
    if lastmark == "0" or not lastmark:
        vocalizer.disable_last_mark()
    return vocalizer.tashkeel(text)
Beispiel #4
0
def test():
    """
    Command-line driver: strip, reduce, vocalize, compare or evaluate text.

    Options are read from ``grabargs()``. The input is either the inline
    ``args.text`` or the UTF-8 file ``args.filename``. File input is written
    line by line to ``args.outfile`` (``<filename>.Tashkeel.txt`` when not
    given); inline text is printed to standard output.

    Per line, the first matching mode wins:
      * command ``strip``: remove the tashkeel from the line;
      * ``args.compareto`` given: compare the line with the matching line of
        that file and display the line statistics;
      * ``args.evaluation``: strip the line, vocalize it again and compare
        the result with the original line;
      * otherwise: vocalize the line.
    Command ``reduce`` additionally passes the result through
    ``araby.reduceTashkeel``.

    Exits through ``sys.exit`` when neither text nor a file name is given,
    or when one of the files cannot be opened.
    """
    # Local import: keeps the block self-contained without touching the
    # file-level import section.
    from contextlib import ExitStack

    args = grabargs()

    filename = args.filename
    filename2 = args.compareto  # used for comparison
    compare = bool(filename2)
    outfilename = args.outfile
    text = args.text
    if not text and not filename:
        print('Try: mishkal-console.py -h')
        sys.exit(0)

    # tashkeel command
    command = args.command
    strip_tashkeel = command == "strip"
    reducedTashkeel = command == "reduce"

    # general options
    limit = args.limit
    progress = args.progress
    verbose = args.verbose

    # options
    ignore = args.ignore
    cache = args.cache
    disableSyntax = args.syntax
    disableSemantic = args.semantic
    disableStat = args.stat
    enable_syn_train = args.train
    evaluation = args.evaluation

    # Every file opened below is registered on the stack so it is closed on
    # normal completion, on sys.exit() and on unexpected errors alike.
    with ExitStack() as stack:
        myfile = None
        outfile = None
        myfile2 = None
        lines = []

        if text:
            stripped_text = text.strip()
            # Whitespace-only text yields no line at all.
            lines = stripped_text.split('\n') if stripped_text else []
        else:
            try:
                myfile = stack.enter_context(open(filename, encoding='utf8'))
            except (OSError, ValueError):
                print(" Can't Open the given File ", filename)
                sys.exit()
            print("input file:", filename)
            if not outfilename:
                outfilename = filename + ".Tashkeel.txt"
            print("output file:", outfilename)
            try:
                # Explicit encoding: the output holds Arabic text and must
                # not depend on the locale's default encoding.
                outfile = stack.enter_context(
                    open(outfilename, "w", encoding='utf8'))
            except (OSError, ValueError):
                print(" Can't Open the given File ", outfilename)
                sys.exit()

        if compare:
            try:
                myfile2 = stack.enter_context(
                    open(filename2, encoding='utf8'))
            except (OSError, ValueError):
                print(" Can't Open the given File ", filename2)
                sys.exit()
            print("input file2:", filename2)

        # all things are well, build the console helper
        myconsole = tashkeel_console.Tashkeel_console()
        # NOTE(review): the console receives the raw option value (possibly
        # empty) before the default below is computed, as in the original
        # code -- confirm this is intended.
        myconsole.limit = limit
        if not limit:
            # Default limit: the number of lines of the input actually used.
            if text:
                limit = len(lines)
            else:
                with open(filename, encoding='utf8') as counted:
                    limit = sum(1 for _ in counted)

        vocalizer = None
        if not strip_tashkeel:
            vocalizer = ArabicVocalizer.TashkeelClass()
            if cache:
                vocalizer.enable_cache()
                sys.stderr.write(" Mishkal use a cache")
            if ignore:
                vocalizer.disable_last_mark()
            if disableSemantic:
                vocalizer.disable_semantic_analysis()
            if disableSyntax:
                vocalizer.disable_syntaxic_analysis()
            if disableStat:
                vocalizer.disable_stat_tashkeel()
            if enable_syn_train:
                vocalizer.enable_syn_train()
            # if verbose option, then activate logger in ArabicVocalizer
            if verbose:
                vocalizer.enable_verbose()

        if evaluation:
            myconsole.header()

        # Iterate the input directly instead of indexing ``lines`` with
        # myconsole.counter: the traversal no longer depends on the counter's
        # starting value, and a blank line inside inline text no longer ends
        # the run early (file input already processed blank lines).
        line_source = lines if text else myfile
        for line in line_source:
            if myconsole.counter > limit:
                break
            line = line.strip()
            # matching line of the reference file, when comparing
            line_base = myfile2.readline().strip() if compare else None

            if strip_tashkeel:
                result = araby.strip_tashkeel(line)
            elif compare:
                myconsole.compare(line_base, line)
                myconsole.display_line_stat()
                result = line
                print("base :", line_base)
                print("input:", line)
            elif evaluation:
                inputUnvocalizedLine = araby.strip_tashkeel(line)
                vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(
                    inputUnvocalizedLine)
                result = u" ".join(
                    x.get("chosen", '') for x in vocalized_dict)
                myconsole.compare(line, vocalized_dict)
                # display stat for every line
                myconsole.display_line_stat()
            else:
                # vocalize line by line
                result = vocalizer.tashkeel(line)
                myconsole.total += len(araby.tokenize(line))

            if reducedTashkeel:
                result = araby.reduceTashkeel(result)

            if text:
                print(result.strip('\n'), end='')
            else:
                if verbose:
                    print(result)
                # add line and new line to output file
                outfile.write(result)
                outfile.write("\n")

            if progress:
                # show progress bar
                myconsole.progress(compare)

            myconsole.counter += 1

    if progress:
        myconsole.footer()
Beispiel #5
0
def compare_tashkeel(text):
    """
    Compare tashkeel between vocalized text and automatic vocalized text

    ``text`` is taken as the correctly vocalized reference. Its tashkeel is
    stripped, the bare text is vocalized again, and every reference token is
    compared with the automatically chosen word at the same position.

    Returns a two-item list:
      * an HTML string with one ``<span>`` per token, whose class is
        ``diff`` when the words are alike, ``diff-mark`` when the reference
        is like the word's ``semi`` form, and otherwise ``diff-word`` or
        ``diff-all`` depending on the last characters of both words;
      * a summary string ``"correct:X%, incorrect:Y%"``.

    Calls ``sys.exit()`` after printing both token lists side by side when
    the reference and the automatic output do not have the same length.
    """
    # the entered text is taken as correctly vocalized
    correct_text = text.strip()
    unvocalized = araby.strip_tashkeel(correct_text)
    cpath = os.path.join(os.path.dirname(__file__), '../tmp/')
    vocalizer = ArabicVocalizer.TashkeelClass(mycache_path=cpath)

    list2 = vocalizer.tashkeel_ouput_html_suggest(unvocalized)
    list1 = list(vocalizer.analyzer.tokenize(correct_text))
    # Print the text itself: encoding to bytes first would display a
    # b'...' repr on Python 3.
    print(u"\t".join(list1))

    total = len(list1)
    if total != len(list2):
        print("lists haven't the same length", total, len(list2))
        for word, item in zip(list1, list2):
            print(u"'%s'\t'%s'" % (word, item.get('chosen', '')))
        sys.exit()

    # NOTE(review): the values below are interpolated into HTML without
    # escaping; if ``text`` can come from an untrusted source, escape them.
    span = (u" <span id='diff'  class='%s' original='%s' inflect='%s'"
            u" link='%s' rule='%s'>%s</span>")
    pieces = []
    correct = 0
    incorrect = 0
    for wo1, item in zip(list1, list2):
        wo2 = item['chosen']
        wo2_strip = item['semi']  # words without inflection mark
        inflect = item['inflect']
        link = item['link']
        rule = item['rule']
        if araby.vocalizedlike(wo1, wo2):
            style = "diff"
            if wo2 == "\n":
                wo2 = "<br/>"
            correct += 1
        else:
            incorrect += 1
            if araby.vocalizedlike(wo1, wo2_strip):
                # only the last mark differs
                style = 'diff-mark'
            else:
                wm1 = wo1[-1:]
                wm2 = wo2[-1:]
                haraka1 = bool(araby.is_haraka(wm1))
                haraka2 = bool(araby.is_haraka(wm2))
                # same final haraka on both words, or a final haraka on
                # exactly one of them
                if (haraka1 and haraka2 and wm1 == wm2) or (haraka1 ^ haraka2):
                    style = "diff-word"
                else:
                    style = 'diff-all'
        pieces.append(span % (style, wo1, inflect, link, str(rule), wo2))
    displayed_html = u"".join(pieces)

    # A text without any token would otherwise divide by zero.
    if total:
        per_correct = round(correct * 100.00 / total, 2)
        per_incorrect = round(incorrect * 100.00 / total, 2)
    else:
        per_correct = per_incorrect = 0.0
    return [
        displayed_html,
        "correct:%0.2f%%, incorrect:%0.2f%%" % (per_correct, per_incorrect),
    ]