Ejemplo n.º 1
0
    def run(self):
        """
        Accept TCP clients and answer every request with the answer size.

        A listening socket is bound to ``(self.ip, self.port)`` and registered
        through ``self.addSocket``.  While ``self.isRunning()`` holds, the loop
        blocks in ``select`` on ``self.getSockets()``.  A readable listening
        socket yields a new client connection; for every other readable socket
        the request returned by ``self.getData`` is vocalized line by line
        (blank lines and lines starting with '#' are skipped).  Only the UTF-8
        byte length of the post-processed answer is sent back, then the client
        socket is closed.  The loop is stopped once ``self.isSocketsEmpty()``
        reports true.
        """
        self.acceptSock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.acceptSock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self.acceptSock.bind((self.ip, self.port))
        self.acceptSock.listen(1)
        self.addSocket(self.acceptSock)
        self.vocalizer = ArabicVocalizer.TashkeelClass('/tmp/mishkal_cache/')
        self.vocalizer.set_log_level(50)  # critical
        while self.isRunning():
            ready, _, _ = select.select(self.getSockets(), [], [])
            # the listening socket only produces new connections; it never
            # carries a request itself, so take it out of the ready list
            if self.acceptSock in ready:
                conn, addr = self.acceptSock.accept()
                self.addSocket(conn)
                ready.remove(self.acceptSock)
            # nothing but the new connection was ready
            if not ready:
                continue
            # handle outstanding requests
            for conn in ready:
                options = self.getData(conn)
                text = options['text']

                if not text:
                    # NOTE(review): the connection is neither answered nor
                    # closed here, so it stays registered - confirm that this
                    # is intended for empty requests.
                    print('Debugggg')
                    print(text)
                    print(options)
                    print('\n')
                    continue

                result = u''
                for line in text.split('\n'):
                    line = line.strip()
                    # skip blank lines and comment lines
                    if line == '' or line.startswith('#'):
                        continue

                    lineResult = self.vocalizer.tashkeel(line)
                    result += ' ' + lineResult

                    if self.isDebug():
                        print(lineResult.strip('\n').encode('utf8'))
                try:
                    # the answer is NUL-terminated before post-processing
                    answer = result + '\00'
                    print("--- %s seconds ---" %
                          (time.time() - self.start_time))
                    answer = self.post_processing(answer)
                    answer_size = str(len(answer.encode('utf-8')))
                    print(answer_size)
                    # only the size is transmitted; the answer itself is not
                    conn.send(answer_size)
                    if self.isDebug():
                        print(result.strip('\n').encode('utf8'))
                finally:
                    # always release the client socket, even if sending failed
                    self.closeSock(conn)
                if self.isSocketsEmpty():
                    # was `false`, an undefined name that raised NameError
                    self.setRunning(False)
Ejemplo n.º 2
0
def assistanttashkeel(text):
    """
    Get tashkeel with suggestions.

    A new ``TashkeelClass`` is built with its default arguments on every call
    and the value of its ``assistanttashkeel(text)`` is returned unchanged.
    """
    import tashkeel.tashkeel as ArabicVocalizer
    engine = ArabicVocalizer.TashkeelClass()
    return engine.assistanttashkeel(text)
Ejemplo n.º 3
0
def tashkeelText(text, lastmark=True):
    import tashkeel.tashkeel as ArabicVocalizer
    vocalizer = ArabicVocalizer.TashkeelClass()
    print "lastMark", lastmark
    if lastmark == "0":
        vocalizer.disableLastMark()
    vocalized_text = vocalizer.tashkeel(text)
    return vocalized_text
Ejemplo n.º 4
0
def tashkeel2(text, lastmark):
    """
    Return the result of ``tashkeelOuputHtmlSuggest(text)`` from a new vocalizer.

    ``disableLastMark()`` is called first when ``lastmark`` is the string "0"
    or any falsy value.
    """
    import tashkeel.tashkeel as ArabicVocalizer
    engine = ArabicVocalizer.TashkeelClass()
    drop_last_mark = lastmark == "0" or not lastmark
    if drop_last_mark:
        engine.disableLastMark()
    return engine.tashkeelOuputHtmlSuggest(text)
Ejemplo n.º 5
0
def assistanttashkeel(text):
    """
    Get tashkeel with suggestions.

    The vocalizer is built with ``mycache_path`` pointing at ``../tmp/``
    relative to the directory of this module, and the value of its
    ``assistanttashkeel(text)`` is returned unchanged.
    """
    import tashkeel.tashkeel as ArabicVocalizer
    module_dir = os.path.dirname(__file__)
    engine = ArabicVocalizer.TashkeelClass(
        mycache_path=os.path.join(module_dir, '../tmp/'))
    return engine.assistanttashkeel(text)
Ejemplo n.º 6
0
def tashkeel_text(text, lastmark=True):
    """
    Tashkeel text without suggestions
    """
    import tashkeel.tashkeel as ArabicVocalizer
    vocalizer = ArabicVocalizer.TashkeelClass()
    print "lastMark", lastmark
    if lastmark == "0":
        vocalizer.disable_last_mark()
    vocalized_text = vocalizer.tashkeel(text)
    return vocalized_text
def tashkeel_text(text, lastmark=True):
    """
    Tashkeel text without suggestions.

    The vocalizer is built with ``mycache_path`` pointing at ``../tmp/``
    relative to the directory of this module.  Only the exact string "0"
    triggers ``disable_last_mark()``; any other ``lastmark`` value, including
    the default ``True``, leaves the vocalizer as constructed.
    """
    module_dir = os.path.dirname(__file__)
    engine = ArabicVocalizer.TashkeelClass(
        mycache_path=os.path.join(module_dir, '../tmp/'))
    if lastmark == "0":
        engine.disable_last_mark()
    return engine.tashkeel(text)
Ejemplo n.º 8
0
def tashkeel2(text, lastmark):
    """
    Tashkeel text with suggestions.

    Returns whatever ``tashkeel_ouput_html_suggest(text)`` yields on a new
    vocalizer; ``disable_last_mark()`` is called first when ``lastmark`` is
    the string "0" or any falsy value.
    """
    import tashkeel.tashkeel as ArabicVocalizer
    engine = ArabicVocalizer.TashkeelClass()
    drop_last_mark = lastmark == "0" or not lastmark
    if drop_last_mark:
        engine.disable_last_mark()
    return engine.tashkeel_ouput_html_suggest(text)
Ejemplo n.º 9
0
def tashkeel2(text, lastmark):
    """
    Tashkeel text with suggestions.

    The vocalizer is built with ``mycache_path`` pointing at ``../tmp/``
    relative to the directory of this module.  ``disable_last_mark()`` is
    called when ``lastmark`` is the string "0" or any falsy value, then the
    result of ``tashkeel_ouput_html_suggest(text)`` is returned.
    """
    import tashkeel.tashkeel as ArabicVocalizer
    module_dir = os.path.dirname(__file__)
    engine = ArabicVocalizer.TashkeelClass(
        mycache_path=os.path.join(module_dir, '../tmp/'))
    drop_last_mark = lastmark == "0" or not lastmark
    if drop_last_mark:
        engine.disable_last_mark()
    return engine.tashkeel_ouput_html_suggest(text)
Ejemplo n.º 10
0
def showCollocations(text):
    """
    Pass the text through the vocalizer's ``statTashkeel`` and return the result.

    According to the original description, the collocations found in the
    text are looked up in a database extracted from a corpus and come back
    quoted.  NOTE(review): that behaviour lives in ``statTashkeel`` and is
    not visible here - confirm against the vocalizer.

    @param text: a given vocalized text.
    @type text: unicode.
    @return: the value returned by ``statTashkeel(text)``.
    @rtype: unicode
    """
    import tashkeel.tashkeel as ArabicVocalizer
    engine = ArabicVocalizer.TashkeelClass()
    return engine.statTashkeel(text)
Ejemplo n.º 11
0
def Comparetashkeel(text):
    """
    Score the automatic vocalization of ``text`` against ``text`` itself.

    ``text`` is treated as the correctly vocalized reference.  Its marks are
    stripped with ``araby.stripTashkeel``, the stripped text is vocalized
    again, and both versions are tokenized and compared word by word with
    ``araby.vocalizedlike``.  Both token lists are printed for inspection.

    When the two token lists differ in length no word is compared, so both
    percentages are reported as 0.00.  The same holds when the reference
    yields no tokens at all (previously this raised ZeroDivisionError).

    @param text: correctly vocalized reference text.
    @return: ``[vocalized_text, "correct:..%", "incorrect:..%", total]`` where
        ``total`` is the number of reference tokens.
    """
    import tashkeel.tashkeel as ArabicVocalizer
    # the entered text is taken as vocalized correctly
    correct_text = text
    text = araby.stripTashkeel(text)
    vocalizer = ArabicVocalizer.TashkeelClass()
    vocalized_text = vocalizer.tashkeel(text)

    # remove collocation symbols from the automatic output before comparing
    text2 = vocalized_text.replace("'", "").replace("~", "")

    list1 = vocalizer.analyzer.tokenize(correct_text)
    list2 = vocalizer.analyzer.tokenize(text2)
    print(u":".join(list1).encode('utf8'))
    print(u":".join(list2).encode('utf8'))
    correct = 0
    incorrect = 0
    total = len(list1)
    if len(list1) != len(list2):
        print("lists haven't the same length")
    else:
        for reference_word, output_word in zip(list1, list2):
            if araby.vocalizedlike(reference_word, output_word):
                correct += 1
            else:
                incorrect += 1

    # guard the percentages: an empty reference has no tokens to divide by
    if total:
        per_correct = round(correct * 100.00 / total, 2)
        per_incorrect = round(incorrect * 100.00 / total, 2)
    else:
        per_correct = per_incorrect = 0.0

    result = [
        vocalized_text,
        "correct:%0.2f%%" % per_correct,
        "incorrect:%0.2f%%" % per_incorrect, total
    ]
    return result
Ejemplo n.º 12
0
def _safe_percent(part, whole):
    """Return ``part`` as a percentage of ``whole`` rounded to two decimals.

    Gives 0.0 when ``whole`` is zero so that statistics can be printed before
    any word has been counted.
    """
    if not whole:
        return 0.0
    return round(part * 100.00 / whole, 2)


def test():
    """
    Command-line driver: vocalize (or strip) a text or a file line by line.

    Options come from ``grabargs()``.  Input is either the ``text`` option,
    split on newlines, or the file named by ``fname``; with file input every
    result line is also written to the output file (``ofname`` or
    ``<fname> (Tashkeel).txt``).  Lines starting with '# ' are skipped.

    With ``compare`` set, each line is taken as correctly vocalized, its
    stripped form is vocalized again and both are scored word by word;
    cumulative and per-line statistics are printed.  With ``progress`` set, a
    progress line is written to standard error after every input line.

    Exits through ``sys.exit`` when neither text nor file is given, or when
    the input/output file cannot be opened.
    """
    options = grabargs()

    filename = options['fname']
    outfilename = options['ofname']
    text = options['text']
    strip_tashkeel = options['strip_tashkeel']
    nocache = options['nocache']
    reducedTashkeel = options['reducedTashkeel']
    disableSyntax = options['disableSyntax']
    disableSemantic = options['disableSemantic']
    disableStat = options['disableStatistic']
    ignore = options['ignore']
    limit = options['limit']
    compare = options['compare']
    progress = options['progress']
    enable_syn_train = options['train']

    if not text and not filename:
        usage()
        sys.exit(0)

    if not text:
        try:
            myfile = open(filename)
            print("input file: %s" % filename)
            if not outfilename:
                outfilename = filename + " (Tashkeel).txt"
            print("output file: %s" % outfilename)
            outfile = open(outfilename, "w")
        except (IOError, OSError):
            # only file-system failures are expected here; anything else
            # (e.g. KeyboardInterrupt) should not be reported as a bad file
            print(" Can't Open the given File  %s" % filename)
            sys.exit()
    else:
        lines = text.split('\n')
    # all things are well, import library
    import core.adaat
    import pyarabic.araby as araby

    # number of the line being processed, starting at 1; comment lines are
    # not counted
    counter = 1
    if not limit:
        limit = 100000000
    if not strip_tashkeel:
        vocalizer = ArabicVocalizer.TashkeelClass()
        if nocache:
            vocalizer.disable_cache()
        if ignore:
            vocalizer.disable_last_mark()
        if disableSemantic:
            vocalizer.disable_semantic_analysis()
        if disableSyntax:
            vocalizer.disable_syntaxic_analysis()
        if disableStat:
            vocalizer.disable_stat_tashkeel()
        if enable_syn_train:
            vocalizer.enable_syn_train()

    nolimit = False
    # Position of the current line inside `lines` (text mode only).  It is
    # kept apart from `counter`: indexing `lines` with `counter` skipped the
    # second line and repeated a '# ' comment line forever.
    line_index = 0
    if not text:
        line = (myfile.readline()).decode('utf8')
    else:
        # str.split always returns at least one element
        line = lines[0]
    correct = 0
    incorrect = 0
    total = 0
    LettersError = 0
    WLMIncorrect = 0
    if compare:
        # header of the statistics displayed for every line
        print(
            "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct\tLine"
        )

    # An empty string ends the loop: that is end-of-file for file input and
    # an empty line for text input.
    while line and (nolimit or counter <= limit):
        if not line.startswith('# '):
            line = line.strip()
            lineCorrect = 0
            lineWLMIncorrect = 0
            # reset per line so the per-line statistics below never read an
            # undefined value (compare together with strip_tashkeel)
            lineTotal = 0
            if strip_tashkeel:
                result = araby.strip_tashkeel(line)
            else:  # vocalize line by line
                if not compare:
                    result = vocalizer.tashkeel(line)
                if compare:
                    # the input line is the reference; its stripped form is
                    # vocalized again and compared with it
                    inputVocalizedLine = line
                    inputlist = vocalizer.analyzer.tokenize(inputVocalizedLine)
                    inputUnvocalizedLine = araby.strip_tashkeel(line)
                    vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(
                        inputUnvocalizedLine)

                    outputlist = [x.get("chosen", '') for x in vocalized_dict]
                    result = u" ".join(outputlist)
                    outputlistsemi = [
                        x.get("semi", '') for x in vocalized_dict
                    ]
                    total += len(inputlist)
                    lineTotal = len(inputlist)
                    if len(inputlist) != len(outputlist):
                        print("lists haven't the same length")
                        print("%d %d" % (len(inputlist), len(outputlist)))
                        print(u"# ".join(inputlist).encode('utf8'))
                        print(u"# ".join(outputlist).encode('utf8'))
                    else:
                        for inword, outword, outsemiword in zip(
                                inputlist, outputlist, outputlistsemi):
                            simi = araby.vocalized_similarity(inword, outword)
                            if simi < 0:
                                # a negative similarity is added (negated) to
                                # the letter-error total
                                LettersError += -simi
                                incorrect += 1
                                # evaluation without last haraka
                                simi2 = araby.vocalized_similarity(
                                    inword, outsemiword)
                                if simi2 < 0:
                                    WLMIncorrect += 1
                                    lineWLMIncorrect += 1
                            else:
                                correct += 1
                                lineCorrect += 1

            if reducedTashkeel:
                # NOTE(review): the other araby calls in this function use
                # snake_case names - confirm `reduceTashkeel` exists.
                result = araby.reduceTashkeel(result)
            counter += 1

            # display stat for every line
            if compare:
                print("%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t" % (
                    counter - 1,  # id
                    _safe_percent(correct, total),  # fully Correct
                    _safe_percent(total - WLMIncorrect,
                                  total),  # Strip Correct
                    incorrect,  # fully WER
                    WLMIncorrect,  # Strip WER
                    LettersError,  # LER
                    total  # Total
                ))
                if lineTotal:
                    print("%0.2f%%\t" %
                          _safe_percent(lineCorrect, lineTotal)
                          )  # line Fully correct
                    print("%0.2f%%\t" % _safe_percent(
                        lineTotal - lineWLMIncorrect, lineTotal)
                          )  # line Strip correct

            if text:
                # results of a text given on the command line stay on one
                # line, separated by single spaces
                sys.stdout.write(result.strip('\n').encode('utf8') + ' ')
            else:
                result_line = result.encode('utf8')
                print(result_line)
                # add line and new line to output file
                outfile.write(result_line)
                outfile.write("\n")

        if progress and not nolimit:
            # `total` stays 0 unless compare is on, hence the guarded percent
            sys.stderr.write(
                "\r[%d%%]%d/%d lines    Full %0.2f Strip %0.2f     " % (
                    counter * 100 / limit,
                    counter,
                    limit,
                    _safe_percent(correct, total),  # fully Correct
                    _safe_percent(total - WLMIncorrect,
                                  total)  # Strip Correct
                ))
            sys.stderr.flush()

        # get the next line
        if not text:
            line = (myfile.readline()).decode('utf8')
        else:
            line_index += 1
            if line_index < len(lines):
                line = lines[line_index]
            else:
                line = None

    print("Done")
    if not text:
        # release the input and output files opened above
        myfile.close()
        outfile.close()
Ejemplo n.º 13
0
def compare_tashkeel(text):
    """
    Compare tashkeel between vocalized text and automatic vocalized text

    ``text`` is treated as the correctly vocalized reference.  Its stripped
    form is vocalized with ``tashkeel_ouput_html_suggest`` and every
    reference token is compared with the chosen output word.  Each word is
    rendered as a ``<span>`` whose class tells how it differs:
    ``diff`` (alike), ``diff-mark`` (alike once the 'semi' form is used),
    ``diff-word`` or ``diff-all``.

    The reference tokens are printed for inspection.  When the number of
    reference tokens differs from the number of output entries, the pairs are
    printed and the process exits through ``sys.exit()``.

    An empty reference reports 0.00% for both figures (previously this raised
    ZeroDivisionError).

    @param text: correctly vocalized reference text.
    @return: ``[displayed_html, "correct:..%, incorrect:..%"]``.
    """
    import tashkeel.tashkeel as ArabicVocalizer
    # the entered text is taken as vocalized correctly
    correct_text = text.strip()
    text = araby.strip_tashkeel(text.strip())
    vocalizer = ArabicVocalizer.TashkeelClass()
    vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(text)

    # compare the vocalized output with the correct text
    list1 = list(vocalizer.analyzer.tokenize(correct_text))
    list2 = vocalized_dict
    print(u"\t".join(list1).encode('utf8'))

    # NOTE(review): words are interpolated into the markup unescaped -
    # confirm that callers only pass trusted text.
    span_template = u" <span id='diff'  class='%s' original='%s' inflect='%s' link='%s' rule='%s'>%s</span>"
    displayed_html = u""
    correct = 0
    incorrect = 0
    total = len(list1)
    if len(list1) != len(list2):
        print("lists haven't the same length %d %d" %
              (len(list1), len(list2)))
        for word, entry in zip(list1, list2):
            print((u"'%s'\t'%s'" %
                   (word, entry.get('chosen', ''))).encode("utf8"))
        # NOTE(review): this terminates the whole process, not only the
        # comparison - kept because callers may rely on it.
        sys.exit()

    for wo1, entry in zip(list1, list2):
        wo2 = entry['chosen']
        wo2_strip = entry['semi']  # word without inflection mark
        inflect = entry['inflect']
        link = entry['link']
        rule = entry['rule']
        if araby.vocalizedlike(wo1, wo2):
            style = "diff"
            if wo2 == "\n":
                wo2 = "<br/>"
            correct += 1
        else:
            incorrect += 1
            if araby.vocalizedlike(wo1, wo2_strip):
                # only the last mark differs
                style = 'diff-mark'
            else:
                wm1 = wo1[-1:]
                wm2 = wo2[-1:]
                # the last marks are equal, or only one word ends with one
                if (araby.is_haraka(wm1) and araby.is_haraka(wm2) and wm1 == wm2) \
                or (bool(araby.is_haraka(wm1)) ^ bool(araby.is_haraka(wm2))):
                    style = "diff-word"
                else:
                    style = 'diff-all'
        displayed_html += span_template % (
            style, wo1, inflect, link, str(rule), wo2)

    # guard the percentages: an empty reference has no tokens to divide by
    if total:
        per_correct = round(correct * 100.00 / total, 2)
        per_incorrect = round(incorrect * 100.00 / total, 2)
    else:
        per_correct = per_incorrect = 0.0
    result = [
        displayed_html,
        "correct:%0.2f%%, incorrect:%0.2f%%" % (per_correct, per_incorrect)
    ]
    return result
Ejemplo n.º 14
0
import tashkeel.tashkeel as tashkeel


def reshape(text):
    """Prepare Arabic text so its characters display from right to left.

    When ``platform.system()`` reports "Darwin" the text is returned
    untouched; on every other platform it goes through
    ``arabic_reshaper.reshape`` and then ``get_display``.
    """
    if platform.system() != "Darwin":
        return get_display(arabic_reshaper.reshape(text))
    return text


if __name__ == '__main__':
    # Demo: print every line of the sample file in its original, vocalized
    # and transliterated form.  The `with` block closes the file even when a
    # line fails to process.
    with codecs.open("example_arabic_script.dat", "r", "utf-8") as f:
        transliterator = ALA_LC_Transliterator()
        # one vocalizer serves all lines; it was rebuilt for every line before
        vocalizer = tashkeel.TashkeelClass()
        for line in f:
            print("--------------Original Text--------------")
            text = line.strip()
            print(reshape(text))
            print("--------------Vocalized Text--------------")
            voc = vocalizer.tashkeel(text)
            print(reshape(voc))
            print("--------------Transliterated Text--------------")
            tr = transliterator.do(voc.strip())
            print(reshape(tr))
            print("#########################################")
Ejemplo n.º 15
0
def assistanttashkeel(text):
    """
    Return ``assistanttashkeel(text)`` as computed by a new ``TashkeelClass``.

    The vocalizer is built with its default arguments on every call.
    """
    import tashkeel.tashkeel as ArabicVocalizer
    engine = ArabicVocalizer.TashkeelClass()
    return engine.assistanttashkeel(text)
Ejemplo n.º 16
0
def test():
    """
    Command-line driver: vocalize (or strip) a text or a file line by line.

    The settings are the ten values returned by ``grabargs()``.  Input is
    either ``text``, split on newlines, or the file named ``filename``; every
    result line is printed.  Lines starting with '#' are skipped.  With
    ``compare`` set, each line is taken as correctly vocalized, its stripped
    form is vocalized again and both are scored word by word; cumulative and
    per-line statistics are printed before the result.

    Exits through ``sys.exit`` when neither text nor file is given, or when
    the file cannot be opened.
    """
    filename, text, stripTashkeel, reducedTashkeel, disableSyntax, disableSemantic, disableStat, ignore, limit, compare = grabargs(
    )
    #filename="samples/randomtext.txt"
    if not text and not filename:
        usage()
        sys.exit(0)

    if not text:
        try:
            myfile = open(filename)
        # NOTE(review): a bare except also catches KeyboardInterrupt and
        # SystemExit; the file opened here is never closed in this function.
        except:
            print " Can't Open the given File ", filename
            sys.exit()
    else:
        lines = text.split('\n')
    # all things are well, import library
    import core.adaat
    import pyarabic.araby as araby

    # number of the line being processed, starting at 1; it only grows for
    # lines that are not skipped as comments
    counter = 1
    if not limit:
        limit = 100000000
    if not stripTashkeel:
        # NOTE(review): ArabicVocalizer is not imported inside this function;
        # presumably it is a module-level import - confirm.
        vocalizer = ArabicVocalizer.TashkeelClass()
        if ignore:
            vocalizer.disableLastMark()
        if disableSemantic:
            vocalizer.disableSemanticAnalysis()
        if disableSyntax:
            vocalizer.disableSyntaxicAnalysis()
        if disableStat:
            vocalizer.disableStatTashkeel()

    #vocalizer.disableShowCollocationMark();
    #print "show delimiter", vocalizer.collo.showDelimiter;
    #nolimit = True;
    nolimit = False
    if not text:
        line = (myfile.readline()).decode('utf8')
    else:
        if len(lines) > 0:
            line = lines[0]
    # cumulative statistics over all compared lines
    correct = 0
    incorrect = 0
    total = 0
    totLetters = 0  # never read in the visible code
    LettersError = 0
    WLMIncorrect = 0
    if compare:
        # header of the statistics displayed for every line
        print "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct"

        # print "Full\tPartial\tFull correct \tfull incorrect\tpartial correct\tpartial incorrect\tWER\tLER\tTotal"

    # An empty string ends the loop: that is end-of-file for file input and
    # an empty line for text input.
    while line and (nolimit or counter <= limit):
        if not line.startswith('#'):
            # lineIncorrect = 0;
            lineCorrect = 0
            lineWLMIncorrect = 0
            if stripTashkeel:
                result = araby.stripTashkeel(line)
            else:  #vocalize line by line
                if compare:
                    # keep the reference before stripping its marks
                    vocalizedLine = line
                    line = araby.stripTashkeel(line)
                result = vocalizer.tashkeel(line)
                #compare resultLine and vocalizedLine
                if compare:
                    list1 = vocalizer.analyzer.tokenize(vocalizedLine)
                    list2 = vocalizer.analyzer.tokenize(result)
                    #print u":".join(list1).encode('utf8');
                    #print u":".join(list2).encode('utf8');
                    total += len(list1)
                    lineTotal = len(list1)
                    if len(list1) != len(list2):
                        print "lists haven't the same length"
                    else:
                        for i in range(len(list1)):
                            simi = araby.vocalizedSimilarity(
                                list1[i], list2[i])
                            if simi < 0:
                                # a negative similarity is added (negated) to
                                # the letter-error total
                                LettersError += -simi
                                incorrect += 1
                                # lineIncorrect += 1;
                                # evaluation without last haraka
                                simi2 = araby.vocalizedSimilarity(
                                    araby.stripLastHaraka(list1[i]),
                                    araby.stripLastHaraka(list2[i]))
                                if simi2 < 0:
                                    WLMIncorrect += 1
                                    lineWLMIncorrect += 1

                            else:
                                correct += 1
                                lineCorrect += 1

            #compare resultLine and vocalizedLine
            if reducedTashkeel:
                result = araby.reduceTashkeel(result)
            # print result.encode('utf8');
            counter += 1

            #display stat for every line
            # NOTE(review): the divisions by `total` raise ZeroDivisionError
            # while no token has been counted yet, and `lineTotal` is only
            # assigned in the vocalizing branch above (undefined when compare
            # and stripTashkeel are both set).
            if compare:
                # the trailing commas keep the statistics and the result on
                # one output line
                print "%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t" % (
                    counter - 1,  #id
                    round(correct * 100.00 / total, 2),  #fully Correct
                    round((total - WLMIncorrect) * 100.00 / total,
                          2),  #Strip Correct
                    incorrect,  #fully WER
                    WLMIncorrect,  #Strip WER
                    LettersError,  #LER
                    total,  #Total
                ),
                if lineTotal:
                    print "%0.2f%%\t" % round(lineCorrect * 100.00 / lineTotal,
                                              2),  #line Fully correct
                    print "%0.2f%%\t" % round(
                        (lineTotal - lineWLMIncorrect) * 100.00 / lineTotal,
                        2),  #line Strip correct

            print result.encode('utf8')
        #get the next line
        if not text:
            line = (myfile.readline()).decode('utf8')
        else:
            # NOTE(review): `counter` is used as the index into `lines`, but
            # it is already 2 after the first line (so lines[1] is never
            # read) and it does not advance on a skipped '#' line (so that
            # line is fetched again) - confirm and fix with a separate index.
            if counter < len(lines):
                line = lines[counter]
            else:
                line = None