Python QArabicSymbolsFilter.normalize_allの例

プログラミング言語: Python

名前空間/パッケージ名: alfanous.TextProcessing

メソッド/関数: normalize_all

hotexamples.comのコード掲載数: 7

Python QArabicSymbolsFilter.normalize_all - 7件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのalfanous.TextProcessing.QArabicSymbolsFilter.normalize_allの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

QArabicSymbolsFilter(9)

normalize_all(4)

コード例 #1

ファイルを表示

    class SpellErrors(QMultiTerm):
        """
        Query that ignores the spelling errors of Arabic letters, such as:
            - ta' marbuta and ha'
            - alef maqsura and ya'
            - hamza forms

        Two terms are treated as equivalent when the configured
        ``QArabicSymbolsFilter`` returns equal ``normalize_all`` results
        for both of them.
        """

        def __init__(self, fieldname, text, boost=1.0):
            """
            Store the query parameters and build the normalization filter.

            :param fieldname: name of the index field whose terms are scanned
            :param text: the query term compared against the indexed terms
            :param boost: kept on the instance as ``self.boost``
            """
            self.fieldname = fieldname
            self.text = text
            self.boost = boost
            # Starts with the query term itself; _compare() appends every
            # indexed term found to be equivalent.
            self.words = [text]
            self.ASF = QArabicSymbolsFilter(shaping=True,
                                            tashkil=False,
                                            spellerrors=True,
                                            hamza=True)

        def _words(self, ixreader):
            """
            Yield the terms of ``self.fieldname`` equivalent to ``self.text``.

            :param ixreader: object whose ``all_terms()`` yields
                ``(field, term)`` pairs
            """
            for field, indexed_text in ixreader.all_terms():
                if field == self.fieldname:
                    if self._compare(self.text, indexed_text):
                        yield indexed_text

        def _compare(self, first, second):
            """
            Normalize both terms and compare them.

            When the terms are equivalent, ``second`` is also appended to
            ``self.words``.

            :return: the result of comparing the two normalized terms
            """
            # The former debug print of `first` was removed: this method runs
            # for every term of the field and must not write to stdout.
            eqiv = (self.ASF.normalize_all(first) == self.ASF.normalize_all(
                second))
            if eqiv:
                self.words.append(second)
            return eqiv

コード例 #2

ファイルを表示

ファイル: QueryProcessing.py プロジェクト: xsoh/alfanous

    class SpellErrors( QMultiTerm ):
        """
        Query that ignores the spelling errors of Arabic letters:
            - ta' marbuta and ha'
            - alef maqsura and ya'
            - hamza forms

        Two terms are treated as equivalent when the configured
        QArabicSymbolsFilter returns equal normalize_all results for both.
        """


        def __init__( self, fieldname, text, boost = 1.0 ):
            """
            Store the query parameters and build the normalization filter.

            @param fieldname: name of the index field whose terms are scanned
            @param text: the query term compared against the indexed terms
            @param boost: kept on the instance as self.boost
            """
            self.fieldname = fieldname
            self.text = text
            self.boost = boost
            # Starts with the query term itself; _compare() appends every
            # indexed term found to be equivalent.
            self.words = [text]
            self.ASF = QArabicSymbolsFilter( shaping = True, tashkil = False, spellerrors = True, hamza = True )


        def _words( self, ixreader ):
            """
            Yield the terms of self.fieldname that are equivalent to self.text.

            @param ixreader: object whose all_terms() yields (field, term) pairs
            """
            for field, indexed_text in ixreader.all_terms():
                if field == self.fieldname:
                    if self._compare( self.text, indexed_text ):
                        yield indexed_text

        def _compare( self, first, second ):
            """
            Normalize both terms and compare them.

            When the terms are equivalent, second is also appended to self.words.

            @return: the result of comparing the two normalized terms
            """
            # The former debug print of `first` was removed: this method runs
            # for every term of the field and must not write to stdout.
            eqiv = ( self.ASF.normalize_all( first ) == self.ASF.normalize_all( second ) )
            if eqiv:
                self.words.append( second )
            return eqiv

コード例 #3

ファイルを表示

 def __init__(self, fieldname, text, boost=1.0):
     """
     Keep the query parameters and pre-normalize every item of ``text``.

     ``self.words`` receives one ``normalize_all`` result per element
     obtained by iterating ``text``, produced by a filter created with
     only ``tashkil`` switched on.
     """
     self.fieldname = fieldname
     self.text = text
     self.boost = boost
     symbols_filter = QArabicSymbolsFilter(
         shaping=False, tashkil=True, spellerrors=False, hamza=False)
     normalized = []
     for item in text:
         normalized.append(symbols_filter.normalize_all(item))
     self.words = normalized

コード例 #4

ファイルを表示

ファイル: QueryProcessing.py プロジェクト: Alfanous-team/alfanous

        def __init__( self, fieldname, text, boost = 1.0 ):
            """
            Record the query parameters and normalize each item of text.

            self.words holds the normalize_all result of every element
            obtained by iterating text; the filter used is created with
            tashkil enabled and shaping, spellerrors and hamza disabled.
            """
            self.fieldname = fieldname
            self.text = text
            self.boost = boost
            filter_options = {
                "shaping": False,
                "tashkil": True,
                "spellerrors": False,
                "hamza": False,
            }
            normalizer = QArabicSymbolsFilter( **filter_options )
            self.words = [normalizer.normalize_all( item ) for item in text]

コード例 #5

ファイルを表示

# coding: utf-8
"""
This is a test module for alfanous.TextProcessing

"""

from alfanous.TextProcessing import QArabicSymbolsFilter, unicode_

# Manual smoke test: runs only when the module is executed as a script and
# prints the results of several alfanous.TextProcessing calls for inspection.
if __name__ == "__main__":
    # Run a vocalized sample word through normalize_all on a filter built
    # with its default arguments, then print the result.
    ASF = QArabicSymbolsFilter()
    TEXT = u"عاصِمٌ"
    TEXT = ASF.normalize_all(TEXT)
    print TEXT

    # WORD1 and WORD2 are literals that differ by a single diacritic mark;
    # presumably chosen to exercise the harakat comparison helpers below.
    WORD1 = unicode_(u"عَاصِمُ")
    WORD2 = unicode_(u"عَاصمُ")
    LIST_HARAKAT1 = WORD1.list_harakat()
    LIST_HARAKAT2 = WORD2.list_harakat()
    WORD3 = unicode_(u"فاعل")
    PHRASE = unicode_(u"كانَ")
    # Apply the harakat list taken from WORD1 to the unvocalized WORD3.
    print WORD3.apply_harakat_list(LIST_HARAKAT1)
    print LIST_HARAKAT1, "\n", LIST_HARAKAT2
    print unicode_.compare_harakat(LIST_HARAKAT1, LIST_HARAKAT2)
    print WORD1.shakl_compare(WORD1, WORD2)
    # Trailing comma: Python 2 print keeps the tokens on one output line.
    for i in PHRASE.tokenize_shakl():
        print i,

    # Same letters with and without one diacritic; prints the result of ==
    # as implemented by unicode_ (semantics not visible here).
    WORD4 = unicode_(u"عاصم")
    WORD5 = unicode_(u"عاصِم")

    print WORD4 == WORD5

コード例 #6

ファイルを表示

ファイル: TextProcessing.py プロジェクト: 01walid/alfanous

# coding: utf-8

"""
This is a test module for alfanous.TextProcessing

"""

from alfanous.TextProcessing import QArabicSymbolsFilter, unicode_


# Manual smoke test: runs only when the module is executed as a script and
# prints the results of several alfanous.TextProcessing calls for inspection.
if __name__ == "__main__":
    # Run a vocalized sample word through normalize_all on a filter built
    # with its default arguments, then print the result.
    ASF = QArabicSymbolsFilter()
    TEXT = u"عاصِمٌ"
    TEXT = ASF.normalize_all( TEXT )
    print TEXT

    # WORD1 and WORD2 are literals that differ by a single diacritic mark;
    # presumably chosen to exercise the harakat comparison helpers below.
    WORD1 = unicode_( u"عَاصِمُ" )
    WORD2 = unicode_( u"عَاصمُ" )
    LIST_HARAKAT1 = WORD1.list_harakat()
    LIST_HARAKAT2 = WORD2.list_harakat()
    WORD3 = unicode_( u"فاعل" )
    PHRASE = unicode_( u"كانَ" )
    # Apply the harakat list taken from WORD1 to the unvocalized WORD3.
    print WORD3.apply_harakat_list( LIST_HARAKAT1 )
    print LIST_HARAKAT1, "\n", LIST_HARAKAT2
    print unicode_.compare_harakat( LIST_HARAKAT1, LIST_HARAKAT2 )
    print WORD1.shakl_compare( WORD1, WORD2 )
    # Trailing comma: Python 2 print keeps the tokens on one output line.
    for i in PHRASE.tokenize_shakl():
        print i,
    
    # Created but not used in the visible part of this script.
    WORD4 = unicode_( u"عاصم" )
    WORD5 = unicode_( u"عاصِم" )

コード例 #7

ファイルを表示

ファイル: Importer.py プロジェクト: zrelli/alfanous

    def __init__( self, QC_PATH = "../../store/quranic-corpus-morpology.xml", DB = "main.db" ):
        """ make word table """

        import sqlite3

        print "connecting to database ...",
        maindb = sqlite3.connect( DB )
        cur = maindb.cursor()
        print "OK"

        print "creating tables:"
	cur.execute( """ drop table if exists wordQC""" )
        cur.execute( 
                        """ create table if not exists  wordQC(
                        gid int unique,
                        word_gid int,
                        word_id int,
                        aya_id int,
                        sura_id int,

                        word varchar(25),
                        normalised varchar(25),
                        spelled varchar(25),
                        'order' int,
                        token varchar(25),
                        arabictoken varchar(25),
                        prefixes varchar(25),
                        suffixes varchar(25),


                        pos varchar(25),
                        type varchar(25),
                        arabicpos varchar(25),
                        mood varchar(25),
                        arabicmood varchar(25),
                        'case' varchar(25),
                        arabiccase varchar(25),
                        root varchar(25),
                        arabicroot varchar(25),
                        lemma varchar(25),
                        arabiclemma varchar(25),
                        special varchar(25),
                        arabicspecial varchar(25),

                        derivation varchar(25),
                        form varchar(25),
                        gender varchar(25),
                        person varchar(25),
                        number varchar(25),
                        voice varchar(25),
                        state varchar(25),
                        aspect varchar(25),

                        primary key(gid)

                    )

                    """ )
        print ">wordQC table ... OK"


        print ">loading Qurany Corpus...",
        from PyCorpus.QuranyCorpus import API as QC
        A = QC( source = QC_PATH )
        print ".OK\n"
        IFEXIST = lambda d, attrib: d[attrib].encode( "utf-8" ) if attrib in d else ""
        gid, word_gid = 0, 0
        print ">inserting values of gid...",
        for iteration in A.all_words_generator():
            QASF = QArabicSymbolsFilter( shaping = True, 
                                         tashkil = True, 
                                         spellerrors = False, 
                                         hamza = False, 
                                         uthmani_symbols = True )
            QASF_spelled = QArabicSymbolsFilter( shaping = True, 
                                                 tashkil = True, 
                                                 spellerrors = True, 
                                                 hamza = True, 
                                                 uthmani_symbols = True
                                                 )

            QUERY = lambda d, glob: """insert into wordQC(gid,word_gid,word_id,aya_id,sura_id,'order',token,arabictoken,prefixes, suffixes,type,pos,arabicpos,mood,
                arabicmood, 'case', arabiccase, root ,arabicroot, lemma ,arabiclemma, special, arabicspecial,
                word,normalised,spelled, derivation, form ,gender, person, number,voice, state, aspect) values
                ("%(gid)d","%(word_gid)d","%(word_id)d","%(aya_id)d","%(sura_id)d","%(order)d","%(token)s","%(arabictoken)s", "%(prefixes)s", "%(suffixes)s",  "%(type)s","%(pos)s","%(arabicpos)s","%(mood)s","%(arabicmood)s",
                "%(case)s","%(arabiccase)s","%(root)s","%(arabicroot)s","%(lemma)s","%(arabiclemma)s","%(special)s","%(arabicspecial)s","%(word)s","%(normalised)s","%(spelled)s",
                "%(derivation)s","%(form)s","%(gender)s","%(person)s","%(number)s","%(voice)s","%(state)s","%(aspect)s")""" % {
										    "gid":gid,
										    "word_gid":word_gid,
										    "word_id":iteration["word_id"],
										    "aya_id":iteration["aya_id"],
										    "sura_id":iteration["sura_id"],
										    "order":order,
										    "token":IFEXIST( d, "token" ),
										    "arabictoken":IFEXIST( d, "arabictoken" ),
										    "prefixes":";".join([prefix["arabictoken"] for prefix in glob["prefixes"] ]).encode( "utf-8" ),
										    "suffixes":";".join([suffix["arabictoken"] for suffix in glob["suffixes"] ]).encode( "utf-8" ),
										    "type":IFEXIST( d, "type" ),
										    "pos":IFEXIST( d, "pos" ),
										    "arabicpos":IFEXIST( d, "arabicpos" ),
										    "mood":IFEXIST( d, "mood" ),
										    "arabicmood":IFEXIST( d, "arabicmood" ),
										    "case":IFEXIST( d, "case" ),
										    "arabiccase":IFEXIST( d, "arabiccase" ),
										    "root":IFEXIST( d, "root" ),
										    "arabicroot":IFEXIST( d, "arabicroot" ),
										    "lemma":IFEXIST( d, "lemma" ),
										    "arabiclemma":IFEXIST( d, "arabiclemma" ),
										    "special":IFEXIST( d, "special" ),
										    "arabicspecial":IFEXIST( d, "arabicspecial" ),
										    "word":iteration["word"].encode( "utf-8" ),
										    "normalised":  QASF.normalize_all( iteration["word"] ).encode( "utf-8" ),
										    "spelled": QASF_spelled.normalize_all( iteration["word"] ).encode( "utf-8" ),
										    "derivation":IFEXIST( d, "derivation" ),
										    "form":IFEXIST( d, "form" ),
										    "gender":IFEXIST( d, "gender" ),
										    "person":IFEXIST( d, "person" ),
										    "number":IFEXIST( d, "number" ),
										    "voice":IFEXIST( d, "voice" ),
										    "state":IFEXIST( d, "state" ),
										    "aspect":IFEXIST( d, "aspect" )
										    }
            word_gid += 1
            if word_gid % 1000 == 0:
                print word_gid,
            print("\n")

            order = 0
            for d in iteration["morphology"]["base"]:
                gid += 1
                order += 1
                cur.execute( QUERY( d, iteration["morphology"] ) )

        print("OK")
        maindb.commit()