コード例 #1
0
def init():
    """
    To be called by the library loader; do not call it in your program.

    Loads the English phonetic feature table and the ARPABET phoneme list
    from the package resources into the module-level globals.
    """

    global ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET

    ENGLISH_PHONETIC_DATA = pd.read_csv(
        common.get_resources_path() +
        '/script/english_script_phonetic_data.csv',
        encoding='utf-8')

    # NOTE: DataFrame.ix and .as_matrix() were removed in pandas 1.0;
    # .iloc + .to_numpy() is the supported equivalent for positional slicing.
    ENGLISH_PHONETIC_VECTORS = ENGLISH_PHONETIC_DATA.iloc[
        :, PHONETIC_VECTOR_START_OFFSET:].to_numpy()

    PHONETIC_VECTOR_LENGTH = ENGLISH_PHONETIC_VECTORS.shape[1]

    ### Load mapping from ARPABET representation of phoneme to internal ID
    global ARPABET_ID_MAP, ID_ARPABET_MAP

    with open(common.get_resources_path() + '/script/english_arpabet_list.csv',
              'r',
              encoding='utf-8') as infile:
        # One phoneme name per line; the 0-based line number is the internal ID.
        # (enumerate iterates the file directly; wrapping in iter() was redundant.)
        for ph_id, name in enumerate(infile):
            name = name.strip()
            ARPABET_ID_MAP[name] = ph_id
            ID_ARPABET_MAP[ph_id] = name
コード例 #2
0
def init():
    """
    To be called by the library loader; do not call it in your program.

    Loads the combined (all-script) and Tamil-specific phonetic feature
    tables from the package resources into the module-level globals.
    """

    global ALL_PHONETIC_DATA, ALL_PHONETIC_VECTORS, TAMIL_PHONETIC_DATA, TAMIL_PHONETIC_VECTORS

    ALL_PHONETIC_DATA = pd.read_csv(
        common.get_resources_path() + '/script/all_script_phonetic_data.csv',
        encoding='utf-8')
    TAMIL_PHONETIC_DATA = pd.read_csv(
        common.get_resources_path() + '/script/tamil_script_phonetic_data.csv',
        encoding='utf-8')

    # NOTE: DataFrame.ix and .as_matrix() were removed in pandas 1.0;
    # .iloc + .to_numpy() is the supported equivalent for positional slicing.
    ALL_PHONETIC_VECTORS = ALL_PHONETIC_DATA.iloc[
        :, PHONETIC_VECTOR_START_OFFSET:].to_numpy()
    TAMIL_PHONETIC_VECTORS = TAMIL_PHONETIC_DATA.iloc[
        :, PHONETIC_VECTOR_START_OFFSET:].to_numpy()
コード例 #3
0
def init():
    """
    To be called by the library loader; do not call it in your program.

    Loads the combined (all-script) and Tamil-specific phonetic feature
    tables from the package resources and derives the feature-vector
    length, populating the module-level globals.
    """

    global ALL_PHONETIC_DATA, ALL_PHONETIC_VECTORS, TAMIL_PHONETIC_DATA, TAMIL_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET

    ALL_PHONETIC_DATA = pd.read_csv(
        os.path.join(common.get_resources_path(), 'script',
                     'all_script_phonetic_data.csv'),
        encoding='utf-8')
    TAMIL_PHONETIC_DATA = pd.read_csv(
        os.path.join(common.get_resources_path(), 'script',
                     'tamil_script_phonetic_data.csv'),
        encoding='utf-8')

    # NOTE: DataFrame.ix and .as_matrix() were removed in pandas 1.0;
    # .iloc + .to_numpy() is the supported equivalent for positional slicing.
    ALL_PHONETIC_VECTORS = ALL_PHONETIC_DATA.iloc[
        :, PHONETIC_VECTOR_START_OFFSET:].to_numpy()
    TAMIL_PHONETIC_VECTORS = TAMIL_PHONETIC_DATA.iloc[
        :, PHONETIC_VECTOR_START_OFFSET:].to_numpy()

    # Both tables share the same feature set, so either works here.
    PHONETIC_VECTOR_LENGTH = ALL_PHONETIC_VECTORS.shape[1]
コード例 #4
0
def init():
    """
    To be called by the library loader; do not call it in your program.

    Loads the ITRANS-script offset map from the package resources and
    populates the transliteration lookup globals.
    """

    ### Load the ITRANS-script offset map. The map was initially generated using the snippet below (uses the old itrans transliterator)
    ### The map is modified as needed to accommodate extensions and corrections to the mappings
    #
    # base=0x900
    # l=[]
    # for i in range(0,0x80):
    #     c=chr(base+i)
    #     itrans=ItransTransliterator.to_itrans(c,'hi')
    #     l.append((hex(i),c,itrans))
    # print(l)
    #
    # pd.DataFrame(l,columns=['offset_hex','devnag_char','itrans']).to_csv('offset_itrans_map.csv',index=False,encoding='utf-8')

    itrans_map_fname = os.path.join(common.get_resources_path(),
                                    'transliterate', 'offset_itrans_map.csv')
    itrans_df = pd.read_csv(itrans_map_fname, encoding='utf-8')

    global OFFSET_TO_ITRANS, ITRANS_TO_OFFSET, DUPLICATE_ITRANS_REPRESENTATIONS

    for _, row in itrans_df.iterrows():
        itrans = row['itrans']
        offset = int(row['offset_hex'], base=16)

        OFFSET_TO_ITRANS[offset] = itrans

        if langinfo.is_consonant_offset(offset):
            ### for consonants, strip the schwa - add halant offset
            ITRANS_TO_OFFSET[itrans[:-1]].extend([offset, 0x4d])
        else:
            ### the append assumes that the maatra always comes after independent vowel in the df
            ITRANS_TO_OFFSET[itrans].append(offset)

    # BUG FIX: this constant table was previously re-assigned on every loop
    # iteration; it is loop-invariant, so assign it once, after the loop.
    DUPLICATE_ITRANS_REPRESENTATIONS = {
        'A': 'aa',
        'I': 'ii',
        'U': 'uu',
        'RRi': 'R^i',
        'RRI': 'R^I',
        'LLi': 'L^i',
        'LLI': 'L^I',
        'L': 'ld',
        'w': 'v',
        'x': 'kSh',
        'gj': 'j~n',
        'dny': 'j~n',
        '.n': '.m',
        'M': '.m',
        'OM': 'AUM'
    }
コード例 #5
0
def init():
    """
    To be called by the library loader; do not call it in your program.

    Loads the English phonetic feature table and the ARPABET phoneme list
    from the package resources into the module-level globals.
    """

    global ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET

    ENGLISH_PHONETIC_DATA = pd.read_csv(
        common.get_resources_path() +
        '/script/english_script_phonetic_data.csv',
        encoding='utf-8')

    # NOTE: DataFrame.ix and .as_matrix() were removed in pandas 1.0;
    # .iloc + .to_numpy() is the supported equivalent for positional slicing.
    ENGLISH_PHONETIC_VECTORS = ENGLISH_PHONETIC_DATA.iloc[
        :, PHONETIC_VECTOR_START_OFFSET:].to_numpy()

    PHONETIC_VECTOR_LENGTH = ENGLISH_PHONETIC_VECTORS.shape[1]

    ### Load mapping from ARPABET representation of phoneme to internal ID
    global ARPABET_ID_MAP, ID_ARPABET_MAP

    # Built-in open() with an encoding supersedes codecs.open() (Py3).
    with open(common.get_resources_path() + '/script/english_arpabet_list.csv',
              'r', encoding='utf-8') as infile:
        # One phoneme name per line; the 0-based line number is the internal ID.
        for ph_id, name in enumerate(infile):
            name = name.strip()
            ARPABET_ID_MAP[name] = ph_id
            ID_ARPABET_MAP[ph_id] = name