Beispiel #1
0
    def __init__(self, db):
        self.dict_handler = DictionaryDBHandler(db)

        rules = open("data/Rule.json", "r")
        rules = rules.read()
        self.tense_structures = json.loads(rules)

        suffices = open("data/suffices.json", "r")
        self.suffices = json.loads(suffices.read())

        self.utility = Utility()
Beispiel #2
0
def test_dict():
    try:
        dicthandler = DictionaryDBHandler("data/dictionary.db")
        print("enter == to exit.. ")
        i = input("enter nepali unigram and meaning separated by comma: ")
        while i != '==':
            splitted = i.split(',')
            dicthandler.insert_unigram(splitted[0], tuple(splitted[1:len(splitted)]))
            i = input("enter nepali unigram and meaning separated by spaces: ")

        #print(dicthandler.get_english1('राम्रो'))
    except Exception as e:
        print(e)
Beispiel #3
0
def test_dict():
    try:
        dicthandler = DictionaryDBHandler("data/dictionary.db")
        print("enter == to exit.. ")
        i = input("enter nepali unigram and meaning separated by comma: ")
        while i != '==':
            splitted = i.split(',')
            dicthandler.insert_unigram(splitted[0],
                                       tuple(splitted[1:len(splitted)]))
            i = input("enter nepali unigram and meaning separated by spaces: ")

        #print(dicthandler.get_english1('राम्रो'))
    except Exception as e:
        print(e)
Beispiel #4
0
    def __init__(self, db):
        self.dict_handler = DictionaryDBHandler(db)

        rules = open("data/Rule.json", "r")
        rules = rules.read()
        self.tense_structures = json.loads(rules)

        suffices = open("data/suffices.json","r")
        self.suffices = json.loads(suffices.read())

        self.utility = Utility()
Beispiel #5
0
class RawTranslator(object):

    def __init__(self, db):
        self.dict_handler = DictionaryDBHandler(db)

        rules = open("data/Rule.json", "r")
        rules = rules.read()
        self.tense_structures = json.loads(rules)

        suffices = open("data/suffices.json","r")
        self.suffices = json.loads(suffices.read())

        self.utility = Utility()

    
    def translate(self, nepali_text):
        #try:
            # The whole portion of code below may be required to
            #   be modified later in other phases

            words = nepali_text.split()
            bigrams = [' '.join(words[x:x+2]) for x in range(len(words)-1)]

            # Check each ngram whether it is action or not 
            # In the first phase, we check for actions involving biphrases

            for i, item in enumerate(bigrams):
                eng_phrase = self.get_action(item) # checks bigram if it is action

                if eng_phrase is not None: # means phrase match found
                    # replace the phrase with english equivalent
                    nepali_text = re.sub(item, eng_phrase, nepali_text, 1)

            # now the biphrases are sustituted, we perform one by one translation of nepali words
            words = nepali_text.split()
            eng_words = []

            for x in words:
                if Utility.is_nepali(x): 
                    eng_meanings = self.dict_handler.get_english(x)

                    if len(eng_meanings) == 0:
                        # process for हरु, मा, बाट, ले, and so on
                        #print('in process suffix:')
                        temp = self.process_suffix(x)
                        if temp.strip()=='':
                            temp = self.further_process(x)
                        #print('processed :', type(temp), temp)
                        eng_words.append(temp)

                    else:
                        eng_word = '^^'.join(list(
                                map(lambda a: a.lower(), eng_meanings)
                                )
                            ) # CNF separated by ^^

                        eng_words.append(eng_word)
                else:
                    eng_words.append(x.lower())

            return ' '.join(eng_words)

        #except Exception as e:
            #print('Error in raw_translator: '+''.join(e.args))


    def get_action(self, nepali_phrase): # nepali phrase is bigram/unigram for now
        # Check if the bigram matches any form in our tenses    
        for simple_tense in self.tense_structures["Simple"]:
            for structure in self.tense_structures["Simple"][simple_tense]:
                
                struc_process = re.match('([^a-z]+)([a-z]*)', structure)
                structure_tags = struc_process.group(2)
                actual_str = struc_process.group(1)

                # Check if the phrase's last part matches fully with the structure
                # If so it is not only simple, check remaining first part too
                simple_result = re.search('[ ]+('+actual_str+')$', nepali_phrase)
                #print('checking: ', nepali_phrase, structure)
                neg = False
                if simple_result is not None:
                    #print('structure tags:', structure_tags)
                    if 'n' in structure_tags:
                        neg = True

                    # Search for match in the first part of the phrase
                    #first_part = nepali_phrase.split()[0]
                    remaining_part = re.sub(' ' +actual_str+'$', '', nepali_phrase)
                    # Now check in continuous, perfect and perfect continuous lists
                    for non_simple_tense in self.tense_structures["NonSimple"]:
                        # Here, full part won't match, so check partial
                        for each in self.tense_structures["NonSimple"][non_simple_tense]:
                            non_simple_result = re.search('(\S+)'+each+'$', remaining_part)
                            if non_simple_result is not None: 

                                #print("bibek", non_simple_tense, each, non_simple_result.group(1))
                                # Here, we have the verb root in non_simple_root, so extract verb
                                root_verb = self.utility.get_eng_verb(non_simple_result.group(1))
                                if not root_verb is None:

                                    return self.utility.get_tense(root_verb, simple_tense, non_simple_tense, negative=neg, singular='s' in structure_tags) 
                                # here return the correct tense of the verb

                    # it means no non-simple structure found like in म खतरा  छु।
                    # return the first part and tense of second part e.g return 'खतरा  am'
                    verb = self.utility.get_tense(simple_result.group(1), simple_tense, being_verb='b' in structure_tags) 
                    if verb is not None:
                        return remaining_part+' '+ verb
                    else: return None

        # Since non simple results not found, check simple only
        for simple_tense in self.tense_structures["Simple"]:
            for structure in self.tense_structures["Simple"][simple_tense]:
                
                neg = False

                struc_process = re.match('([^a-z]+)([a-z]*)', structure)
                structure_tags = struc_process.group(2)
                actual_str = struc_process.group(1)


                simple_result = re.search('(.+)'+actual_str+'$', nepali_phrase)
                #print(nepali_phrase, structure, structure in nepali_phrase)
                if simple_result is not None:
                    #print('match_simple')

                    if 'n' in structure_tags:
                        neg=True

                    # check for two possibilities: single word nep_verb
                    #                              double word nep_verb
                    nep_verb_part = simple_result.group(1)
                    if nep_verb_part=='':continue
                    root_verb = self.utility.get_eng_verb(simple_result.group(1)) # double_word
                    #print('root_verb: ', root_verb)
                    if root_verb is not None:
                        return self.utility.get_tense(root_verb, simple_tense, negative=neg, singular='s' in structure_tags) 
                        # return the correct tense of the verb
                    else:
                        root_verb = self.utility.get_eng_verb(simple_result.group(1).split()[-1] ) # single word verb
                        if root_verb is not None:
                            return nepali_phrase.split()[0]+ ' '+self.utility.get_tense(root_verb, simple_tense, negative=neg, singular='s' in structure_tags)
                        else:
                            return nepali_phrase#since no verb found, return as it is for other words may match it
                            raise RawTranslatorError('No verb found for  '+ nepali_phrase)
        return None


    def process_suffix(self, nepali_word):
        words = None
        suffx_meanings = None

        # check 'ko' at the end
        res = re.search('(\S+)को$', nepali_word)
        if res is not None:
            new_word = res.group(1)
            plural_result = self.utility.check_plural(new_word)
            if plural_result[0]:
                words = list(map(self.utility.plural, self.dict_handler.get_english(plural_result[1])))
                words = [x+"'" for x in words]
                return '^^'.join(words)
            else:
                return '^^'.join([x+"'s" for x in self.dict_handler.get_english(new_word)])

        # first check if हरु exist or not
        plural_result = self.utility.check_plural(nepali_word)
        if plural_result[0]:
            words = list(map(self.utility.plural, self.dict_handler.get_english(plural_result[1])))
            # if haru exist at last, return plural
            return '^^'.join(words)

        
        # if not then check for ले,बाट, tira, mathi, like naam 
        for suffix in self.suffices:
            res = re.search('(\S+)'+suffix.strip()+'$', nepali_word)
            if res is not None:
                new_word = res.group(1)
                suffx_meanings = self.suffices.get(suffix, '') # meanings of ले, तर्फ, etc
                # check for 'हरु' in new_word, which might still be present like in घरहरुले
                plural_result = self.utility.check_plural(new_word) 
                if plural_result[0]:
                    words = list(map(self.utility.plural, self.dict_handler.get_english(plural_result[1]))) 
                else:
                    words = self.dict_handler.get_english(new_word)

        if suffx_meanings is not None and words is not None:
            return '^^'.join(suffx_meanings) + ' ' + '^^'.join(words)
        elif words is not None:
            return '^^'.join(words)
        else: return ''

    def further_process(self, nepali_string):
        #print('in further process')
        # here, check for various possible structures of words 
        neg = ''

        # first check for imperative ones
        new = nepali_string

        new = re.sub('(होस्)$','',nepali_string)
        new = re.sub('(स्)$','', new)
        new = re.sub('(ऊ)$', 'उ', new) # like khau, jau, which have dirgha wookar
        if re.match('^न', new):
            neg = 'do not'
            new = re.sub('^न', '', new)
        #new = re.sub('(ू)$', 'ु', new) # like khau, jau, which have dirgha wookar

        r = self.utility.get_eng_verb(new)
        if r is not None and r is not '':
            return neg+' '+r

        # do search in non simple tenses
        for non_simple_tense in self.tense_structures["NonSimple"]:
            # Here, full part won't match, so check partial
            for each in self.tense_structures["NonSimple"][non_simple_tense]:
                non_simple_result = re.search('(\S+)'+each+'$', nepali_string)

                if non_simple_result is not None: 
                        # Here, we have the verb root in non_simple_root, so extract verb
                            root_verb = self.utility.get_eng_verb(non_simple_result.group(1))
                            if root_verb is not None:
                                return Utility.verb_tenses[root_verb][non_simple_tense]
Beispiel #6
0
class RawTranslator(object):
    def __init__(self, db):
        self.dict_handler = DictionaryDBHandler(db)

        rules = open("data/Rule.json", "r")
        rules = rules.read()
        self.tense_structures = json.loads(rules)

        suffices = open("data/suffices.json", "r")
        self.suffices = json.loads(suffices.read())

        self.utility = Utility()

    def translate(self, nepali_text):
        #try:
        # The whole portion of code below may be required to
        #   be modified later in other phases

        words = nepali_text.split()
        bigrams = [' '.join(words[x:x + 2]) for x in range(len(words) - 1)]

        # Check each ngram whether it is action or not
        # In the first phase, we check for actions involving biphrases

        for i, item in enumerate(bigrams):
            eng_phrase = self.get_action(item)  # checks bigram if it is action

            if eng_phrase is not None:  # means phrase match found
                # replace the phrase with english equivalent
                nepali_text = re.sub(item, eng_phrase, nepali_text, 1)

        # now the biphrases are sustituted, we perform one by one translation of nepali words
        words = nepali_text.split()
        eng_words = []

        for x in words:
            if Utility.is_nepali(x):
                eng_meanings = self.dict_handler.get_english(x)

                if len(eng_meanings) == 0:
                    # process for हरु, मा, बाट, ले, and so on
                    #print('in process suffix:')
                    temp = self.process_suffix(x)
                    if temp.strip() == '':
                        temp = self.further_process(x)
                    #print('processed :', type(temp), temp)
                    eng_words.append(temp)

                else:
                    eng_word = '^^'.join(
                        list(map(lambda a: a.lower(),
                                 eng_meanings)))  # CNF separated by ^^

                    eng_words.append(eng_word)
            else:
                eng_words.append(x.lower())

        return ' '.join(eng_words)

    #except Exception as e:
    #print('Error in raw_translator: '+''.join(e.args))

    def get_action(self,
                   nepali_phrase):  # nepali phrase is bigram/unigram for now
        # Check if the bigram matches any form in our tenses
        for simple_tense in self.tense_structures["Simple"]:
            for structure in self.tense_structures["Simple"][simple_tense]:

                struc_process = re.match('([^a-z]+)([a-z]*)', structure)
                structure_tags = struc_process.group(2)
                actual_str = struc_process.group(1)

                # Check if the phrase's last part matches fully with the structure
                # If so it is not only simple, check remaining first part too
                simple_result = re.search('[ ]+(' + actual_str + ')$',
                                          nepali_phrase)
                #print('checking: ', nepali_phrase, structure)
                neg = False
                if simple_result is not None:
                    #print('structure tags:', structure_tags)
                    if 'n' in structure_tags:
                        neg = True

                    # Search for match in the first part of the phrase
                    #first_part = nepali_phrase.split()[0]
                    remaining_part = re.sub(' ' + actual_str + '$', '',
                                            nepali_phrase)
                    # Now check in continuous, perfect and perfect continuous lists
                    for non_simple_tense in self.tense_structures["NonSimple"]:
                        # Here, full part won't match, so check partial
                        for each in self.tense_structures["NonSimple"][
                                non_simple_tense]:
                            non_simple_result = re.search(
                                '(\S+)' + each + '$', remaining_part)
                            if non_simple_result is not None:

                                #print("bibek", non_simple_tense, each, non_simple_result.group(1))
                                # Here, we have the verb root in non_simple_root, so extract verb
                                root_verb = self.utility.get_eng_verb(
                                    non_simple_result.group(1))
                                if not root_verb is None:

                                    return self.utility.get_tense(
                                        root_verb,
                                        simple_tense,
                                        non_simple_tense,
                                        negative=neg,
                                        singular='s' in structure_tags)
                                # here return the correct tense of the verb

                    # it means no non-simple structure found like in म खतरा  छु।
                    # return the first part and tense of second part e.g return 'खतरा  am'
                    verb = self.utility.get_tense(simple_result.group(1),
                                                  simple_tense,
                                                  being_verb='b'
                                                  in structure_tags)
                    if verb is not None:
                        return remaining_part + ' ' + verb
                    else:
                        return None

        # Since non simple results not found, check simple only
        for simple_tense in self.tense_structures["Simple"]:
            for structure in self.tense_structures["Simple"][simple_tense]:

                neg = False

                struc_process = re.match('([^a-z]+)([a-z]*)', structure)
                structure_tags = struc_process.group(2)
                actual_str = struc_process.group(1)

                simple_result = re.search('(.+)' + actual_str + '$',
                                          nepali_phrase)
                #print(nepali_phrase, structure, structure in nepali_phrase)
                if simple_result is not None:
                    #print('match_simple')

                    if 'n' in structure_tags:
                        neg = True

                    # check for two possibilities: single word nep_verb
                    #                              double word nep_verb
                    nep_verb_part = simple_result.group(1)
                    if nep_verb_part == '': continue
                    root_verb = self.utility.get_eng_verb(
                        simple_result.group(1))  # double_word
                    #print('root_verb: ', root_verb)
                    if root_verb is not None:
                        return self.utility.get_tense(root_verb,
                                                      simple_tense,
                                                      negative=neg,
                                                      singular='s'
                                                      in structure_tags)
                        # return the correct tense of the verb
                    else:
                        root_verb = self.utility.get_eng_verb(
                            simple_result.group(1).split()
                            [-1])  # single word verb
                        if root_verb is not None:
                            return nepali_phrase.split(
                            )[0] + ' ' + self.utility.get_tense(
                                root_verb,
                                simple_tense,
                                negative=neg,
                                singular='s' in structure_tags)
                        else:
                            return nepali_phrase  #since no verb found, return as it is for other words may match it
                            raise RawTranslatorError('No verb found for  ' +
                                                     nepali_phrase)
        return None

    def process_suffix(self, nepali_word):
        words = None
        suffx_meanings = None

        # check 'ko' at the end
        res = re.search('(\S+)को$', nepali_word)
        if res is not None:
            new_word = res.group(1)
            plural_result = self.utility.check_plural(new_word)
            if plural_result[0]:
                words = list(
                    map(self.utility.plural,
                        self.dict_handler.get_english(plural_result[1])))
                words = [x + "'" for x in words]
                return '^^'.join(words)
            else:
                return '^^'.join([
                    x + "'s" for x in self.dict_handler.get_english(new_word)
                ])

        # first check if हरु exist or not
        plural_result = self.utility.check_plural(nepali_word)
        if plural_result[0]:
            words = list(
                map(self.utility.plural,
                    self.dict_handler.get_english(plural_result[1])))
            # if haru exist at last, return plural
            return '^^'.join(words)

        # if not then check for ले,बाट, tira, mathi, like naam
        for suffix in self.suffices:
            res = re.search('(\S+)' + suffix.strip() + '$', nepali_word)
            if res is not None:
                new_word = res.group(1)
                suffx_meanings = self.suffices.get(
                    suffix, '')  # meanings of ले, तर्फ, etc
                # check for 'हरु' in new_word, which might still be present like in घरहरुले
                plural_result = self.utility.check_plural(new_word)
                if plural_result[0]:
                    words = list(
                        map(self.utility.plural,
                            self.dict_handler.get_english(plural_result[1])))
                else:
                    words = self.dict_handler.get_english(new_word)

        if suffx_meanings is not None and words is not None:
            return '^^'.join(suffx_meanings) + ' ' + '^^'.join(words)
        elif words is not None:
            return '^^'.join(words)
        else:
            return ''

    def further_process(self, nepali_string):
        #print('in further process')
        # here, check for various possible structures of words
        neg = ''

        # first check for imperative ones
        new = nepali_string

        new = re.sub('(होस्)$', '', nepali_string)
        new = re.sub('(स्)$', '', new)
        new = re.sub('(ऊ)$', 'उ',
                     new)  # like khau, jau, which have dirgha wookar
        if re.match('^न', new):
            neg = 'do not'
            new = re.sub('^न', '', new)
        #new = re.sub('(ू)$', 'ु', new) # like khau, jau, which have dirgha wookar

        r = self.utility.get_eng_verb(new)
        if r is not None and r is not '':
            return neg + ' ' + r

        # do search in non simple tenses
        for non_simple_tense in self.tense_structures["NonSimple"]:
            # Here, full part won't match, so check partial
            for each in self.tense_structures["NonSimple"][non_simple_tense]:
                non_simple_result = re.search('(\S+)' + each + '$',
                                              nepali_string)

                if non_simple_result is not None:
                    # Here, we have the verb root in non_simple_root, so extract verb
                    root_verb = self.utility.get_eng_verb(
                        non_simple_result.group(1))
                    if root_verb is not None:
                        return Utility.verb_tenses[root_verb][non_simple_tense]