Ejemplo n.º 1
0
def correctWord_sc(input_word):
    """Return up to ten candidate corrections for *input_word* with normalized scores.

    The word is upper-cased and looked up with ``trie.search(word, 4)``.
    Each candidate (whose first element is the candidate word) is scored as
    ``pro * prior[word]``, where ``dis, pro`` come from
    ``nc.editDistance(input_word, word)``; candidates with ``dis >= 4`` are
    dropped.  The ten best scores are divided by their sum.

    Args:
        input_word: the (possibly misspelled) word to correct.

    Returns:
        dict mapping candidate word -> its share of the top-ten score mass.
        Empty when no candidate survives the distance filter.  When the
        surviving scores sum to zero, every candidate maps to ``0.0``
        instead of raising ``ZeroDivisionError``.

    Raises:
        KeyError: if a candidate word is missing from ``prior``.
    """
    input_word = input_word.upper()

    scores = {}
    for cand in trie.search(input_word, 4):
        word = cand[0]
        dis, pro = nc.editDistance(input_word, word)
        if dis < 4:
            scores[word] = pro * prior[word]

    # Keep only the ten highest-scoring candidates.
    top = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)[:10]

    total = sum(score for _, score in top)
    if not total:
        # Nothing to normalize by: report the candidates with zero weight
        # rather than dividing by zero.
        return {word: 0.0 for word, _ in top}

    # float() keeps the division exact under Python 2 even if the scores
    # happen to be integers.
    return {word: score / float(total) for word, score in top}
Ejemplo n.º 2
0
def import_to_sunpinyin_user_dict (records, userdict_path=''):
    """Import (pinyin, word) records into a sunpinyin user-dictionary database.

    Args:
        records: iterable of ``(pystr, utf8str)`` pairs.  ``pystr`` is the
            pinyin spelling with syllables separated by ``'``; ``utf8str``
            is the word stored in the ``utf8str`` column.
        userdict_path: path of the SQLite database.  When empty, the path
            returned by ``get_userdict_path()`` is used.

    The ``dict`` table is created if it does not exist.  A record is skipped
    when:
      * one of its syllables is not a key of ``valid_syllables``;
      * it has fewer than 2 or more than 6 syllables;
      * ``trie.search(sysdict, utf8str)`` reports it in the system dictionary;
      * any syllable has an initial part but no final part;
      * the INSERT fails (e.g. the word already exists -- ``utf8str`` is
        UNIQUE).

    Each successful insert is committed immediately and reported on stdout.
    The connection is always closed, even if an unexpected error occurs.
    """
    userdict_path = userdict_path if userdict_path else get_userdict_path()
    db = sqlite.connect (userdict_path)

    # try/finally so the connection is released on every exit path.
    try:
        sysdict = imdict.IMDict("dict.utf8")

        sqlstring = """
            CREATE TABLE IF NOT EXISTS dict(
            id INTEGER PRIMARY KEY, len INTEGER,
            i0 INTEGER, i1 INTEGER, i2 INTEGER, i3 INTEGER, i4 INTEGER, i5 INTEGER,
            f0 INTEGER, f1 INTEGER, f2 INTEGER, f3 INTEGER, f4 INTEGER, f5 INTEGER,
            utf8str TEXT, UNIQUE (utf8str));
            """
        db.executescript (sqlstring)

        # Column order interleaves i/f pairs to match the layout of `record`.
        insert_sql = """
                    INSERT INTO dict (len, i0, f0, i1, f1, i2, f2, i3, f3, i4, f4, i5, f5, utf8str)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
                    """

        for (pystr, utf8str) in records:
            try:
                syllables = [valid_syllables[s] for s in pystr.split("'")]
            except KeyError:
                print ("[%s] has un-recognized syllables, ignoring this record!" % pystr)
                continue

            # The table only has room for six syllables (i0..i5 / f0..f5).
            if len (syllables) < 2 or len (syllables) > 6:
                continue

            # Already present in the system dictionary.
            if sysdict and trie.search (sysdict, utf8str):
                continue

            # record = [len, i0, f0, i1, f1, ..., i5, f5, utf8str]
            record = [0]*14
            record[0] = len (syllables)
            record[13] = utf8str

            c = 1
            for s in syllables:
                # Split the packed syllable code into its two stored fields
                # (presumably initial and final -- confirm against the
                # valid_syllables encoding).
                i, f = s>>12, (s&0x00ff0)>>4
                if i and not f:
                    # Incomplete syllable: abandon the whole record.
                    break
                record[c] = i
                record[c+1] = f
                c += 2
            else:
                # Reached only when no syllable triggered the break above.
                try:
                    db.execute (insert_sql, record)
                    db.commit ()
                except sqlite.Error:
                    # Most likely the UNIQUE constraint: the word is already
                    # in the user dictionary.  Best-effort import, so skip it.
                    continue
                print ("[%s] is imported into sunpinyin's userdict" % utf8str)
    finally:
        db.close()
Ejemplo n.º 3
0
def odpowiedz(trie, query):
    """Pick a reply for *query*.

    Calls ``search(trie, query, cost)`` with ``cost`` going from 0 up to
    ``MAX_COST - 1`` and stops at the first non-empty result.  A random hit
    is chosen, and the reply is a random element of that hit's last field.
    When nothing matches (or ``MAX_COST`` is not positive) a random entry of
    ``moje`` is used instead.  The chosen reply is passed through
    ``censor`` before being returned.
    """
    # Pre-bind so the fallback below works even if the loop never runs.
    rets = None
    for cost in range(MAX_COST):
        rets = search(trie, query, cost)
        if rets:
            break

    if rets:
        ret = random.choice(random.choice(rets)[-1])
    else:
        ret = random.choice(moje)
    return censor(ret)
    def _get_sentiment_orientation(self, sent_word: str,
                                   sub_sentence: str) -> str:
        """Return the orientation of a sentiment word, flipped when negated.

        The orientation is the second element of
        ``trie.search(self.sent_words, sent_word)``.  If any negation
        modifier occurs in ``sub_sentence`` (the text between the aspect and
        the sentiment word), the orientation is inverted: ``'-'`` becomes
        ``'+'`` and anything else becomes ``'-'``.
        """
        negations = (
            'nao', 'nao e', 'jamais', 'nada', 'nem', 'nenhum', 'ninguem',
            'nunca', 'tampouco', 'sem',
        )

        # Look up the stored sentiment for this word.
        polarity = trie.search(self.sent_words, sent_word)[1]

        # A single negation anywhere in the span is enough to flip it once.
        negated = any(negation in sub_sentence for negation in negations)
        if not negated:
            return polarity
        if polarity == '-':
            return '+'
        return '-'
    def _extract_keywords(self, split_sentence: list):
        '''
            Percorre a frase extraindo os aspectos e as palavras de sentimento.
            
            return : Uma lista contendo a lista de aspectos na primeira posição 
                     e uma lista de palavras de sentimento na segunda.
                     Cada item da lista de aspecto/sentimento contem um par com
                     a palavra e sua posicao na frase.
        '''

        aspects = []
        rev_sent_words = []
        for i in range(len(split_sentence)):
            word = split_sentence[i]
            if word in self.explicit_aspects1:
                aspects.append([word, i, i, ''])
            elif word in self.implicit_aspects1:
                aspects.append(
                    [self.implicit_aspects1[word], i, i, 'implicit'])
            if trie.search(self.sent_words, word)[0]:
                rev_sent_words.append([word, i, i])
        return [aspects, rev_sent_words]
Ejemplo n.º 6
0
import Levenshtein as lev
import operator
import trie
import pickle
import time
from metaphone import doublemetaphone

# Location of the pickled prior table, unpickled below into `prior`.
string = 'data/prior.txt'
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open(string, 'rb') as f:
	prior = pickle.load(f)
while True:
	# print "Please give the word:"
	input = raw_input()
	input = input.upper()
	start = time.time()
	cands = trie.search(input, 4)
	# print "Got results from trie" + str(len(cands))
	edit = {}
	prob = {}
	for i in cands:
		input_ph = doublemetaphone(input)
		word_ph = doublemetaphone(i[0])
		maxphval = -1
		
		if input_ph[0]!='' and word_ph[0]!='':
		    phonetic_val = lev.distance(input_ph[0],word_ph[0])
		    if maxphval<phonetic_val:
		        maxphval = phonetic_val
		
		if input_ph[0]!='' and word_ph[1]!= '':
		    phonetic_val = lev.distance(input_ph[0],word_ph[1])