コード例 #1
0
 def correct_word(self, word):
     """Decode `word` with the HMM and append the result to ``self.txt``.

     The word is cut at every character that ``isletter`` rejects. Each
     non-empty run of accepted characters is passed to
     ``self.model.viterbi`` and the names of the states on the returned
     path (except the "Mispelling-start"/"Mispelling-end" markers) are
     appended, stripped, to ``self.txt``.

     A rejected character is copied to ``self.txt`` only when a decoded
     run precedes it. A single space is appended once the end of the word
     is reached. An empty `word` appends nothing.
     """
     def emit_decoded(chunk):
         # Append the state names of the Viterbi path for one run.
         _, path = self.model.viterbi(chunk)
         for _, state in path:
             if state.name not in ("Mispelling-start", "Mispelling-end"):
                 self.txt.append(state.name.strip())

     pending = word
     while pending:
         # Locate the first character isletter() rejects, if any.
         cut = None
         for pos, ch in enumerate(pending):
             if not isletter(ch):
                 cut = pos
                 break

         if cut is None:
             # Nothing left to split on: decode the rest and close the word.
             emit_decoded(pending)
             self.txt.append(" ")
             return

         if cut > 0:
             emit_decoded(pending[:cut])
             self.txt.append(pending[cut])

         # Continue with whatever follows the rejected character.
         pending = pending[cut + 1:]
         if not pending:
             self.txt.append(' ')
         
         
         
         
         
         
コード例 #2
0
    def perturbate_tweets(self):
        """Write a randomly misspelled copy of the tweets in lp_tweets.csv.

        Reads the first column of every row of ``lp_tweets.csv`` (in the
        ``csv`` folder). Each character accepted by ``isletter`` is
        replaced, with probability 0.1, by a random entry of
        ``self.error_list[ord(char) - 97]``. The perturbed tweets are then
        written to ``perturbation_tweets.csv`` in the same folder, one per
        line. Empty tweets and blank rows are left out of the output.
        """
        print("Start perturbation")

        riscrittura = []
        # Backslashes are escaped explicitly: the path is the same
        # Windows-style relative path as before, without relying on the
        # invalid escape sequence "\l".
        with open('csv\\lp_tweets.csv', 'rb') as source:
            for line in csv.reader(source):
                if not line:
                    # csv.reader yields [] for a blank line; there is no
                    # first column to perturb.
                    continue
                # Work on a list of the original characters so that a
                # replacement can never shift the positions still to visit.
                chars = list(line[0])
                for i, ch in enumerate(chars):
                    if isletter(ch) and random.random() < 0.1:
                        # NOTE(review): ord(ch) - 97 assumes isletter() only
                        # accepts 'a'..'z' -- confirm against its definition.
                        candidates = self.error_list[ord(ch) - 97]
                        r_index = random.randint(0, len(candidates) - 1)
                        chars[i] = candidates[r_index]
                if chars:
                    # Appended once per tweet, after the loop, so the tweet
                    # is kept whatever the length of the replacements.
                    riscrittura.append(''.join(chars))

        with open('csv\\perturbation_tweets.csv', 'wb') as sink:
            # All tweets are written as one CSV row whose field delimiter
            # is a newline, which puts each tweet on its own line.
            writer = csv.writer(sink, delimiter='\n')
            writer.writerows([riscrittura])

        print("End perturbation")
コード例 #3
0
ファイル: hmm.py プロジェクト: andreaangiolillo/Mispelling
 def correct_word(self, word):
     """Append the HMM-corrected form of `word` to ``self.txt``.

     The leading run of characters accepted by ``isletter`` is decoded
     with ``self.model.viterbi``; the stripped names of the states on the
     path are appended to ``self.txt``, skipping the "Mispelling-start"
     and "Mispelling-end" states. If a rejected character follows a
     non-empty run it is appended verbatim, and the method recurses on
     the text after it. A space is appended when the word is exhausted.
     An empty `word` leaves ``self.txt`` untouched.
     """
     if not word:
         return

     # Index of the first character isletter() rejects, or None.
     split_at = None
     for position, symbol in enumerate(word):
         if not isletter(symbol):
             split_at = position
             break

     letters = word if split_at is None else word[:split_at]
     if letters:
         logp, path = self.model.viterbi(letters)
         for idx, state in path:
             is_marker = (state.name == "Mispelling-start"
                          or state.name == "Mispelling-end")
             if not is_marker:
                 self.txt.append(state.name.strip())

     if split_at is None:
         # Whole word consisted of accepted characters.
         self.txt.append(" ")
         return

     if letters:
         # The rejected character is only kept when something precedes it.
         self.txt.append(word[split_at])

     rest = word[split_at + 1:]
     if rest:
         self.correct_word(rest)
     else:
         self.txt.append(' ')
         
         
         
         
         
         
コード例 #4
0
def observations_p(cleaned_tweets, perturbated_tweets):
    """Count clean-vs-perturbed letter pairs and normalise ``obs_matrix``.

    Both arguments are passed to ``file_to_string``. When the two results
    have the same length, every position whose clean character is
    accepted by ``isletter`` increments
    ``obs_matrix[ord(clean) - 97][ord(perturbed) - 97]``; otherwise an
    error message is printed and nothing is counted. In either case each
    row of ``obs_matrix`` with a non-zero sum is then divided by that sum
    in place. The matrix is printed before and after the normalisation.
    """
    print("calcolo probabilita' delle osservazioni")

    clean_text = file_to_string(cleaned_tweets)
    noisy_text = file_to_string(perturbated_tweets)

    if len(clean_text) != len(noisy_text):
        print("ERROR: le lunghezze dei due file non coincidono")
    else:
        # The two strings are aligned position by position.
        for clean_char, noisy_char in zip(clean_text, noisy_text):
            # Only positions holding a letter in the clean text count.
            if isletter(clean_char):
                obs_matrix[ord(clean_char) - 97][ord(noisy_char) - 97] += 1

    print("finito di calcolare matrice di osservazioni: ")
    for line in obs_matrix:
        print(line)

    # Turn every row of counts into relative frequencies; rows that sum
    # to zero are left unchanged.
    for row in obs_matrix:
        total = sum(row, 0.0)
        if total != 0:
            for column, count in enumerate(row):
                row[column] = float(count) / total

    print("matrice di probabilita' di osservazioni: ")
    for line in obs_matrix:
        print(line)
コード例 #5
0
def observations_p(cleaned_tweets, perturbated_tweets):
    """Build the observation probabilities in the shared ``obs_matrix``.

    The clean and perturbed inputs are each converted with
    ``file_to_string``. If the resulting strings differ in length an
    error is printed and no counting takes place. Otherwise, for every
    index where ``isletter`` accepts the clean character, the cell
    ``obs_matrix[ord(clean) - 97][ord(perturbed) - 97]`` is incremented.
    Afterwards every row whose sum is non-zero is rescaled in place by
    dividing its cells by the row sum. The matrix is printed both before
    and after rescaling.
    """
    print("calcolo probabilita' delle osservazioni")

    clean_string = file_to_string(cleaned_tweets)
    pert_string = file_to_string(perturbated_tweets)

    same_length = len(clean_string) == len(pert_string)
    if same_length:
        for position, original_char in enumerate(clean_string):
            if not isletter(original_char):
                # Non-letters in the clean text contribute nothing.
                continue
            row_index = ord(original_char) - 97
            col_index = ord(pert_string[position]) - 97
            obs_matrix[row_index][col_index] += 1
    else:
        print("ERROR: le lunghezze dei due file non coincidono")

    print("finito di calcolare matrice di osservazioni: ")
    for line in obs_matrix:
        print(line)

    for row_number in range(len(obs_matrix)):
        cells = obs_matrix[row_number]
        row_sum = 0.0
        for cell in cells:
            row_sum += cell
        if row_sum == 0:
            # Nothing observed for this row; leave it as it is.
            continue
        for column in range(len(cells)):
            cells[column] = float(cells[column]) / row_sum

    print("matrice di probabilita' di osservazioni: ")
    for line in obs_matrix:
        print(line)
コード例 #6
0
def perturbate_tweets():
    """Write a randomly misspelled copy of the tweets in clean_tweets.csv.

    Reads the first column of every row of ``clean_tweets.csv`` (in the
    ``csv`` folder). Each character accepted by ``isletter`` is replaced,
    with probability 0.1, by a random entry of
    ``tweetToCsv.error_list[ord(char) - 97]``. The perturbed tweets are
    then written to ``perturbation_tweets.csv`` in the same folder, one
    per line. Empty tweets and blank rows are left out of the output.
    """
    riscrittura = []
    # Backslashes are escaped explicitly: the path is the same
    # Windows-style relative path as before, without relying on the
    # invalid escape sequence "\c".
    with open('csv\\clean_tweets.csv', 'rb') as source:
        for line in csv.reader(source):
            if not line:
                # csv.reader yields [] for a blank line; there is no first
                # column to perturb.
                continue
            # Work on a list of the original characters so that a
            # replacement can never shift the positions still to visit.
            chars = list(line[0])
            for i, ch in enumerate(chars):
                if isletter(ch) and random.random() < 0.1:
                    # The index is drawn from the same list the replacement
                    # is read from, so it is always in range.
                    # NOTE(review): ord(ch) - 97 assumes isletter() only
                    # accepts 'a'..'z' -- confirm against its definition.
                    candidates = tweetToCsv.error_list[ord(ch) - 97]
                    r_index = random.randint(0, len(candidates) - 1)
                    chars[i] = candidates[r_index]
            if chars:
                # Appended once per tweet, after the loop, so the tweet is
                # kept whatever the length of the replacements.
                riscrittura.append(''.join(chars))

    with open('csv\\perturbation_tweets.csv', 'wb') as sink:
        # All tweets are written as one CSV row whose field delimiter is a
        # newline, which puts each tweet on its own line.
        writer = csv.writer(sink, delimiter='\n')
        writer.writerows([riscrittura])

    print("finito")