Example #1
def match_using_edit_distance(word_1, word_2):
    """
    Calculate the Levenshtein distance between two words; if the distance is
    at or below a threshold, the pair is considered a match.
    The Levenshtein distance is the number of edits required to change one word into the other.
    The threshold is only applied if the combined length of the two words is greater than 6;
    otherwise the two words must be identical.

    :param word_1: (String, default none) A word of any size.
    :param word_2: (String, default none) A word of any size.
    :return:
    A Boolean: True if the words are considered a match, False otherwise.
    """
    len_word_1 = len(word_1)
    len_word_2 = len(word_2)

    if len_word_1 + len_word_2 > 6:
        threshold = np.floor((len_word_1 + len_word_2) *
                             0.2)  # Malleable threshold for larger words

        if threshold < 2:
            threshold = 2

        return edit_distance(word_1, word_2) <= threshold

    else:
        return edit_distance(word_1, word_2) == 0
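
A quick usage sketch, assuming the imports the function relies on (numpy as np and nltk's edit_distance) are in scope, since they do not appear in the excerpt:

import numpy as np
from nltk import edit_distance

print(match_using_edit_distance('color', 'colour'))  # threshold = max(floor(11 * 0.2), 2) = 2, distance 1 -> True
print(match_using_edit_distance('cat', 'car'))       # combined length is 6, so an exact match is required -> False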
Example #2
def get_similar_names(team_name, all_team_names):
    similar_team_names = []

    # Go through the mapping
    for name in NAME_MAPPING:
        if name.startswith(team_name):
            similar_team_names.append(NAME_MAPPING[name])

        if team_name[0] == name[0] and nltk.edit_distance(name,
                                                          team_name) <= 2:
            # Match if no more than 2 substitutions, insertions or deletions are needed
            # AND the first letters are the same
            similar_team_names.append(NAME_MAPPING[name])

    # Go through all_team_names
    for name in all_team_names:
        if name.startswith(team_name):
            similar_team_names.append(name)

        if team_name[0] == name[0] and nltk.edit_distance(name,
                                                          team_name) <= 2:
            # Match if no more than 2 substitutions, insertions or deletions are needed
            # AND the first letters are the same

            similar_team_names.append(name)

    return similar_team_names
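
NAME_MAPPING is defined outside this excerpt; a small sketch with a hypothetical mapping shows how both the prefix rule and the edit-distance rule contribute candidates:

import nltk

NAME_MAPPING = {'Arsenal FC': 'Arsenal', 'Arsenal Tula': 'Arsenal Tula'}  # hypothetical mapping

print(get_similar_names('Arsenal', ['Arsenal FC', 'Chelsea FC']))
# -> ['Arsenal', 'Arsenal Tula', 'Arsenal FC'] (a name matching both rules would be appended twice)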
Example #3
    def _prune_suggestions_using_editdist_dm(self, word, suggested_corrections):
        suggested_corrections = map(lambda suggestion: suggestion.lower(), suggested_corrections)
        suggested_corrections = filter(lambda suggestion: suggestion[0] == word[0], suggested_corrections)

        _suggestions = []
        for suggested_word in suggested_corrections:
            e_distance = nltk.edit_distance(word, suggested_word)
            if e_distance <= SpellCorrector.MAX_EDIT_DISTANCE_THRESHOLD:
                _suggestions.append((suggested_word, e_distance))

        suggested_corrections = _suggestions
        _suggestions = []

        word_dms = doublemetaphone(word)  # doublemetaphones of word
        for suggested_word, e_distance in suggested_corrections:
            suggested_word_dms = doublemetaphone(suggested_word)
            dme_distance = 1000
            for dm in word_dms:
                for sw_dm in suggested_word_dms:
                    dme_distance = min(dme_distance, nltk.edit_distance(dm, sw_dm))
            if dme_distance <= SpellCorrector.DOUBLE_METAPHONE_MAX_EDIT_DISTANCE_THRESHOLD:
                _suggestions.append((suggested_word, (e_distance, dme_distance)))

        suggested_corrections = _suggestions
        _suggestions = []
        for suggested_word, (e_distance, dme_distance) in suggested_corrections:
            freq = english_words.get(suggested_word, len(english_words) + 1)
            _suggestions.append((suggested_word, freq))

        # Reranked on usage
        suggested_corrections = sorted(_suggestions, key=lambda x: x[1])
        suggested_corrections = [suggested_word for suggested_word, _ in suggested_corrections]

        return suggested_corrections
    def closeEnough(self, strToCmp, errors):
        names = self._cats.loc[:, 'enname'].to_list()
        names = [str(x).lower() for x in names]
        # edit distance of everything in the tsv
        dss = list(map(lambda x: nl.edit_distance(x, strToCmp), names))

        closest = [i for i, x in enumerate(dss) if x == min(dss)]

        # from dictionary
        distancedict = defaultdict(list)
        for i in self._customnames:
            distancedict[nl.edit_distance(strToCmp, i.lower())].append(
                self._customnames[i])
        customnames = []
        try:
            customnames = min(distancedict.items())
        except ValueError:  # empty custom names
            customnames.append(errors + 1)
        if min(dss) > errors and customnames[0] > errors:  # both were too bad
            return None
        if min(dss) < customnames[0]:  # normal names were better
            return [closest, min(dss)]  # all of the closest matches and their shared distance
        else:  # custom names were better
            return [customnames[1],
                    customnames[0]]  # the best matches of all custom names
def similarityToSystemHeader(headerName):
    mindistance = [0, nltk.edit_distance(system_libraries[0], headerName[1])]
    for i in range(2, len(system_libraries), 2):
        tmp = nltk.edit_distance(system_libraries[i], headerName[1])
        if mindistance[1] > tmp:
            mindistance = [i, tmp]
    return system_libraries[mindistance[0]]
    def typing_error(self, theta1, theta2):
        '''
        Check self.sentence for typing errors.
        :param theta1: (float): probability threshold for flagging an error
        :param theta2: (float): threshold used instead of theta1 when the trigram context is ambiguous (5+ candidates)
        :return error_token: (set): tokens flagged as likely typing errors
        '''
        error_token = set()
        sep = ':-:'
        min_edit = 4

        tokens = nltk.word_tokenize(self.sentence)
        refined_tokens = []
        for token in tokens:
            if any(char.isdigit() for char in token):
                continue
            else:
                refined_tokens.append(token)
        for trigram in ngrams(refined_tokens, 3):
            given_phrase = sep.join(trigram)
            pre_tok, target_tok, next_tok = trigram[0], trigram[1], trigram[2]
            total_freq = 0
            try:
                for predict_tok in tri_vocab[pre_tok + sep + next_tok]:
                    predict_phrase = pre_tok + sep + predict_tok + sep + next_tok
                    total_freq = total_freq + tri_frequency[predict_phrase]
                # if target_tok == 'Maderote':
                # 	print(pre_tok, next_tok)

                ambiguity = len(tri_vocab[pre_tok + sep + next_tok])
                theta = theta1
                for predict_tok in tri_vocab[pre_tok + sep + next_tok]:
                    predict_phrase = pre_tok + sep + predict_tok + sep + next_tok
                    if predict_phrase != given_phrase:
                        prob = round(
                            tri_frequency[predict_phrase] / float(total_freq),
                            5)
                        if nltk.edit_distance(target_tok,
                                              predict_tok) < len(target_tok):
                            if nltk.edit_distance(target_tok,
                                                  predict_tok) < min_edit:
                                if total_freq < 5:
                                    error_token.add(target_tok)
                                else:
                                    if ambiguity >= 5:
                                        theta = theta2
                                    # elif ambiguity >= 3:
                                    # 	theta = 0.5
                                    if prob >= theta:
                                        error_token.add(target_tok)

            except KeyError as k:
                # print(k)
                # print(pre_tok + sep + next_tok)
                pass

        return error_token
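
tri_vocab and tri_frequency are built elsewhere in the project; a minimal sketch of how they might be constructed, using the same ':-:'-separated keys (an assumption, not the original preprocessing):

import nltk
from nltk.util import ngrams

sep = ':-:'
tri_vocab = {}       # (prev, next) context key -> set of candidate middle tokens
tri_frequency = {}   # full trigram phrase -> count

corpus_sentences = ['the quick brown fox', 'the quick brown dog']  # placeholder corpus
for sentence in corpus_sentences:
    for pre, mid, nxt in ngrams(nltk.word_tokenize(sentence), 3):
        tri_vocab.setdefault(pre + sep + nxt, set()).add(mid)
        phrase = pre + sep + mid + sep + nxt
        tri_frequency[phrase] = tri_frequency.get(phrase, 0) + 1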
Example #7
def func(word):
    # Return the entry in X[:, 0] (X is a global array defined elsewhere) closest to `word` by edit distance.
    best_dist = 999999999
    output = ""
    for i in range(X.shape[0]):
        dist = nltk.edit_distance(X[i][0], word)
        if dist < best_dist:
            best_dist = dist
            output = X[i][0]

    return output
Example #8
def outputResults(text, code):
    with open('test/text{}.txt'.format(sys.argv[1]), 'r') as f:
        data = f.read()
        print('DATA:\n' + data)
        print('Difference between tesseract and real text is:')
        print(nltk.edit_distance(text, data))
        print('Difference between optimization and real text is:')
        print(nltk.edit_distance(code, data))
        for i in range(len(code)):
            if code[i] != data[i]:
                print(code[i] + '\t\t\t' + data[i])
Example #9
def Similarity(var,
               fields=[
                   'Passpord Card no', 'Nationality', 'Surname', 'Given Names',
                   'Sex', 'Date of Birth', 'Place of Birth'
               ]):
    min = len(var) / 3
    fieldMin = ' '
    for field in fields:
        if nltk.edit_distance(field, var) <= min or field.__contains__(var):
            min = nltk.edit_distance(field, var)
            fieldMin = field
    return fieldMin
Example #10
def similarity(
    var,
    fields=[
        'Passpord Card no', 'Nationality', 'Surname', 'Given Names', 'Sex',
        'Date of Birth', 'Place of Birth'
    ]):  # gets some string and returns the field name similar to it if any
    min = len(var) / 3
    field_min = ' '
    for field in fields:
        if nltk.edit_distance(field, var) <= min or field.__contains__(var):
            min = nltk.edit_distance(field, var)
            field_min = field
    return field_min
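
A quick check of the field matcher above (note a field also matches when it merely contains the query string):

import nltk

print(similarity('Natiomality'))  # within len/3 edits of 'Nationality' -> 'Nationality'
print(similarity('xyz'))          # no field close enough -> ' '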
Example #11
    def findNearestWords(self, wordlist, actualWord, tag):
        distance = []
        for term in wordlist:
            if term[0][0] not in string.punctuation:
                wordTowordDistance = nltk.edit_distance(actualWord, term[0].lower())
                lemma = self.lemmatizer.lemmatize(term[0].lower(), self.get_wordnet_pos(tag))
                lemmaTowordDistance = nltk.edit_distance(actualWord, lemma)
                if wordTowordDistance <= self.maxDistance:
                    distance.append([term[0].lower(), wordTowordDistance])
                elif lemmaTowordDistance <= self.maxDistance:
                    distance.append([lemma, lemmaTowordDistance])

        distance.sort(key=lambda x: x[1])
        return distance
Example #12
def getDiseases(inputText):
    # for each word in the input text, check whether it matches a known disease name
    # read disease names from the dataset
    Data  = pd.read_csv('Disease-symptoms-medication-dataset.csv')
    allDiseases = list(Data['Disease'])
    answer=[]
    wordnet_lemmatizer = WordNetLemmatizer()
    for word in inputText:
        word_new = wordnet_lemmatizer.lemmatize(word)
        for diseases in allDiseases:
            if (word == diseases or word_new == diseases) and diseases not in answer:
                answer.append(diseases)
            if (nltk.edit_distance(word,diseases) <= 2 or nltk.edit_distance(word_new,diseases) <=2) and diseases not in answer:
                answer.append(diseases)
    return answer
Example #13
def get_min_lingual_distance(w, lst, pos=None):
    if not isinstance(lst, list): lst = [lst]
    if len(lst) <= 0: return w
    best = (lst[0], get_distance(w, lst[0], pos=pos)[0])
    for i in range(1, len(lst)):
        dst = get_distance(w, lst[i], pos=pos)
        if dst[0] < best[1]: best = (lst[i], dst[0])
        elif dst[0] == best[1]:  # if the distance is the same (most often 0 and 0)...
            # if edit distance is less, that's the better option
            if (nltk.edit_distance(best[0], w) / max(len(best[0]), 1) >
                    nltk.edit_distance(lst[i], w) / max(len(lst[i]), 1)):
                best = (lst[i], dst[0])
    return best
Example #14
    def nametoenemies(
            self, stringtosearch,
            errors):  # TODO refine failing to get data for whatever reason
        results = None
        try:
            conn = sqlite3.connect('stages.db')
            cursor = conn.cursor()
            query = '''select * from searchunitstages'''  # TODO this is going to change later
            stagenames = cursor.execute(query).fetchall()
            stagenames = [x[0].lower() for x in stagenames]
            dss = list(
                map(lambda x: nl.edit_distance(x, stringtosearch), stagenames))
            if min(dss) > errors:
                results = -1
                raise Exception('String could not match anything.')
            nearestmatch = [i for i, x in enumerate(dss) if x == min(dss)]
            if len(nearestmatch) > 1:
                results = -2
                raise Exception('Could not discriminate.')
            else:
                nearestmatch = [(stagenames[nearestmatch[0]])]
                cursor.execute(
                    'SELECT * from enemylines, stage where stage.stage_id=enemylines.stage_appearance and '
                    'LOWER(name)=?', nearestmatch)
                results = cursor.fetchall()
        except:
            print('something went wrong')
        finally:
            conn.close()
            return results
Example #15
def query_list_of_words(target_word, list_of_words, edit_distance=1):
    """
    Checks whether a target word is within editing distance of any one in a set of keywords.

    Inputs: - target_word: A string containing the word we want to search in a list.
            - list_of_words: A python list of words.
            - edit_distance: For larger words, we also check for similar words based on edit_distance.

    Outputs: - new_list_of_words: This is the input list of words minus any found keywords.
             - found_list_of_words: This is the list of words that are within edit distance of the target word.
    """
    # Initialize lists
    new_list_of_words = list()
    found_list_of_words = list()

    append_left_keyword = new_list_of_words.append
    append_found_keyword = found_list_of_words.append

    # Iterate over the list of words
    for word in list_of_words:
        if len(word) > 6:
            effective_edit_distance = edit_distance
        else:
            effective_edit_distance = 0  # No edit distance for small words.
        if abs(len(word)-len(target_word)) <= effective_edit_distance:
            if nltk.edit_distance(word, target_word) <= effective_edit_distance:
                append_found_keyword(word)
            else:
                append_left_keyword(word)
        else:
            append_left_keyword(word)

    return new_list_of_words, found_list_of_words
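
A small usage sketch of the keyword query above: only words longer than six characters get the fuzzy edit-distance check, shorter words must match exactly.

import nltk

remaining, found = query_list_of_words('networks', ['network', 'networks', 'net', 'neural'])
print(found)      # ['network', 'networks']
print(remaining)  # ['net', 'neural']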
Example #16
def getSuggestion():
    sugg_final = []
    wordReq = request.args['word']
    suggest = []
    suggestFreq = []
    lis = []
    lisFreq = []
    key_list = []

    # if to check for key-----------------------------------
    if find_language(wordReq) == 'tamil':
        for x in range(-2, 3):
            inp_key = getAccKey(wordReq, x, checkLettersTamil, 'tamil')
            key_list.append(inp_key)
        for key in key_list:
            if key in hashtable_ta:
                thisKeyList = [e['word'] for e in hashtable_ta[key]]
                thisKeyListFreq = [f['freq'] for f in hashtable_ta[key]]
                for word in thisKeyList:
                    lis.append(word)
                    lisFreq.append(
                        float(thisKeyListFreq[thisKeyList.index(word)]))
    elif find_language(wordReq) == 'sinhala':
        for x in range(-2, 3):
            inp_key = getAccKey(wordReq, x, checkLettersTamil, 'sinhala')
            key_list.append(inp_key)
        for key in key_list:
            if key in hashtable_si:
                thisKeyList = [e['word'] for e in hashtable_si[key]]
                thisKeyListFreq = [f['freq'] for f in hashtable_si[key]]
                for word in thisKeyList:
                    lis.append(word)
                    lisFreq.append(
                        int(thisKeyListFreq[thisKeyList.index(word)]))
    else:
        pass

    for word in lis:
        ed = nltk.edit_distance(wordReq, word)
        if (ed <= 2):
            suggest.append(word)
            suggestFreq.append(lisFreq[lis.index(word)])
            ng_list = list(ngrams(wordReq, 2))

    sugg_dic = []

    for word in suggest:
        l = list(ngrams(word, 2))
        lis = set(l + ng_list)
        val = len(list(set(ng_list) & set(l)))
        value = val / (len(lis))
        sugg_dic.append(
            (word, value, value * (suggestFreq[suggest.index(word)])))
    sugg_dic.sort(key=takeSecond)
    sorted_x = sugg_dic[-10:]
    sorted_x.sort(key=takeThird, reverse=True)
    for sugg in sorted_x:
        sugg_final.append(sugg[0])

    return jsonify({'suggestion': sugg_final})
    def test_replace(self):
        s = 'replace a character?'
        res = self.mistakes.replace(s)
        print(f'replace: `{s}` -> `{res}`')
        self.assertEqual(len(s), len(res))
        self.assertLessEqual(edit_distance(s, res), 1)
        self.assertEqual(self.mistakes.replace(''), '')
    def test_delete(self):
        s = 'abcdefgh'

        position = 3
        res = self.mistakes.delete(s, start=position, end=position)
        print(f'delete: `{s}` -> `{res}`')
        self.assertEqual(res, s[:position] + s[position + 1:])
        self.assertEqual(edit_distance(s, res), 1)

        res = self.mistakes.delete(s)
        print(f'delete: `{s}` -> `{res}`')
        self.assertEqual(edit_distance(s, res), 1)
        self.assertEqual(len(s) - 1, len(res))
        self.assertEqual(edit_distance(s, res), 1)

        self.assertEqual(self.mistakes.delete(''), '')
Example #19
def similarity(string1, string2):
    len1 = float(len(string1))
    len2 = float(len(string2))
    lensum = len1 + len2
    levdist = float(nltk.edit_distance(string1, string2))
    similarityMetric = ((lensum - levdist) / lensum)
    return similarityMetric 
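
For example, 'kitten' and 'sitting' give lensum = 13 and levdist = 3, so the ratio is (13 - 3) / 13 ≈ 0.77:

import nltk

print(similarity('kitten', 'sitting'))  # 0.7692...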
def answer_eleven(entries=['cormulent', 'incendenece', 'validrate']):
    recommendations = []
    for e in entries:
        distances = [(nltk.edit_distance(e, a), a) for a in correct_spellings
                     if a[0] == e[0] and len(a) > 2]
        recommendations.append(sorted(distances)[0][1])
    return recommendations  # Your answer here
Example #21
    def word_jaccard(self, w1, w2):
        cw1 = [char.lower() for char in w1]
        cw2 = [char.lower() for char in w2]

        c = max(len(cw1), len(cw2)) - abs(nltk.edit_distance(w1, w2))

        return float(c) / (len(cw1) + len(cw2) - c)
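
As a worked example of this character-level Jaccard approximation: 'night' and 'nacht' have an edit distance of 3 and a maximum length of 5, so c = 5 - 3 = 2 and the score is 2 / (5 + 5 - 2) = 0.25; two identical five-letter words give c = 5 and a score of 5 / (10 - 5) = 1.0.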
Example #22
def main():
    test_file = sys.argv[1]
    output_file = sys.argv[2]
    THRESHOLD = 20.7782

    # Testing data
    df_test = read_csv(test_file)
    df_test['title1_zh'] = df_test['title1_zh'].fillna('')
    df_test['title2_zh'] = df_test['title2_zh'].fillna('')

    X_1_test = df_test['title1_zh'].tolist()
    X_2_test = df_test['title2_zh'].tolist()
    id_test = df_test['id'].tolist()

    # Predict
    y_pred = []
    for i in range(len(X_1_test)):
        dist = edit_distance(X_1_test[i], X_2_test[i])
        if dist > THRESHOLD:
            y_pred.append('unrelated')
        else:
            y_pred.append('agreed')

    # Export
    with open(output_file, 'w') as file:
        file.write('Id,Category\n')
        for i in range(len(y_pred)):
            file.write(f'{id_test[i]},{y_pred[i]}\n')
Example #23
def get_common_letters(values):
    '''
    Find the pair of words whose edit distance is exactly one and return
    the characters they share.

    :param values: list of words
    :return: string containing the shared characters, or None if no such pair is found
    '''
    distance = None
    id1, id2 = None, None
    # calculate all distances
    for i, w_id1 in enumerate(values):
        for w_id2 in values[i+1:]:
            # the precondition is that exactly one pair has a distance of one,
            # so cut the computation short and stop as soon as that pair is found
            distance = nltk.edit_distance(w_id1, w_id2)
            if distance == 1:
                id1, id2 = w_id1, w_id2
                break
        if distance == 1:
            break
    # convert the string to list
    id1_list = list(id1)
    # compare the two strings
    for i, (id_letter1, id_letter2) in enumerate(zip(id1_list, list(id2))):
        if not id_letter1 == id_letter2:
            # and replace the differential character
            id1_list[i] = ''
            return ''.join(id1_list)
    return None
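
A quick sketch of the intended use, where exactly one pair of IDs differs by a single character:

import nltk

print(get_common_letters(['abcde', 'fghij', 'fguij']))  # 'fgij' -- the differing third character is dropped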
    def location(self):
        """ Gets the location from the news story.

            Inputs include the parts of speech tagged words.
            Output is the phrase containing the location of mishap.
        """
        ktm_location = LocationInformation().all_ktm_locations()
        bkt_location = LocationInformation().all_bkt_locations()
        ltp_location = LocationInformation().all_ltp_locations()
        outside_location = LocationInformation().all_locations()
        all_locations = ktm_location + outside_location + bkt_location + ltp_location

        locations = self.location_extractor()
        print(locations)
        return_location = []
        max_ratio = 0
        max_location = []

        for glocation in locations:
            for location in all_locations:
                dist = nltk.edit_distance(glocation, location)
                ratio = (1 - (dist / len(glocation))) * 100
                max_ratio = max(max_ratio, ratio)
                if max_ratio >= 70:
                    max_location = location
                    if max_ratio == ratio:
                        if max_location in ktm_location:
                            return_location = max_location
                        elif max_location in ltp_location:
                            return_location = max_location
                        elif max_location in bkt_location:
                            return_location = max_location
                        elif max_location in outside_location:
                            return_location = max_location
        return (return_location)
Example #25
def bigram_edit_distance(code1, code2):
    bigram1 = transform_to_ngram(code1.split(), n=2)
    bigram2 = transform_to_ngram(code2.split(), n=2)
    dist = nltk.edit_distance(bigram1, bigram2)
    if dist == 0:
        print(bigram1, '   ----->  ', bigram2)
    return dist
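
transform_to_ngram is not part of this excerpt; a plausible stand-in (an assumption, shown only to make the snippet self-contained) joins each token bigram into one string so nltk.edit_distance compares bigram sequences element by element:

import nltk
from nltk.util import ngrams

def transform_to_ngram(tokens, n=2):
    # Hypothetical helper: join each n-gram of tokens into a single string element.
    return [' '.join(gram) for gram in ngrams(tokens, n)]

print(bigram_edit_distance('a = b + c', 'a = b - c'))  # 2: two of the four bigrams differ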
def combined_caption_score(res):
    """Takes the per-caption scores and returns a collective score
    for the image. For each image it accumulates:
        * Total length of all the captions (L)
        * Total number of unique words (W)
    and returns the sum of pairwise edit distances between the
    image's five captions.

    Arguments:
        res -- dict mapping each image to its per-caption stats
    """
    final_caption_score = {}
    for img, per_caption_stats in res.items():
        W, L, i = 0, 0, 0
        captions = []  # all the captions for a given image
        for c, (caption, (num_uniq_words,
                          total_len)) in per_caption_stats.items():
            W += num_uniq_words
            L += total_len
            captions.append(caption)
            i += 1
        assert i == 5
        edit_dist = 0
        for i in range(5):
            for j in range(i + 1, 5):
                edit_dist += nltk.edit_distance(captions[i], captions[j])
        final_caption_score[img] = edit_dist
    return final_caption_score
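
A small usage sketch showing the expected input shape (a hypothetical example, with five captions per image):

import nltk

# Assumed shape: {image_id: {caption_id: (caption, (num_unique_words, caption_length))}}
res = {'img1': {c: (f'a cat sits on the mat {c}', (7, 23)) for c in range(5)}}
print(combined_caption_score(res))  # {'img1': 10}: each of the 10 caption pairs differs by one character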
Example #27
    def calculate(self):
        keywords1 = self.split(self.key1)
        keywords2 = self.split(self.key2)

        similarity_constant = 4

        minimum = 1 / math.e
        maximum = 1

        score_list = []

        for i in keywords1:
            for j in keywords2:
                x = len(i)
                y = len(j)
                distance = nltk.edit_distance(i, j)  # note: identical keywords give distance 0 and would divide by zero below

                if ((x + y) / similarity_constant) >= distance:
                    raw_score = 2 / (math.e**(x / distance) +
                                     (math.e**(y / distance)))
                    scaled_score = (raw_score - minimum) / (maximum - minimum)
                    score_list.append(scaled_score)
        if score_list:
            return max(score_list)
        else:
            return None
Example #28
def guess_word(prediction, id_to_labels):
    dictionary = [line.strip() for line in open("dictionary.txt", 'r')]
    similar = []
    with open('similar.txt') as f:
        for line in f:
            temp = [x.strip() for x in line.split(',')]
            similar.append(temp)
    new_prediction = []
    i = 0
    while i < len(prediction):
        if i < len(prediction)-1:
            for line in similar:
                if id_to_labels[prediction[i]] in line:
                    if id_to_labels[prediction[i+1]] in line:
                        new_prediction.append(prediction[i])
                        prediction.remove(prediction[i+1])
                        break
        else:
            new_prediction.append(prediction[i])
        i += 1
    output = ''
    for p in prediction:
        output += id_to_labels[p]
    output = output.lower()
    print(output)
    possibilities = []
    for words in dictionary:
        if len(words) == len(output):
            if nltk.edit_distance(words, output) == 1:
                possibilities.append(words)
    print(possibilities)
def query_list_of_words(target_word, list_of_words, edit_distance=1):
    """
    Checks whether a target word is within editing distance of any one in a set of keywords.

    Inputs: - target_word: A string containing the word we want to search in a list.
            - list_of_words: A python list of words.
            - edit_distance: For larger words, we also check for similar words based on edit_distance.

    Outputs: - new_list_of_words: This is the input list of words minus any found keywords.
             - found_list_of_words: This is the list of words that are within edit distance of the target word.
    """
    # Initialize lists
    new_list_of_words = list()
    found_list_of_words = list()

    append_left_keyword = new_list_of_words.append
    append_found_keyword = found_list_of_words.append

    # Iterate over the list of words
    for word in list_of_words:
        if len(word) > 6:
            effective_edit_distance = edit_distance
        else:
            effective_edit_distance = 0  # No edit distance for small words.
        if abs(len(word)-len(target_word)) <= effective_edit_distance:
            if nltk.edit_distance(word, target_word) <= effective_edit_distance:
                append_found_keyword(word)
            else:
                append_left_keyword(word)
        else:
            append_left_keyword(word)

    return new_list_of_words, found_list_of_words
Example #30
def canFind(text, item_data):

	word_list = re.split(' ', text)
	item_list = re.split(' ', item_data)

	wl_cnt = len(word_list)
	il_cnt = len(item_list)

	res_list = []

	for i in range(wl_cnt - il_cnt + 1):
		isOK = True
		for j in range(il_cnt):
			if nltk.edit_distance(word_list[i+j].lower(), item_list[j].lower()) > 1:
				isOK = False
				break

		if isOK == True:
			res = word_list[i]
			for j in range(il_cnt - 1):
				res = res + " " + word_list[i + 1 + j]

			res_list.append(res)


	return res_list
Example #31
def find_nearest(input, datalist): #Uses nltk to find the closest match in a list datalist
    indices = []
    for string in datalist:
        indices.append(nltk.edit_distance(input, string))
    closest_index = indices.index(min(indices))
    closest = datalist[closest_index]
    return closest #Returns closest word from the list
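
A quick usage sketch of the nearest-match helper above:

import nltk

print(find_nearest('appel', ['apple', 'banana', 'grape']))  # 'apple' (edit distance 2)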
Example #32
def incomplete_pred(words, n):
    all_succeeding = bgs_freq[(words[n-2])].most_common()
    #print (all_succeeding, file=sys.stderr)
    preds = []
    number=0
    for pred in all_succeeding:
        if pred[0].startswith(words[n-1]):
            appendwithcheck(preds, pred)
            number+=1
        if number==3:
            return preds
    if len(preds)<3:
        med=[]
        for pred in all_succeeding:
            med.append((pred[0], nltk.edit_distance(pred[0],words[n-1], transpositions=True)))
        med.sort(key=lambda x:x[1])
        index=0
        while len(preds)<3:
            print (index, len(med))
            if index<len(med):
                if med[index][1]>0:
                    appendwithcheck(preds, med[index])
                index+=1
            if index>=len(preds):
                return preds

    return preds
Example #33
def error_rate(string, grammar):
    """
    Calculate the Word Error Rate of a grammar from a string

    :param string: The
    :param grammar:
    :return:
    """
    # TODO implement this
    return nltk.edit_distance(string, grammar)
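
As written, nltk.edit_distance over two raw strings counts character-level edits. One possible completion of the TODO (an assumption, not the original author's implementation) is a token-level word error rate normalised by the reference length:

import nltk

def word_error_rate(reference, hypothesis):
    # Sketch: edit distance over token lists, divided by the reference length.
    ref_tokens = reference.split()
    hyp_tokens = hypothesis.split()
    return nltk.edit_distance(ref_tokens, hyp_tokens) / max(len(ref_tokens), 1)

print(word_error_rate('the cat sat on the mat', 'the cat sat on mat'))  # 1 deletion / 6 words ≈ 0.17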
    def get_teamname(self, teamname):
        try:
            return kp_to_kag[teamname]
        except KeyError:
            try:
                return bd_to_kag[teamname]
            except KeyError:
                name, val = '', 100
                for team in self.teams_kaggle.name:
                    newval = edit_distance(team, teamname)
                    if newval < val:
                        name = team
                        val = newval
                return name
Example #35
    def correct_spelling(self, query):
        """Correct spelling.

        Corrects all spelling errors in `query` with Google's algorithm.

        Args:
            query: Query to correct.
        """
        try:
            wait = ui.WebDriverWait(self._driver, self._timeout)
            self._driver.get(self._url)
            wait.until(
                lambda driver: driver.find_elements_by_xpath(
                    "/html/body/center/form/table/tbody/" "tr/td[2]/span[1]/span/input"
                )
            )
            logger.debug("Request done. Back on page: {}".format(self._driver.current_url))

            # Set waiting handler for AJAX request.
            wait = ui.WebDriverWait(self._driver, self._timeout)
            input_element = self._driver.find_element_by_name("q")
            # Input query into search box.
            input_element.send_keys(query)
            input_element.submit()
            logger.info("Submitting query: {}".format(query))
            wait.until(lambda driver: driver.find_elements_by_xpath("//*[@id='resultStats']"))
            logger.debug("Response loaded. Now on page: {}".format(self._driver.current_url))
        except Exception as t:
            logger.error(t)
            self.__reset_driver()
            return None
        # Get suggestion field.
        field = self._driver.find_elements_by_xpath("//*[@id='_FQd']/div/a")

        if 0 < len(field):
            suggested_text = str(field[0].text)
            logger.debug("Did you mean encountered. Suggested query: {}".format(suggested_text))
        else:
            suggested_text = query
            logger.debug("No suggestion.")

        # Google messed things up.
        if self._max_edit_dist < nltk.edit_distance(suggested_text, query):
            logger.warn(
                "Suggested text beyond edit distance threshold"
                "of {}. Returning original query.".format(self._max_edit_dist)
            )
            suggested_text = query
        logger.info("Checker returns: {}".format(suggested_text))
        return suggested_text
Example #36
def levenshtein(first, second, transpositions=False):
    """
    Return a similarity ratio of two pieces of text. 0 means the strings are not similar at all,
    1.0 means they're identical. This is the Levenshtein ratio:
      (lensum - ldist) / lensum
    where lensum is the sum of the length of the two strings and ldist is the
    Levenshtein distance (edit distance).
    See https://groups.google.com/forum/#!topic/nltk-users/u94RFDWbGyw
    """
    lensum = len(first) + len(second)
    ldist = nltk.edit_distance(first, second, transpositions=transpositions)

    if lensum == 0:
        return 0

    return (lensum - ldist) / lensum
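
For instance, enabling transpositions counts the swapped pair in 'abdc' as a single edit, which raises the ratio:

import nltk

print(levenshtein('abcd', 'abdc', transpositions=True))   # (8 - 1) / 8 = 0.875
print(levenshtein('abcd', 'abdc'))                        # (8 - 2) / 8 = 0.75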
def simple_word_query(target_word, list_of_words, edit_distance=1):
    found_list_of_words = list()
    append_found_keyword = found_list_of_words.append

    for word in list_of_words:
        if len(word) > 6:
            effective_edit_distance = edit_distance
        else:
            effective_edit_distance = 0  # No edit distance for small words.
        if abs(len(word)-len(target_word)) <= effective_edit_distance:
            if nltk.edit_distance(word, target_word) <= effective_edit_distance:
                append_found_keyword(word)
            else:
                pass
        else:
            pass

    return found_list_of_words
def include_spell_mistake(word, similar_word, score):
    """
    Check if similar word passes some rules to be considered a spelling mistake
    
    Rules:
       1. Similarity score should be greater than a threshold
       2. Length of the word with spelling error should be greater than 3.
       3. spelling mistake must occur at least some N times in the corpus
       4. Must not be a correct English word.
       5. First character of both correct spelling and wrong spelling should be same.
       6. Edit distance is at most 1 for words of length <= 4, otherwise at most 2.
    """
    edit_distance_threshold = 1 if len(word) <= 4 else 2
    return (score > fasttext_min_similarity
            and len(similar_word) > 3
            and vocab[similar_word] >= spell_mistake_min_frequency
            and not enchant_us.check(similar_word)
            and word[0] == similar_word[0]
            and nltk.edit_distance(word, similar_word) <= edit_distance_threshold)
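
The function depends on several module-level names that are not in this excerpt; the sketch below uses hypothetical stand-ins for them just to exercise the rules:

import collections
import enchant  # pyenchant
import nltk

# Hypothetical stand-ins; the real values are configured elsewhere in the project.
fasttext_min_similarity = 0.80
spell_mistake_min_frequency = 5
enchant_us = enchant.Dict('en_US')
vocab = collections.Counter({'recieve': 12})

print(include_spell_mistake('receive', 'recieve', score=0.93))  # True under these stand-ins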
Example #39
    def similarity(self, other):
        """
        Return a similarity ratio of two quotes. 0 means the strings are not similar at all,
        1.0 means they're identical. This is the Levenshtein ratio:

          (lensum - ldist) / lensum

        where lensum is the sum of the length of the two strings and ldist is the
        Levenshtein distance (edit distance).

        See https://groups.google.com/forum/#!topic/nltk-users/u94RFDWbGyw
        """
        lensum = len(self.quote) + len(other.quote)
        ldist = nltk.edit_distance(self.quote, other.quote)

        if lensum == 0:
            return 0

        return (lensum - ldist) / lensum
Example #40
    def suggest(self, word):
        if word in self._valid:
            return word
        if word in self._invalid:
            return None
        if word in self._suggested:
            return self._suggested[word]

        if self._spelling.check(word):
            self._valid.add(word)
            return word

        if self._maxdist > 0:
            suggestions = self._spelling.suggest(word)
            if suggestions and nltk.edit_distance(word, suggestions[0]) <= self._maxdist:
                self._suggested[word] = suggestions[0]
                return suggestions[0]
            else:
                self._invalid.add(word)
                return None
        else:
            self._invalid.add(word)
def compare_tokenset(l,r):
    if l == r:
        return True
    else:
        return False
    #if the names sound similar, return true
    #if 'Byung' in l:
    #    pdb.set_trace()
    #(l, r) = remove_similar_sounds(l,r)
    length = len(l)
    #if 'VAZIRANIz' in l:
    #    pdb.set_trace()
    for i in range(length):
        try:
            if (len(l[i]) == 1 or len(r[i])==1) and (l[i][0] == r[i][0]):
                continue
        except:
            pdb.set_trace()
        if (len(l[i]) > 1 and len(r[i]) > 1) and (nltk.edit_distance(l[i],r[i]) <= 2) and len(l[i])>4:
            continue
        else:
            return False
    return True
# User-variables begin
textQuery = 'querySequence.txt'
# User-variables end

# Get queries in a list
queryList = []
fd = open(textQuery, 'r')
for line in fd:
	queryList.append(urllib.unquote(line.strip()))

# What is the edit (Levenshtein) distance between queries when in order?
orderedDistance = []
for i in range(len(queryList)):
	try:
		orderedDistance.append(nltk.edit_distance(queryList[i],queryList[i+1]))
	except IndexError:
		continue

print 'Ordered average distance ', numpy.average(orderedDistance), 'std deviation', numpy.std(orderedDistance)

# Re-run simulation multiple times
averageRandDist = []
for i in range(100):
	# Set the seed to different values
	numpy.random.seed(i)

	# What is the edit distance between queries taken at random?
	numpy.random.shuffle(queryList)
	shuffledDistance = []
	for i in range(len(queryList)):
#!/usr/bin/python

import nltk, os, numpy

# Compare Levenshtein distance and rank changes
# Date: 27 February 2012

previousQuery = ''; previousUserID = ''
previousRank = ''
fd = os.popen("awk -F '\\t' '{if ($2 ~ / / && length($4)) print $1\"\\t\"$2\"\\t\"$4}' user-ct-test-collection-01.txt | sort | uniq")
for line in fd:
	LevDistance = []; queryDist = 0
	lineWords = line.split('\t')[1:-1]
	if (len(previousQuery) and (line.split('\t')[0] == previousUserID)):
		queryDist = nltk.edit_distance(' '.join(lineWords),' '.join(previousQuery))
	else:
		previousUserID = line.split('\t')[0]
	previousQuery = lineWords

	# Has the query change resulted in clicked page rank change?
	if (len(previousRank) and (previousRank != line.split('\t')[-1])):
		print line.split('\t')[0],queryDist,1
	elif (len(previousRank) and (previousRank == line.split('\t')[-1])):
		print line.split('\t')[0],queryDist,0
	previousRank = line.split('\t')[-1]
Example #44
#coding:utf8

from libjade import *
from nltk import edit_distance
from string import ascii_letters

if __name__=='__main__':
    ss=fread('test.txt').split();
    s=ss[0]
    time_init()
    for i in xrange(1, len(ss)):
    # for i in xrange(1, 10):
        t=ss[i]
        # print edit_distance(s, t)
        edit_distance(s, t)
    print time_gap('finished')
    
    # l=list(ascii_letters)
    # ss=[]
    # for i in xrange(100000):
        # shuffle(l)
        # s=''.join(l)
        # ss.append(s)
    # fwrite('\n'.join(ss), 'test.txt')
    
    
    
    
    
    
    
#!/usr/bin/python

import nltk, os, numpy

# Calc Levenshtein distances
# Date: 24 February 2012
# Author: Evgeniy

previousQuery = ""
fd = open("userID_query", "r")
for line in fd:
    LevDistance = []
    queryDist = 0
    # Exclude the userID
    lineWords = line.split()[1:]
    for i in range(len(lineWords)):
        try:
            LevDistance.append(nltk.edit_distance(lineWords[i], lineWords[i + 1]))
        except IndexError:
            continue
    if len(previousQuery):
        queryDist = nltk.edit_distance(" ".join(line.split()[1:]), previousQuery)
    else:
        queryDist = 0
    previousQuery = " ".join(line.split()[1:-1])
    # userID, mean, std, min, max, queryDist
    print line.split()[0], numpy.mean(LevDistance), numpy.std(LevDistance), numpy.min(LevDistance), numpy.max(LevDistance), queryDist
Example #46
def preprocess_query(var, doc_idx, metadata, cleanmetadata):
	'''
	This method is used to check for statistical queries like df, freq, tf,
	title, author, biblio, text and similar terms. It then strips off these terms
	from the search query and passes the main query to classify_query.
	Essentially the main function of the project.
	'''
	final_out = {}
	
	if 'df ' in var:
		var = var.replace('df ', '')
		final_out = classify_query(var, doc_idx)
		print "\nDocument Frequency of " + var + " : " + str(len(final_out.keys()))
	
	elif 'freq ' in var:
		var = var.replace('freq ', '')
		final_out = classify_query(var, doc_idx)
		print "\nFrequency of " + var + " : " + str(sum(final_out.values()))
	
	elif 'tf ' in var:
		var = var.replace('tf ', '')
		doc_num = re.findall(r'\d+\s', var)
		var = re.sub(r'\d+\s', "", var)
		doc_num = int(doc_num[0])
		final_out = classify_query(var, doc_idx)
		print "\nTerm Frequency of " + var + " : " + str(final_out[doc_num])
		
	elif 'title ' in var:
		var = var.replace('title ', '')
		doc_num = re.findall(r'\d+', var)
		print doc_num
		var = re.sub(r'\d+', "", var)
		print "\nDocument Title: " + metadata[doc_num[0]][0]

	elif 'author ' in var:
		var = var.replace('author ', '')
		doc_num = re.findall(r'\d+', var)
		var = re.sub(r'\d+', "", var)
		print "\nDocument Author: " + metadata[doc_num[0]][1]

	elif 'bib ' in var:
		var = var.replace('bib ', '')
		doc_num = re.findall(r'\d+', var)
		var = re.sub(r'\d+', "", var)
		print "\nDocument Biblio: " + metadata[doc_num[0]][2]
		
	elif 'doc ' in var:
		var = var.replace('doc ', '')
		doc_num = re.findall(r'\d+', var)
		var = re.sub(r'\d+', "", var)
		print "\nDocument Text: " + metadata[doc_num[0]][3]
		
	elif 'similar ' in var:
		var = var.replace('similar ', '')
		similar_words ={}
		
		for i in metadata.values():
			for k in i[3].split():
				if k.strip(punctuation):
					k = k.strip(punctuation)
				similarity = nltk.edit_distance(k, var)
				if similarity < 3:
					similar_words[k] = similarity
		print "\nWords similar to " + var + ": "
		sort_scores = sorted(similar_words.iteritems(), key=operator.itemgetter(1))
		unique_similar = set()
		for (i,j) in sort_scores:
			unique_similar.add(i)
		print unique_similar
		print len(unique_similar)
	else:
# 		print "No stat query"
# 		print "var"
		final_out = classify_query(var, doc_idx)
	
	
	if final_out != {}:
		print_format(var, final_out, index_data, metadata, cleanmetadata)
	
	return final_out
Example #47
def is_similar_to(str1, str2):
	avg_len = (len(str1) + len(str2)) / 2
	return nltk.edit_distance(str1, str2) <= avg_len / 5
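
For example, 'organization' and 'organisation' have an average length of 12 and an edit distance of 1, so 1 <= 12 / 5 and the strings count as similar:

import nltk

print(is_similar_to('organization', 'organisation'))  # True
print(is_similar_to('cat', 'dog'))                    # False: 3 > 3 / 5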
	# Tokenize the text line
	tokens = nltk.word_tokenize(line.strip())
	# Ignore empty line
	if (not len(tokens)): continue
	
	# Remove stopwords from tokens
	tokens = [w for w in tokens if w.lower() not in stopwords]
	# Remove any non-letters from tokens
	tokens = [re.sub('\W+','',n) for n in tokens]
	# Remove empty and non-words from tokens
	tokensRefined = []
	for t in tokens:
		if (len(t.strip()) == 0):
			continue
		elif (len(t.strip()) <= 3):
			continue
		else:
			tokensRefined.append(t)
	# Ignore empty tokensRefined
	if (not len(tokensRefined)): continue

	# Stem the words
	stems = [porter.stem(t) for t in tokensRefined]
	# Print token, difference with stem and position in text
	for i in range(len(tokensRefined)):
		globalTextPosition += 1
		editDistance = nltk.edit_distance(tokensRefined[i], stems[i])
		print tokensRefined[i], editDistance, globalTextPosition


Example #49
import edit_distance
import edit_distance2
import nltk
from random import shuffle
from string import ascii_letters
from libjade import *  # provides time_init / time_gap (see Example #44)

if __name__=='__main__':
    l=list(ascii_letters[:10])
    s=''
    for i in xrange(200):
        shuffle(l)
        s+=''.join(l)
    t=''
    for i in xrange(100):
        shuffle(l)
        t+=''.join(l)
    
    n=20
    print len(s), len(t)
    
    time_init()
    for i in xrange(n):
        b=edit_distance.edit_distance(s, t)
    print time_gap('edit_distance.edit_distance')
    
    for i in xrange(n):
        a=edit_distance2.edit_distance(s, t)
    print time_gap('edit_distance2.edit_distance')
    
    for i in xrange(n):
        a=nltk.edit_distance(s, t)
    print time_gap('nltk.edit_distance')
    
Example #50
def get(folder, host, user, password, database, incremental_ind):

    def id_generator(size=25, chars=string.ascii_lowercase + string.digits):
        return ''.join(random.choice(chars) for _ in range(size))

    punctuation = "( + ) [ ? : ! . ; ] * # % ` ' / _ = -".split()
    punctuation.append('"')

    ###SETUP MAJOR VARS

    fdmain = folder+ "/location_disambiguation/"
    #need to figure out what this is

    #separate first(0) and incremental(1) disambiguations
    incremental = incremental_ind

    # Step 1
    mydb = MySQLdb.connect(host, user, password, database)
    cursor = mydb.cursor()

    if incremental == 0:
        increm = ''
    else:
        increm = ' AND (location_id is NULL or location_id = "")'   

    print "Step 1..."

    cursor.execute('select distinct country_transformed from rawlocation where country_transformed is not NULL and country_transformed != "" and country_transformed!="s" and country_transformed!="B." and country_transformed!="omitted" '+increm)


    countries = [item[0] for item in cursor.fetchall() if item[0] is not None]
    print countries

    
    os.makedirs(fdmain)
    os.makedirs(fdmain+'uspto_disamb/')
    os.makedirs(fdmain+'uspto_disamb_counts/')
    os.makedirs(fdmain+'uspto_disamb_v2/')
    os.makedirs(fdmain+'uspto_disamb_loc_latlong/')
    os.makedirs(fdmain+'uspto_disamb_only_loc/')


    for c in countries: 
        print c
        datum = {}
        output = open(fdmain+'uspto_disamb/'+c+'.tsv','wb')
        output2 = open(fdmain+'uspto_disamb_counts/'+c+'.tsv','wb')
        outp = csv.writer(output,delimiter='\t')

        outp2 = csv.writer(output2,delimiter='\t')
        cursor.execute("select city,state,country_transformed,count(city) from rawlocation where country_transformed = '"+c+"'"+increm+"  group by city,state order by count(city) desc")
        outp2.writerows(cursor.fetchall())
        cursor.execute('select distinct state from rawlocation where country_transformed = "'+c+'"'+increm)
        states = [f[0] for f in cursor.fetchall()]
        for s in states:
            if str(s) == 'None' or str(s)=='NULL':
                cursor.execute('select id,city from rawlocation where country_transformed = "'+c+'" and (state is NULL or state="NULL")'+increm)
                s = ''
            else:
                s = re.sub('[\n\t\f\r]+','',s.strip())
                cursor.execute('select id,city from rawlocation where country_transformed = "'+c+'" and state ="'+s+'"'+increm)
            locs = [list(f) for f in cursor.fetchall()]
            for l in locs:
                ll = []
                for l1 in l:
                    if l1:
                        ll.append(re.sub('[\n\t\r\f]+','',l1.strip()))
                    else:
                        ll.append('')
                outp.writerow(ll+[s,c])
        output.close()
        output2.close()
        
    print "Step 2..."
    fd = fdmain+'uspto_disamb_counts/'
    diri = os.listdir(fd)

    mastdata = {}
    mastdatum = {}
    for d in diri:
        #this is separate from the forloop below because otherwise places that are in the wrong file break it
        mastdata[d.replace('.tsv','')] = {}
        mastdatum[d.replace('.tsv','')] = {}
    for d in diri:
        input = open(fd+d,'rb')
        inp = csv.reader(input,delimiter='\t')
        try:
            head = inp.next()
            top = int(head[-1])
        except:
            pass
        num = 1
        for i in inp:
            num+=1
        inp = csv.reader(file(fd+d),delimiter='\t')
        for e,i in enumerate(inp):
            if e<=int(num/3) and int(i[-1])>int(top/5):
                city = unidecode(i[0])
                for p in punctuation:
                    city = city.replace(p,'')
                city = re.sub('[0-9]+','',city)
                city = re.sub('^\s+','',city)
                city = re.sub('\s+$','',city)
                city = city.replace(' ','')
                state = i[1]
                state = re.sub('^\s+','',state)
                state = re.sub('\s+$','',state)
                country = i[2]
                key = id_generator(size=12)
                try:
                    gg = mastdata[country][city.lower()+'_'+state.lower()]
                except:
                    #print len(mastdata[country])
                    mastdata[country][city.lower()+'_'+state.lower()] = [key,i[0].strip(),i[1].strip(),i[2],int(i[3])]
                    mastdatum[country][city.lower()] = [key,i[0],i[1].strip(),i[2].strip(),int(i[3])]

        input.close()

    print "Step 3..."
    # Step 3
    fd = fdmain+'uspto_disamb/'
    diri = os.listdir(fd)
    for d in diri:
        output = open(fdmain+'uspto_disamb_v2/'+d,'wb')
        input = open(fd+d,'rb')
        outp = csv.writer(output,delimiter='\t')
        inp = csv.reader(input,delimiter='\t')
        data = mastdata[d.replace('.tsv','')]
        datum = mastdatum[d.replace(".tsv",'')]
        secdata = {}
        secdatum = {}
        for i in inp:
            city = unidecode(i[1])
            state = i[2]
            country = i[3]
            for p in punctuation:
                city = city.replace(p,'')
            city = re.sub('[0-9]+','',city)
            city = re.sub('^\s+','',city)
            city = re.sub('\s+$','',city)
            origcity = city
            city = city.replace(' ','')
                
            try:
                gg = data[city.lower()+'_'+state.lower()]
                outp.writerow(i+gg)
            except:
                try:
                    cit = city.lower().split(",")[0]
                    gg = data[cit.lower()+'_'+state.lower()]
                    
                    outp.writerow(i+gg)
                except:
                    try:
                        cit = city.lower().split("/")
                        for cc in cit:
                            gg = data[cc.lower()+'_'+state.lower()]
                            outp.writerow(i+gg)
                            break
                    except:
                        try:
                            cit = city.lower().split("-")
                            for cc in cit:
                                gg = data[cc.lower()+'_'+state.lower()]
                                outp.writerow(i+gg)
                                break
                        except:
                            try:
                                cit = city.lower().split("&")[0]
                                gg = data[cit.lower()+'_'+state.lower()]
                            
                                outp.writerow(i+gg)
                            except:
                                try:
                                    gg = datum[city.lower()]
                        
                                    outp.writerow(i+gg)
                                except:
                                    try:
                                        
                                        howdy = 0
                                        
                                        for k,v in data.items():
                                                dist = jaro.jaro_winkler_metric((city.lower()+'_'+state.lower()).decode('utf-8','ignore'),k.decode('utf-8','ignore'))
                                                edit = nltk.edit_distance(city.lower()+'_'+state.lower(),k)
                                                if (re.search(k.split("_")[0],city.lower()) and k.split("_")[0]!='') or dist >= 0.95 or (edit==2 and len(city.lower())>5):
                                                    outp.writerow(i+v)
                                                    howdy = 1
                                                    break
                                            
                                        gg = datum[city]
                                    except:
                                        if howdy == 0:
                                            cit = [cc for cc in origcity.lower().split(" ") if len(cc) > 4]
                                            
                                            howdy2 = 0
                                            for cc in cit:
                                                try:
                                                    gg = datum[cc]
                                                    
                                                    outp.writerow(i+gg)
                                                    howdy2 = 1
                                                    break
                                                except:
                                                    pass
                                            
                                            if howdy2 == 0:
                                                try:
                                                    gg = secdata[city.lower()+'_'+state.lower()]
                                                    outp.writerow(i+gg)
                                                except:
                                                    try:
                                                        cit = city.lower().split(",")[0]
                                                        gg = secdata[cit.lower()+'_'+state.lower()]
                                                        outp.writerow(i+gg)
                                                    except:
                                                        try:
                                                            cit = city.lower().split("&")[0]
                                                            gg = secdata[cit.lower()+'_'+state.lower()]
                                                            outp.writerow(i+gg)
                                                        except:
                                                            try:
                                                                gg = secdatum[city.lower()]
                                                                outp.writerow(i+gg)
                                                            except:
                                                                try:
                                                                    howdy = 0
                                                                    gg = datum[city]
                                                                except:
                                                                    if howdy == 0:
                                                                        cit = [cc for cc in origcity.lower().split(" ") if len(cc) > 4]
                                                                        howdy2 = 0
                                                                        for cc in cit:
                                                                            try:
                                                                                gg = secdatum[cc]
                                                                                outp.writerow(i+gg)
                                                                                howdy2 = 1
                                                                                break
                                                                            except:
                                                                                pass
                                                                        if howdy2 == 0:
                                                                            key = id_generator(size=12)
                                                                            secdata[city.lower()+'_'+state.lower()] = [key,i[1],i[2],i[3]]
                                                                            secdatum[city.lower()] = [key,i[1],i[2],i[3]]
                                                                            outp.writerow(i+[key,i[1],i[2],i[3]])
        input.close()
        output.close()
                                    
    print "Step 4..."
    #Step 4
    fd = fdmain+'uspto_disamb_v2/'
    fd3 = fdmain+'uspto_disamb_only_loc/'
    diri = os.listdir(fd)

    for d in diri:
        input = open(fd+d,'rb')
        output = open(fd3+d,'wb')
        inp = csv.reader(input,delimiter='\t')
        outp2 = csv.writer(output,delimiter='\t')
        data = {}
        final = {}
        disamb = {}
        for i in inp:
            try:
                gg = data[' '.join(i[5:])]
                final[i[0]] = i[:4]+[gg]+i[5:]
            except:
                try:
                    data[' '.join(i[5:])] = i[4]
                    final[i[0]] = i
                    disamb[i[4]] = i[4:]
                except:
                    print d,i
        input.close()
        for k,v in disamb.items():
            if len(v) == 5:
                v = v[:-1]
            outp2.writerow(v)
        output.close()

    #exit()
    print "Done Step 1 - 4"
#!/usr/bin/python

import os, nltk
from nltk.corpus import stopwords

porter = nltk.PorterStemmer()
stopwords = nltk.corpus.stopwords.words('english')
fd = os.popen("awk -F '\t' '{if ($2 ~ / /) print $2}' user-ct-test-collection-01.txt | sort | uniq")
for line in fd:
	# Stem the queries
	tokens = nltk.word_tokenize(line)
	tokens = [w for w in tokens if w.lower() not in stopwords]
	indexWords = [porter.stem(t) for t in tokens]
	# How many chars are extra in the queries
	charSaving = nltk.edit_distance(' '.join(indexWords),line)
	print len(line), charSaving
                                i = i+1
                                t_pos_num=t_pos_num+1
                                if i == len(phrase):
                                    tf = tf+1

            print tf
            exit(0)

    elif split_words[0]=="similar":
        term = split_words[1]
        sim_dic ={}
        length = len(term)
        for i in range(1,len(uncategorized_wordlist)):
            for word in uncategorized_wordlist[i]:
                if word not in sim_dic:
                    sim = nltk.edit_distance(term,word)
                    if sim!=0 and sim<length/2:
                        print word
                        sim_dic[word] = sim
                
        exit(0)
                    
            
            


    phrase_list,not_list,not_phrase_list,plain_list= query_extraction(query)

    #print phrase_list
    #print not_list
    #print not_phrase_list
Exemple #53
0
def build_dataset(data_set, home_dir, dir, unique_spam, spam_top_50):
    """Build dataset
    Data set should have the fields
    1.) IP Address from the received field in the header
    2.) Matching degree of domain names between Message-Id and Received/From
    3.) Subject 
    4.) Name from the From field 
    5.) Content type
    6.) Attachments: none, text, or non-text 
    7.) Number of URLs present 
    8.) URL ratio 
    9.) SPAM word ratio 
    10.) SPAM degree as by equation in paper 
    11.) Classification label: Spam or Ham """
    
    file_list = os.listdir(home_dir + dir)
    count = len(file_list)
    for file_name in file_list:
        #Ignore files that start with . as well as directories and symlinks
        if file_name[0] == '.' or os.path.isdir(home_dir + dir + '/' + file_name) or \
            os.path.islink(home_dir + dir + '/' + file_name):
            continue
        data_set[file_name] = DataMember()
        #print file_name    
        file = open(home_dir + dir + '/' + file_name)
        mail = email.message_from_file(file)
        file.close()
        
        #Extract information from header
        for key in mail.keys():
            #1.) IP Address from the received field in the header (Easy just read it)
            #Get the IP address of the last Received from field unless it's 127.0.0.1
            if key == 'Received':
                address = re.search('(\d{1,3}\.){3}\d{1,3}',mail[key]).group()
                if address != '127.0.0.1':
                    data_set[file_name].ip_address_str += address + ' '
                        
            #3.) Subject (Easy just read it)          
            if key == 'Subject':
                data_set[file_name].subject_str = repr(mail[key])[1:-1]
                
            #4.) Name from the From field (Easy just read it)
            if key == 'From':
                data_set[file_name].from_name_str = repr(mail[key])[1:-1]
        
        #2.) Matching degree of domain names between Message-Id and (Received/From ??) field (Easy just read and compare)
        if mail['From'] != None:
            from_domain = re.search('@[\[\]\w+\.]+', mail['From'])
        else:
            from_domain = None
        if str(from_domain) != 'None':
            from_domain = from_domain.group()[1:]
        else:
            #Non-ascii domain name, pull out the hex encoding
            from_domain = repr(mail['From']).replace('\\x','')
            if from_domain.find('@') == -1:
                from_domain = ' '
            else:
                from_domain = re.search('@[\[\]\w+\.]+', from_domain).group()[1:]
        message_domain = re.search('@[\[\]\w+\.]+',mail['Message-ID'])
        if str(message_domain) != 'None':
            message_domain = message_domain.group()[1:]
        else:
            #Non-ascii domain name, pull out the hex encoding
            message_domain = repr(mail['Message-ID']).replace('\\x','').replace('%','')
            if message_domain.find('@') == -1:
                message_domain = ' '
            else:
                message_domain = re.search('@[\[\]\w+\.]+', message_domain).group()[1:]
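        #Matching degree = 1 - edit_distance/len(longer domain): 1.0 means the
        #From and Message-ID domains are identical, 0.0 means completely different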
                
        distance = nltk.edit_distance(from_domain, message_domain)
        domain_len = max(len(from_domain), len(message_domain), 1) * 1.0
        
        data_set[file_name].degree_domains_match = 1.0 - distance / domain_len
                
        #Get the length of the message and the text
        length = (get_message_len(mail) * 1.0)
        body = get_message_body(mail)

        #5.) Content type (Easy just read it)
        data_set[file_name].type_HTML = get_type_content(mail)
        #6.) Attachments: none, text, or non-text 
        data_set[file_name].attachments = get_type_attachments(mail)
        #7.) Number of URLs present 
        urls = re.findall( \
                    'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', \
                    body)
        data_set[file_name].num_urls = len(urls)
        
        #8.) URL ratio (% of message body that is URLs)
        data_set[file_name].percent_urls = len(''.join(urls)) / length
        
        #9.) SPAM word ratio 
        #10.) SPAM degree as by equation in paper 
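        #degree_spam = w1*s1 + w2*s2: s1 is the share of body tokens that are
        #top-50 spam words, s2 is 1 if any word seen only in spam appears,
        #and the weights favour the frequency signal 50:1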
        spam_count = 0
        w1 = 50 / 51.0
        w2 = 1 / 51.0
        freq_spam = 0.0
        s1 = 0.0
        s2 = 0
        
        body = nltk.clean_html(body)
        words = nltk.word_tokenize(body)
        word_count = max(1, len(words)) #Don't allow divide by zero
        for word in nltk.word_tokenize(body):
            if word in unique_spam:
                #Must be SPAM
                s2 = 1
                spam_count += 1
            elif word in spam_top_50:
                freq_spam += 1.0
                spam_count += 1
                        
        s1 = freq_spam / word_count
        
        data_set[file_name].percent_spam = spam_count / length
        data_set[file_name].degree_spam = w1 * s1 + w2 * s2
        
        #11.) Classification label: Spam or Ham
        if file_name.startswith('ham'):
            data_set[file_name].spam = 1
        else:
            data_set[file_name].spam = 2
        #Fields that need to be md5 encoded are: IP address, Subject, and from        
        ip_address_md5 = hashlib.md5()
        ip_address_md5.update(data_set[file_name].ip_address_str)
        data_set[file_name].ip_address = int(ip_address_md5.hexdigest(),16)
        
        subject_md5 = hashlib.md5()
        subject_md5.update(data_set[file_name].subject_str)
        data_set[file_name].subject = int(subject_md5.hexdigest(),16) 
        
        from_name_md5 = hashlib.md5()
        from_name_md5.update(data_set[file_name].from_name_str)
        data_set[file_name].from_name = int(from_name_md5.hexdigest(),16) 

    #for key in data_set.keys():
    #    print data_set[key]
    return data_set
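
build_dataset above stores its per-message features in DataMember objects, but the class itself is not part of this snippet. A minimal sketch of a container with the attributes the function actually assigns (field names come from the code above; the defaults are assumptions):

class DataMember(object):
    """Per-message feature container used by build_dataset (sketch, defaults assumed)."""
    def __init__(self):
        self.ip_address_str = ''         # raw IP addresses from the Received headers
        self.subject_str = ''            # raw Subject header
        self.from_name_str = ''          # raw From header
        self.degree_domains_match = 0.0  # 1 - edit_distance/len between From and Message-ID domains
        self.type_HTML = 0               # content type flag
        self.attachments = 0             # none, text, or non-text attachments
        self.num_urls = 0                # number of URLs in the body
        self.percent_urls = 0.0          # URL characters / message length
        self.percent_spam = 0.0          # spam words / message length
        self.degree_spam = 0.0           # w1*s1 + w2*s2 spam degree
        self.spam = 0                    # classification label: 1 = ham, 2 = spam
        self.ip_address = 0              # md5 of ip_address_str as an integer
        self.subject = 0                 # md5 of subject_str as an integer
        self.from_name = 0               # md5 of from_name_str as an integer
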
print 'Average query length:', len(queryWords)/float(len(queryList))

# Get unique query words
uniqueQueryWords = list(set(queryWords))
print 'Unique queries:', len(list(set(queryList)))
print 'Unique words in queries:', len(uniqueQueryWords)

# What is the percent of repetitions?
print 'Percent of query reuse:', 100 - len(list(set(queryList)))/float(len(queryList))*100
print 'Percent of word reuse:', 100 - len(uniqueQueryWords)/float(len(queryWords))*100

# Extracting sessions of queries
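# First collect the edit distance between each query and the next; the mean of these distances is used below as the session-break threshold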
orderedDistance = []
for i in range(len(queryList)):
	try:
		orderedDistance.append(nltk.edit_distance(queryList[i],queryList[i+1]))
	except IndexError:
		continue

# The average distance serves for separating 'query sessions'
sessionSearched = defaultdict(list)
sessionID = 1
averageQueryDistance = numpy.average(orderedDistance)
for i in range(len(queryList)):
	try:
		# Queries whose edit distance to the next query is at most the average belong to the same session
		# NOTE: due to the implementation each session contains only unique queries; duplicate input queries are lost
		if (nltk.edit_distance(queryList[i],queryList[i+1]) <= averageQueryDistance):		
			# Do not add the same query to the session twice
			try:
				sessionSearched[sessionID].index(queryList[i])
Exemple #55
0
    def correct(self,word):
        candidates = self.known([word]) or self.known(self.edits1(word)) or self.knownEdits2(word) or [word]
        sugg=list(candidates)
        sugg.sort(key = lambda s: nltk.edit_distance(word,s))

        return sugg[:min(len(sugg),10)]
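
The known, edits1 and knownEdits2 helpers used by correct are not shown in this example. A minimal Norvig-style sketch of how such helpers are commonly written, assuming the dictionary is a plain set of lowercase words (class name and attributes here are assumptions, not the original code):

import string

class SimpleSpellHelpers(object):
    def __init__(self, dictionary_words):
        self.words = set(w.lower() for w in dictionary_words)

    def known(self, candidates):
        # Keep only candidates that are actual dictionary words
        return set(w for w in candidates if w in self.words)

    def edits1(self, word):
        # All strings one insertion, deletion, substitution or transposition away
        letters = string.ascii_lowercase
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [a + b[1:] for a, b in splits if b]
        transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
        replaces = [a + c + b[1:] for a, b in splits if b for c in letters]
        inserts = [a + c + b for a, b in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def knownEdits2(self, word):
        # Dictionary words that are exactly two edits away
        return set(e2 for e1 in self.edits1(word)
                   for e2 in self.edits1(e1) if e2 in self.words)

Mixed into the corrector above, correct('helo') would then rank dictionary candidates such as 'hello' or 'help' by nltk.edit_distance and return at most ten of them.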