Example #1
def strip_bogus_lines(arg_lines):
    re_blank = re.compile(r'^\s$')
    re_page_left = re.compile(r'^\s*Page [0-9]+')
    re_page_right = re.compile(r'Page [0-9]+\s*$')
    berk_string = 'BERKELEY TRAINING ASSOCIATES © 2009\n'
    mft_string = 'MFT PRACTICE EXAMINATIONS'

    lines = []
    for line in arg_lines:
        bogosity = 0.0
        if re_blank.search(line):
            bogosity += 1.0
        if re_page_left.search(line):
            bogosity += 0.5
        if re_page_right.search(line):
            bogosity += 0.5
        l = Levenshtein.distance(line[-(len(mft_string)):], mft_string)
        if l < 5:
            bogosity += (5 - l) / 5.0
        l = Levenshtein.distance(line[:len(mft_string)], mft_string)
        if l < 5:
            bogosity += (5 - l) / 5.0
        l = Levenshtein.distance(line, berk_string)
        if l < 5:
            bogosity += (5 - l) / 5.0
        if bogosity < 0.25:
            lines.append(line)
    return lines
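
Example #1 above assumes re and Levenshtein are already imported and that the page header and footer strings are fixed. A minimal, self-contained sketch of the same idea (flagging a line as boilerplate when its edit distance to a known footer string is small) might look like this; the sample lines are invented:

import Levenshtein

FOOTER = 'BERKELEY TRAINING ASSOCIATES © 2009\n'

def looks_like_footer(line, max_dist=4):
    # A small edit distance to the known footer marks the line as boilerplate.
    return Levenshtein.distance(line, FOOTER) <= max_dist

lines = ['What is the therapist required to do?\n',
         'BERKELEY TRAINING ASSOCIATES (c) 2009\n']
print([looks_like_footer(l) for l in lines])  # [False, True]
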
Example #2
def write_lex_stats(b, num, syll=None):
    """Use Levenshtein package to calculate lev and count up mps, neighbors, etc."""
    total = 0.
    mps = 0
    neighbors = 0
    homophones = 0
    lev_total = 0
    for item in itertools.combinations(b, 2):
        if syll != None:
            #if len(item[0].split("-"))==syll or len(item[1].split("-"))==syll:
            lev = Levenshtein.distance(re.sub("-", "", item[0]), re.sub("-", "", item[1]))
            if lev == 0: homophones += 1
            elif lev == 1:
                neighbors += 1
                if len(re.sub("-", "", item[0])) == len(re.sub("-", "", item[1])): mps += 1
            total += 1
            lev_total += lev
        else:
            lev = Levenshtein.distance(re.sub("-", "", item[0]), re.sub("-", "", item[1]))
            if lev == 0: homophones += 1
            elif lev == 1:
                neighbors += 1
                if len(re.sub("-", "", item[0])) == len(re.sub("-", "", item[1])): mps += 1
            total += 1
            lev_total += lev
    print str(num)
    f.write(",".join([str(x) for x in [num, homophones, mps, neighbors, lev_total/total, len(b)]]) + "\n")
    return
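
As a rough, self-contained illustration of the counting logic in write_lex_stats (homophones at distance 0, neighbors at distance 1), here is a sketch over a toy word list; the syllable filter and the output file handle f are left out:

import itertools
import Levenshtein

words = ['cat', 'cot', 'cat', 'dog']
homophones = neighbors = 0
for a, b in itertools.combinations(words, 2):
    lev = Levenshtein.distance(a, b)
    if lev == 0:
        homophones += 1
    elif lev == 1:
        neighbors += 1
print(homophones, neighbors)  # 1 2
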
    def search(self, keywords, cutoff=0.3):
        """Search through all songs in self.songs.
        Determines all songs being matched by the supplied keywords.
        Returns a list of tuples of the form (song, matchratio), where matchratio goes from <cutoff> to 1.0;
        1.0 being a perfect match. The result is sorted by that value, highest match ratios first."""

        num_keywords = len(keywords)
        results = []
        for song in self.songs.values():
            # search in title and gametitle
            haystack1 = set(song.title.lower().split())
            haystack2 = set(song.game.title.lower().split())
            ratio = 0
            for keyword in keywords:
                keyword = keyword.lower()
                # determine best keyword match
                subratio1 = max(Levenshtein.ratio(keyword, word) for word in haystack1)
                subratio2 = max(Levenshtein.ratio(keyword, word) for word in haystack2)
                subratio = max(subratio1,subratio2*0.8)
                if subratio > 0.7:
                    # assume low ratios are no match
                    ratio += subratio
            ratio /= num_keywords
            
            if ratio > cutoff:
                # random cutoff value
                results.append((song, ratio))
            
        return sorted(results, key=lambda s: s[1], reverse=True)
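
The per-keyword scoring in search reduces to taking the best Levenshtein.ratio against every word of a title. A stripped-down sketch of that step, with made-up titles:

import Levenshtein

def best_word_ratio(keyword, title):
    # Best similarity between the keyword and any single word of the title.
    return max(Levenshtein.ratio(keyword.lower(), w) for w in title.lower().split())

for title in ['Green Hill Zone', 'Chemical Plant Zone']:
    print(title, round(best_word_ratio('chemcial', title), 2))
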
Example #4
def predictionRatio(df, metric="Levenshtein"):
    #Generate all possible combinations for string matching
    soc_media_1, soc_media_2 = df.columns
    # Convert everything to lower case
    df[soc_media_1] = df[soc_media_1].str.lower()
    df[soc_media_2] = df[soc_media_2].str.lower()

    df_known = DataFrame([df[soc_media_1].tolist()] * df.shape[0], index=df.index, columns=df.index)
    df_search = DataFrame([df[soc_media_2].tolist()] * df.shape[0], index=df.index, columns=df.index)
    df_known_list = df_known.applymap(lambda x: list([x]))
    df_search_list = df_search.applymap(lambda x: list([x]))
    df_search_list = df_known_list+df_search_list.T

    # Find the indices of columns for each row  based on metric
    # For Levenshtein get the min., for JaroWinkler get the max.
    if metric == 'Levenshtein':
        search_res = df_search_list.applymap(lambda x: Levenshtein.distance(x[0], x[1]))
        indices = search_res.idxmin(axis=1)
    else:
        search_res = df_search_list.applymap(lambda x: Levenshtein.jaro_winkler(x[0], x[1]))
        indices = search_res.idxmax(axis=1)
    
    # Get the matches for social media account
    match = df[soc_media_2].loc[indices]
    df_t = DataFrame()
    df_t['actual'] = df[soc_media_2].reset_index(drop=True)
    df_t['match'] = match.reset_index(drop=True)
    # Find the ratio of correct matches
    match_count = (df_t.actual == df_t.match).value_counts()
    ratio = float(match_count[True]) / (match_count[True] + match_count[False])
    return ratio
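
Stripped of the pandas scaffolding, the core of predictionRatio is an argmin over pairwise string distances (or an argmax over Jaro-Winkler scores). A plain-Python sketch of that inner matching step, using invented account handles:

import Levenshtein

known = ['jsmith', 'adoe', 'mbrown']
candidates = ['j.smith', 'anna_doe', 'm_brown89']

for name in known:
    # Pick the candidate with the smallest edit distance to the known handle.
    best = min(candidates, key=lambda c: Levenshtein.distance(name, c))
    print(name, '->', best)
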
    def compare_list(self, company_name, table_name):
        iac_hz_list = [
            [0, 1, 2, 3, 4, 5, 6]
        ]
        iac_guanwang_list = [
            [1, 1, 2, 3, 4, 5, 6]
        ]

        if iac_hz_list is None:
            print "haizhi system lacks %s's %s." % (company_name, table_name)
            return
        if iac_guanwang_list is None:
            print "guanwang lacks %s's %s." % (company_name, table_name)
            return

        hz_list = [str(x).replace("\n", "").replace("\r", "") for x in iac_hz_list]
        guanwang_list = [str(x).replace("\n", "").replace("\r", "") for x in iac_guanwang_list]

        if len(iac_hz_list) == len(iac_guanwang_list):
            print "the length of list %s is the same: %d" % (table_name, len(iac_hz_list))
        else:
            print "the length of list %s differs: %d, %d" % (table_name, len(iac_hz_list), len(iac_guanwang_list))

        # number of identical elements
        num = 0
        for row in hz_list:
            if row in guanwang_list:
                # guanwang_list.remove(row)
                # hz_list.remove(row)
                num += 1
                continue
            else:
                for row2 in guanwang_list:
                    Levenshtein.distance(row, row2)
Example #6
def check_clusters(seq_list, all_clusters, cutoff, unique):
    print('Checking for unique input sequences')
    seqs = {}
    for seq in seq_list:
        seqs[seq[0]] = seqs.get(seq[0], 0) + 1

    # Sequences in seq_list are unique
    if unique:            
        for k, v in seqs.items():
            if v != 1:
                print('Error: sequence %s appears in seq_list more than once.' % k)
            
    # There is a one-to-one correspondence between sequences in seq_list and sequences in all_clusters
    print('Checking for one-to-one correspondence between input and output sequences.')
    for cluster, max_len, min_len in all_clusters:
        for seq, id in cluster:
            if seq not in seqs:
                print('Error: sequence %s is in all_clusters but not in seq_list.' % seq)
            else:
                seqs[seq] += 1

    for k, v in seqs.items():
        if v < 2:
            print('Error: sequence %s appears in seq_list but not in all_clusters.' % k)
        elif unique and v > 2:
            print('Error: sequence %s appears in all_clusters more than once.' % k)
            
    # The cluster forms a connected network with each sequence in a cluster having a nearest neighbour within the cutoff distance
    print('Checking cluster membership.')
    t0 = time.time()
    i = 0
    for cluster, max_len, min_len in all_clusters:
        # push each cluster through get_clusters and check it results in a single cluster
        # one could argue that this isn't strictly an independent check, but the underlying algorithm is in scipy
        # this does check that merging across chunks has happened correctly
        if len(cluster) > 1:
            res = get_clusters(cluster, cutoff)
            if len(res) != 1:
                print('Error: cluster with sequence %s (id %s) is partitioned into %d clusters by further application of get_cluster.' % (cluster[0][0], cluster[0][1], len(res)))

        i += 1
        t1 = time.time()
        if t1 - t0 > 10:
            print('Checking cluster %d\n' % i)
            t0 = time.time()

    # No clusters are mergeable
    print('Checking that clusters are distinct.')
    for i in range(len(all_clusters)):
        c1 = all_clusters[i][0]
        for c2, max_len, min_len in all_clusters[i+1:]:
            for s1, i1 in c1:
                for s2, i2 in c2:
                    cut = int(cutoff * min(len(s1), len(s2)))
                    if hamming:
                        if len(s1) == len(s2) and ld.hamming(s1, s2) <= cut:
                            print('Error: sequences %s (id %s) and %s (id %s) are in different clusters but are within the cutoff distance.' % (s1, i1, s2, i2))
                    else:
                        if ld.distance(s1, s2, cut) <= cut:
                            print('Error: sequences %s (id %s) and %s (id %s) are in different clusters but are within the cutoff distance.' % (s1, i1, s2, i2))
Example #7
def byLevenshtein(key, result_yield):
    lang = "zha"
    try:
        str(key).encode('iso-8859-1')
    except UnicodeEncodeError:
        lang = "zh"
    result_list2d = []
    if lang == "zha":
        for i in result_yield:
            result_list2d.append([Levenshtein.distance(key, i[0]), i])
    else:
        for i in result_yield:
            for j in i[1]:
                list_tmp = split(r"[\[\]\(\)\ \;\,\。\,\.]", j)
                list_distance = []
                if len(list_tmp) == 2:  # 2 means this entry contains only 1 word
                    distance = Levenshtein.distance(key, list_tmp[1])
                    if distance == 0:
                        list_distance.append(-1)  # -1 means the best matched one
                    else:
                        list_distance.append(distance)
                        continue
                for tmp in list_tmp:
                    if key in tmp:
                        list_distance.append(Levenshtein.distance(key, tmp))
                result_list2d.append([min(list_distance), i])
                # The method above is not so accurate,but it might work better than the previous one
    result_list2d.sort()
    for i in result_list2d:
        yield i[1]
    def similarTerms(self, target):
        the_same = []
        counter = 0
        with codecs.open(termfile, 'rb', encoding='utf-8') as tf:
            list_of_t = tf.readlines()
            for item in list_of_t:
                item = item.strip('\n')
                if item != target:
                    if self.if_compoTerm(target):
                        List_target = self.splitTerms(target)
                        for t in List_target:
                            if item.find(t) != -1:
                                if item not in the_same:
                                    dist = Levenshtein.distance(item, target)
                                    print("the dist:", dist)
                                    if item != target:
                                        the_same.append(item)
                            if Levenshtein.ratio(t, item) >= 0.8:
                                if item not in the_same:
                                    if re.fullmatch(item, target):
                                        the_same.append(item)
                #print("the ratio is ",the_ratio)
        #print("is",the_same)
        return the_same
Example #9
def splitted_word_distance( pattern, text ):
    #remove trailing chars...
    words = text.split()
    lp = len(pattern)
    lw = len(words)
    if lp < lw:
        d = len(''.join(words[lp:]))
        n = lp
    else: 
        if re_type in map( lambda e: type(e), pattern[lw:] ):
            return float("inf")
        d = len(''.join(   pattern[lw:]   ))
        n = lw
    d0 = [ d ]
    for i in range(n):
        word = words[i]
        p = pattern[i]
        if type( p ) == str:
            dist = levenshtein.distance( p, word )
            d += dist
            d0.append( dist )
        elif type( p ) == re_type:
            if not p.fullmatch( word ):
                return float("inf")
        else:
            raise Exception( "Pattern element has wrong type %s" % (str(type(p))) )
    return d
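
splitted_word_distance evidently relies on module-level names such as re_type (the compiled-pattern type) and a lowercase levenshtein alias for the Levenshtein module. Under those assumptions, a possible driver looks like this; the pattern mixes a literal word, scored by edit distance, with a regex that must fully match or the distance becomes infinite:

import re
import Levenshtein as levenshtein

re_type = type(re.compile(''))  # compiled-pattern type assumed by the helper

pattern = ['release', re.compile(r'v\d+(\.\d+)*')]
print(splitted_word_distance(pattern, 'realease v1.2'))  # 1 (one typo in "release")
print(splitted_word_distance(pattern, 'release build'))  # inf ("build" fails the regex)
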
Example #10
def identify_anchor_kmer_in_reference_graph(reference_graph, kmer_to_anchor, leftmost=None, rightmost=None, path_length=None):
	"""

	:type reference_graph: nx.DiGraph
	"""
	toposort = {v: k for k, v in enumerate(nx.topological_sort(reference_graph))}
	# print "Righmost is ",rightmost,toposort[rightmost]
	nodes_to_consider = reference_graph.nodes()
	if rightmost:
		idx = toposort[rightmost]
		nodes_to_consider = ifilter(lambda x: toposort[x] <= idx, nodes_to_consider)
	# print "Max is ", idx
	if leftmost:
		idx = toposort[leftmost]
		nodes_to_consider = ifilter(lambda x: toposort[x] >= idx, nodes_to_consider)
	# print "Min is ", idx
	nodes_to_consider = list(nodes_to_consider)

	node_dists = [(node, Levenshtein.distance(node, kmer_to_anchor), Levenshtein.editops(node, kmer_to_anchor)) for node in
				  nodes_to_consider]
	# print "Will search anchor in ",list(node_dists)
	min_dist = min(node_dists, key=itemgetter(1))[1]
	node_dists = [x for x in node_dists if x[1] == min_dist]
	print "Min possible dist is", min_dist
	if rightmost:
		score_func = lambda x: (x[1] - min_dist) + abs(toposort[x[0]] - (toposort[rightmost] - path_length))
	elif leftmost:
		score_func = lambda x: (x[1] - min_dist) + abs(toposort[x[0]] - (toposort[leftmost] + path_length))
	dist_sorted = sorted(node_dists, key=score_func)
	# identify the rightmost node with minimal distance
	return dist_sorted[0][0]
    def get_closest_match(self,
                          cells,
                          matching_threshold,
                          suppress_non_answer_cells=False):
        """ Returns a list of cells that most closely match
            the question prompt.  If no match is better than
            the matching_threshold, the empty list will be
            returned. """
        return_value = []
        distances = [Levenshtein.distance(self.start_md, u''.join(cell['source']))
                     for cell in cells]
        if min(distances) > matching_threshold:
            return return_value

        best_match = argmin(distances)
        if self.stop_md == u"next_cell":
            end_offset = 2
        elif len(self.stop_md) == 0:
            end_offset = len(cells) - best_match
        else:
            distances = [Levenshtein.distance(self.stop_md, u''.join(cell['source']))
                         for cell in cells[best_match:]]
            if min(distances) > matching_threshold:
                return return_value
            end_offset = argmin(distances)
        if len(self.question_heading) != 0 and not suppress_non_answer_cells:
            return_value.append(NotebookExtractor.markdown_heading_cell(self.question_heading, 2))
        if not suppress_non_answer_cells:
            return_value.append(cells[best_match])
        return_value.extend(cells[best_match + 1:best_match + end_offset])
        return return_value
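
The lookup in get_closest_match boils down to numpy.argmin over a list of edit distances to the cells' sources. A compact, self-contained version of that step, with invented cell contents:

import Levenshtein
from numpy import argmin

target = 'Load the dataset'
cells = ['Load teh dataset', 'Fit the model', 'Plot the results']

distances = [Levenshtein.distance(target, c) for c in cells]
print(argmin(distances), min(distances))  # 0 2
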
Example #12
    def _match_user_agent(cls, user_agent):
        device = cls.objects.filter(user_agent=user_agent).order_by("-actual_device_root")[:1]

        if len(device):
            return device[0]
        else:
            if settings.UA_PREFIX_MATCHING:
                # Try more flexible matching, 1 third of the UA string
                ds_user_agent = user_agent[: len(user_agent) // 3]
                devices = cls.objects.filter(user_agent__startswith=ds_user_agent)
                devices = devices.order_by("-actual_device_root")[: settings.UA_PREFIX_MATCHING_LIMIT]

                if len(devices):
                    user_agent = force_unicode(user_agent)
                    best = reduce(
                        lambda x, y: Levenshtein.distance(user_agent, x.user_agent)
                        < Levenshtein.distance(user_agent, y.user_agent)
                        and x
                        or y,
                        devices,
                    )

                    if Levenshtein.distance(user_agent, best.user_agent) <= settings.UA_PREFIX_MATCHING_MAX_DISTANCE:
                        return best

            if settings.UA_GENERIC_FALLBACK:
                # Try to match with generic properties
                # :TODO:
                raise NotImplementedError

        raise NoMatch, "Can't find a match in currently installed WURFL table for user_agent `%s`" % user_agent
Example #13
def findMatchScore(searchName, foundName) :
	if(type(searchName) is unicode):
		searchName = unicodedata.normalize('NFKD', searchName).encode('ascii','ignore')
	if(type(foundName) is unicode):
		foundName = unicodedata.normalize('NFKD', foundName).encode('ascii','ignore')
	bigR = 0
	inputWords = searchName.replace(':', ' ').split(' ')
	foundWords = foundName.replace(':', ' ').split(' ')
	inputWords = removeSkipWords(inputWords)
	foundWords = removeSkipWords(foundWords)
	for iWord in inputWords:
		maxRatio = 0
		for fWord in foundWords:
			r = Levenshtein.ratio(iWord.lower().replace("/'s", ''), fWord.lower().replace("/'s", ''))
			if r > maxRatio:
				maxRatio = r
		bigR += maxRatio
	bigR2 = 0 # if the input has MORE words than the solution (rare)
	for fWord in foundWords:
		maxRatio = 0
		for iWord in inputWords:
			r = Levenshtein.ratio(iWord.lower().replace("/'s", ''), fWord.lower().replace("/'s", ''))
			if r > maxRatio:
				maxRatio = r
		bigR2 += maxRatio
	bigR /= len(inputWords)
	bigR2 /= len(foundWords)
	return max(bigR, bigR2)
Example #14
                def find_by_distance():
                    word_nl = getNormalWord(word).lower()
                    min_dist = 10
                    min_dist_word_idx = wordIdx
                    import Levenshtein
                    for i in range(0, self.mTotalEntries):
                        word_ = getNormalWord(self.getWord(i)).lower()
                        dist = Levenshtein.distance(word_nl, word_)
                        if dist == 1:
                            return i
                        if dist < min_dist:
                            min_dist = dist
                            min_dist_word_idx = i
                    for i in range(0, self.derived_dict.mTotalEntries):
                        word_ = getNormalWord(self.derived_dict.getWord(i)).lower()
                        dist = Levenshtein.distance(word_nl, word_)
                        if dist == 1:
                            words = self.derived_dict.getExplanations(word_)
                            return self.getWordIdxInternal(words[0])
                        if dist < min_dist:
                            min_dist = dist
                            words = self.derived_dict.getExplanations(word_)
                            min_dist_word_idx = self.getWordIdxInternal(words[0])

                    return min_dist_word_idx
Example #15
def read_type(left_read, right_read, left_enzsite, right_enzsite, left_bc, right_bc):
    """Determine if bisulfite read is watson or crick"""
    lr_enz_left = left_read[1][len(left_bc):len(left_bc)+5]
    rr_enz_right = right_read[1][len(right_bc):len(right_bc)+5]
    if left_enzsite == 'TACAA' and right_enzsite == 'TGCAG':
        return 'crick'
    elif right_enzsite == 'TACAA' and left_enzsite == 'TGCAG':
        return 'watson'
    elif right_enzsite == 'TGCAG' and left_enzsite == 'TGCAG':
        return 'gbs'
    else:
        #enzyme sites have not been established correctly, establish read
        #type based on closest matching enz site and CG count.
        watson_count = left_read[1].count('G') + right_read[1].count('C') +0.001
        crick_count = left_read[1].count('C') + right_read[1].count('G') +0.001
        left_distance = Levenshtein.distance(lr_enz_left, left_enzsite)
        right_distance = Levenshtein.distance(rr_enz_right, right_enzsite)
        if left_distance < right_distance:
            #left enz_site should be leading since it has fewer mismatches.
            if left_enzsite == 'TACAA' and crick_count/float(watson_count)>2:
                return 'crick'
            else:
                return 'nodet'
        else:
            if left_enzsite == 'TGCAG' and watson_count/float(crick_count)>2:
                return 'watson'
            else:
                return 'nodet'
Example #16
 def identify_similar_teams(cls):
   matches = []
   final = []
   teams = Team.objects.all()
   for team_primary in teams:
     p_school = team_primary.team_code[:-3]
     for team_sec in teams:
       s_school = team_sec.team_code[:-3]
       if (p_school != s_school):
         if Levenshtein.ratio(team_primary.team_code, team_sec.team_code) > 0.50:
           matches.append((team_primary, team_sec))
   for t1, t2 in matches:
     if Levenshtein.ratio(t1.team_name, t2.team_name) > .60:
       data = {}
       data["team1"] = TeamDataFetch.process_teams(TeamSerializer(Team.objects.get(id=t1.id)).data)
       data["team2"] = TeamDataFetch.process_teams(TeamSerializer(Team.objects.get(id=t2.id)).data)
       data["score"] = Levenshtein.ratio(t1.team_name, t2.team_name) + Levenshtein.ratio(t1.team_code, t2.team_code)
       final.append(data)
   processed_data = []
   for obj in final:
     if not processed_data:
       processed_data.append(obj)
     elif (obj not in processed_data) and (not SimilarTeams.double_count_present(obj, processed_data)):
       processed_data.append(obj)
   return processed_data
Example #17
    def _match_user_agent(cls, user_agent):
        device = cls.objects.filter(user_agent=user_agent).order_by('-actual_device_root')[:1]

        if len(device):
            return device[0]
        else:
            if settings.UA_PREFIX_MATCHING:
                #~ Try more flexible matching, from 1/3rd to 1/10th of the original UA string
                #~ We break out as soon as we get a match (or matches, in which case we use Levenshtein 
                #~ distance to determine which one we want to use) or if the shortened UA string is less
                #~ than 5 characters long
                devices = None
                for factor in range(3,10):
                    if len(user_agent)/factor <= 5: break
                    devices = cls._match_partial_user_agent(user_agent,factor)
                    if len(devices): break
                
                if len(devices):
                    user_agent = force_unicode(user_agent)
                    best = reduce(
                        lambda x,y: Levenshtein.distance(user_agent, x.user_agent) < Levenshtein.distance(user_agent, y.user_agent) and x or y,
                        devices,
                    )

                    if Levenshtein.distance(user_agent, best.user_agent) <= settings.UA_PREFIX_MATCHING_MAX_DISTANCE:
                        return best
            
            if settings.UA_GENERIC_FALLBACK:
                # Try to match with generic properties
                # :TODO:
                raise NotImplementedError, 'Generic properties matching is not implemented'

        raise NoMatch, "Can't find a match in currently installed WURFL table for user_agent `%s`" % user_agent
Example #18
 def prettyprint(self):
     print "Timestamp:     " + self.data["timeseed"]
     print "Expected Data: " + self.data["expected_data"]
     print "PSK31 Data:    " + self.data["psk_data"]
     print "PSK31 Jaro Dist: " + str(Levenshtein.jaro(self.data["expected_data"],self.data["psk_data"]))
     print "DOMEX8 Data:   " + self.data["domex_data"]
     print "DOMEX Jaro Dist: " + str(Levenshtein.jaro(self.data["expected_data"],self.data["domex_data"]))
    def getScoredMatches(self, word, possibilities, num, score, context=None):
        """ Take a word,
            compare it to a list of possibilities,
            return at most num matches with a ratio > score.
        """
        self._checkPermission(context)
        if not USE_LEVENSHTEIN:
            # No levenshtein module around. Fall back to difflib
            return difflib.get_close_matches(word, possibilities, num, score)

        # Levenshtein is around, so let's use it.
        res = []

        # Search for all similar terms in possibilities
        if isinstance(word, str):
            oword = unicode(word, 'utf-8')
        else:
            oword = word.encode('utf-8')

        for item in possibilities:
            if isinstance(item, type(word)):
                lscore = Levenshtein.ratio(word, item)
            elif isinstance(item, type(oword)):
                lscore = Levenshtein.ratio(oword, item)
            else:
                raise ValueError, "%s is not a normal, or unicode string" % item
            if lscore > score:
                res.append((item, lscore))

        # Sort by score (high scores on top of list)
        res.sort(lambda x, y: -cmp(x[1], y[1]))

        # Return first n terms without scores
        return [item[0] for item in res[:num]]
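
getScoredMatches falls back to difflib.get_close_matches when USE_LEVENSHTEIN is false and otherwise scores candidates with Levenshtein.ratio. A small sketch of both paths side by side, on an invented word list:

import difflib
import Levenshtein

word = 'recieve'
possibilities = ['receive', 'recipe', 'relieve', 'remove']

# difflib path
print(difflib.get_close_matches(word, possibilities, n=2, cutoff=0.6))

# Levenshtein path: same idea, scored with ratio() and sorted high to low
scored = sorted(((p, Levenshtein.ratio(word, p)) for p in possibilities),
                key=lambda t: t[1], reverse=True)
print([p for p, s in scored if s > 0.6][:2])
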
Example #20
def GetSuggestionsFromCorpus(options, corpus, numSuggestions = 5, MAXDIST = 4, MINRATIO = 0.4):
  # if len(corpus) == 0:
  #   return GetSuggestions(options)

  suggs = []
  for word in corpus: 
    count = corpus[word]
    mindist = 999
    maxratio = 0
    for option in options:
      dist = Levenshtein.distance(word, option)  # Edit Distance
      ratio = Levenshtein.ratio(word, option)  # Similarity ratio
      if mindist > dist:
        mindist = dist
      if maxratio < ratio:
        maxratio = ratio
    if mindist > MAXDIST or maxratio < MINRATIO: 
      continue
    if mindist == 0:  # do not want same ones
      continue

    score = 1.0 / mindist * math.log(int(count) + 1.0)
    suggs.append((score, word, mindist, count, maxratio))

  suggs.sort(reverse=True)
  return suggs[:numSuggestions]
  def SuggestCand(words):
    ret = []
    total_edits = 0
    total_err_ratio = 0.0
    iscorrect = True
    for word in words:
      if en_dict.check(word) or corpus_dict.check(word):
        ret.append(word)
      else:
        iscorrect = False
        sugg_list = corpus_dict.suggest(word)
        if len(sugg_list) == 0: 
          return None, 0, 0.
        # sorted_suggs = sorted(sugg_list, key=lambda sw: Levenshtein.ratio(word, sw), reverse=True)
        best_sugg = max(sugg_list, key=lambda sw: Levenshtein.ratio(word, sw))
        for w in best_sugg.split(' '):
          ret.append(w)
        total_edits += Levenshtein.distance(word, best_sugg)
        total_err_ratio += 1.0 - Levenshtein.ratio(word, best_sugg)
    if iscorrect: 
      return None, 0, 0.

    if total_err_ratio > RATIO_THRESHOLD: return None, 0, 0.

    return ret, total_edits, total_err_ratio
def get_all_scores_before_after_no_change_avg(specificity,length_of_no_change, graph=False):
    avgs = []
    for new,old in find_patterns_of_no_change(length_of_no_change,specificity).iteritems():
        inew=new[0]
        vnew=new[1]
        iold = old[0]
        vold = old[1]
        all_indices = [get_future_mappings(vnew-length_of_no_change,x,length_of_no_change*2) for x,y in enumerate(current_paratexts[vnew])]
        
        for k, indices in enumerate(all_indices):
            if indices:
                before = 0
                after = 0
                for offset, paraindex in enumerate(indices):
                    if offset < len(indices)-1:
                        if offset < length_of_no_change:
                            before += (Levenshtein.ratio(current_paratexts[vnew-length_of_no_change+offset][paraindex], current_paratexts[vnew-length_of_no_change+offset+1][indices[offset+1]]))
                        else:
                            after += (Levenshtein.ratio(current_paratexts[vnew-length_of_no_change+offset][paraindex], current_paratexts[vnew-length_of_no_change+offset+1][indices[offset+1]]))
                before = before/length_of_no_change
                after = after/length_of_no_change
                avgs.append(before-after)
    if graph:
        plt.hist(avgs, bins=21)
        plt.show()
    
    return avgs
def get_neighbor_scores_before_after_no_change_avg(specificity,length_of_no_change, graph=False):
    avgs = []
    for new,old in find_patterns_of_no_change(length_of_no_change,specificity).iteritems():
        inew=new[0]
        vnew=new[1]
        iold = old[0]
        vold = old[1]
        back1 = get_backward_mapping(vnew,inew+1, length_of_no_change)
        back2 = get_backward_mapping(vnew,inew-1, length_of_no_change)
        all_indices = [get_future_mappings(vnew-length_of_no_change,back1,length_of_no_change*2), get_future_mappings(vnew-length_of_no_change,back2,length_of_no_change*2)]            
        for indices in all_indices:
            if indices:
                before = 0
                after = 0
                for offset, paraindex in enumerate(indices):
                    if offset < len(indices)-1:
                        if offset < length_of_no_change:
                            before += (Levenshtein.ratio(current_paratexts[vnew-length_of_no_change+offset][paraindex], current_paratexts[vnew-length_of_no_change+offset+1][indices[offset+1]]))
                        else:
                            after += (Levenshtein.ratio(current_paratexts[vnew-length_of_no_change+offset][paraindex], current_paratexts[vnew-length_of_no_change+offset+1][indices[offset+1]]))
                before = before/length_of_no_change
                after = after/length_of_no_change
                avgs.append(before-after)

    return avgs
Example #24
def decompose_multiple_alterations(reference_path, alternative_path, kmer_length):
	reference_sequence = ALT.kmerpathToSeq(reference_path, kmer_length)
	multi_alternative_sequence = ALT.kmerpathToSeq(alternative_path, kmer_length)

	edit_ops = Levenshtein.editops(reference_sequence, multi_alternative_sequence)
	if len(edit_ops) > 2:
		logger.info("Multiple alt when considering ref %s vs alt %s", reference_sequence, multi_alternative_sequence)
		logger.info("Globally apply %s", edit_ops)
	start, end = 0, 0
	while start < len(edit_ops):
		if edit_ops[start][0] == 'replace':
			atomic_sequence = Levenshtein.apply_edit([edit_ops[start]], reference_sequence, multi_alternative_sequence)
			# print atomic_sequence
			atomic_path = ALT.kmerize(atomic_sequence, kmer_length)
			start += 1
		else:
			start_e = edit_ops[start]
			end = start + 1
			while (end < len(edit_ops)
				   and edit_ops[end][0] == start_e[0]
				   and (start_e[1] == edit_ops[end][1] or start_e[2] == edit_ops[end][2])):
				end += 1
			edit_op_to_apply = edit_ops[start:end]
			start = end
			logger.info("Will apply %s", edit_op_to_apply)
			atomic_sequence = Levenshtein.apply_edit(edit_op_to_apply, reference_sequence, multi_alternative_sequence)
			atomic_path = ALT.kmerize(atomic_sequence, kmer_length)
		# record each atomic alteration
		logger.info("Adding atomic alteration for ref %s vs alt %s", reference_sequence, atomic_sequence)
		yield atomic_sequence, atomic_path
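
Besides editops, decompose_multiple_alterations leans on Levenshtein.apply_edit, which replays a subset of edit operations against the source string. A toy run, independent of the k-mer graph code, shows both calls:

import Levenshtein

ref = 'ACGTACGT'
alt = 'ACCTACGTT'

ops = Levenshtein.editops(ref, alt)
print(ops)  # e.g. a replace at position 2 and an insert at the end

# Apply only the first operation to obtain one intermediate ("atomic") sequence.
print(Levenshtein.apply_edit([ops[0]], ref, alt))
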
def janalysis(read):
  
  hold_j = j_key.findall(read)
  
  if hold_j:
    if len(hold_j) > 1:
      counts['multiple_j_matches'] += 1
      return
  
    j_match = j_seqs.index(hold_j[0][0]) # Assigns J
    temp_start_j = hold_j[0][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
    
    j_seq_end = hold_j[0][1] + len(hold_j[0][0])      
        
    start_j_j_dels = get_j_deletions( read, j_match, temp_start_j, j_regions )
    
    if start_j_j_dels: # If the number of deletions has been found
      return j_match, start_j_j_dels[0], start_j_j_dels[1], j_seq_end
          
  else:
    
    hold_j1 = half1_j_key.findall(read)
    if hold_j1:
      for i in range(len(hold_j1)):
        indices = [y for y, x in enumerate(half1_j_seqs) if x == hold_j1[i][0] ]
        for k in indices:
          if len(j_seqs[k]) == len(read[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[half1_j_seqs.index(hold_j1[i][0])])]):
            if lev.hamming( j_seqs[k], read[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[k])] ) <= 1:
              counts['jerr2'] += 1
              j_match = k
              temp_start_j = hold_j1[i][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
              j_seq_end = hold_j1[i][1] + len(hold_j1[i][0]) + j_half_split                                              
              start_j_j_dels = get_j_deletions( read, j_match, temp_start_j, j_regions )
              if start_j_j_dels:
                return j_match, start_j_j_dels[0], start_j_j_dels[1], j_seq_end
      counts['foundj1notj2'] += 1
      return              
            
    else:        
      hold_j2 = half2_j_key.findall(read)
      if hold_j2:
        for i in range(len(hold_j2)):
          indices = [y for y, x in enumerate(half2_j_seqs) if x == hold_j2[i][0] ]
          for k in indices:
            if len(j_seqs[k]) == len(read[hold_j2[i][1]-j_half_split:hold_j2[i][1]-j_half_split+len(j_seqs[half2_j_seqs.index(hold_j2[i][0])])]):
              if lev.hamming( j_seqs[k], read[hold_j2[i][1]-j_half_split:hold_j2[i][1]+len(j_seqs[k])-j_half_split] ) <= 1:
                counts['jerr1'] += 1
                j_match = k
                temp_start_j = hold_j2[i][1] - jump_to_start_j[j_match] - j_half_split # Finds where the start of a full J would be
                j_seq_end = hold_j2[i][1] + len(hold_j2[i][0])                                                
                start_j_j_dels = get_j_deletions( read, j_match, temp_start_j, j_regions )
                if start_j_j_dels:
                  return j_match, start_j_j_dels[0], start_j_j_dels[1], j_seq_end
        counts['foundj2notj1'] += 1
        return
      
      else:
         counts['no_j_assigned'] += 1
         return
def vanalysis(read):

  hold_v = v_key.findall(read)
  
  if hold_v:
    if len(hold_v) > 1:
      counts['multiple_v_matches'] += 1
      return

    v_match = v_seqs.index(hold_v[0][0]) # Assigns V
    temp_end_v = hold_v[0][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
    
    v_seq_start = hold_v[0][1]      
    end_v_v_dels = get_v_deletions( read, v_match, temp_end_v, v_regions )      
    if end_v_v_dels: # If the number of deletions has been found
      return v_match, end_v_v_dels[0], end_v_v_dels[1], v_seq_start
      
  else:
    
    hold_v1 = half1_v_key.findall(read)
    
    if hold_v1:
      for i in range(len(hold_v1)):
        indices = [y for y, x in enumerate(half1_v_seqs) if x == hold_v1[i][0] ]
        for k in indices:
          if len(v_seqs[k]) == len(read[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[half1_v_seqs.index(hold_v1[i][0])])]):
            if lev.hamming( v_seqs[k], read[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[k])] ) <= 1:
              counts['verr2'] += 1
              v_match = k
              temp_end_v = hold_v1[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
              end_v_v_dels = get_v_deletions( read, v_match, temp_end_v, v_regions )
              if end_v_v_dels:
                v_seq_start = hold_v1[i][1]  
                return v_match, end_v_v_dels[0], end_v_v_dels[1], v_seq_start
      counts['foundv1notv2'] += 1
      return
    
    else:
      
      hold_v2 = half2_v_key.findall(read)
      if hold_v2:
        for i in range(len(hold_v2)):
          indices = [y for y, x in enumerate(half2_v_seqs) if x == hold_v2[i][0] ]
          for k in indices:
            if len(v_seqs[k]) == len(read[hold_v2[i][1]-v_half_split:hold_v2[i][1]-v_half_split+len(v_seqs[half2_v_seqs.index(hold_v2[i][0])])]):
              if lev.hamming( v_seqs[k], read[hold_v2[i][1]-v_half_split:hold_v2[i][1]+len(v_seqs[k])-v_half_split] ) <= 1:
                counts['verr1'] += 1
                v_match = k
                temp_end_v = hold_v2[i][1] + jump_to_end_v[v_match] - v_half_split - 1 # Finds where the end of a full V would be
                end_v_v_dels = get_v_deletions( read, v_match, temp_end_v, v_regions )
                if end_v_v_dels:
                  v_seq_start = hold_v2[i][1] - v_half_split      
                  return v_match, end_v_v_dels[0], end_v_v_dels[1], v_seq_start
        counts['foundv2notv1'] += 1
        return
              
      else:
        counts['no_vtags_found'] += 1
        return
 def check_sure(name1, name2):
     ratio = Levenshtein.ratio(name1, name2)
     jaro = Levenshtein.jaro(name1, name2)
     jaro_winkler = Levenshtein.jaro_winkler(name1, name2)
     if ratio >= 0.9 and jaro >= 0.95 and jaro_winkler >= 0.95:
         return True
     else:
         return False
 def check_cons(name1, name2):
     ratio = Levenshtein.ratio(name1, name2)
     jaro = Levenshtein.jaro(name1, name2)
     jaro_winkler = Levenshtein.jaro_winkler(name1, name2)
     if ratio > .6 or jaro > .7 or jaro_winkler > .7:
         return True
     else:
         return False
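
check_sure and check_cons combine three similarity measures with different thresholds. To get a feel for how the measures diverge, here is a quick comparison on a pair of invented names:

import Levenshtein

name1, name2 = 'jonathan smith', 'johnathan smyth'
print('ratio        ', Levenshtein.ratio(name1, name2))
print('jaro         ', Levenshtein.jaro(name1, name2))
print('jaro_winkler ', Levenshtein.jaro_winkler(name1, name2))
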
def find_future_of_linked_paras(sig_change_threshold, min_past,change_occured_threshold, link_threshold, n_into_future_min, n_into_future_max):    
    #find sig changes linked paras to validate in future
    linked_paras_with_sig_change = find_para_linked_to_sig_change(sig_change_threshold,min_past,change_occured_threshold,link_threshold)
    total_prev_links = 0
    found_link = 0
    looked_at = 0
    #construct new dictionary to validate
    validation_dictionary = copy.deepcopy(linked_paras_with_sig_change)
    for v in linked_paras_with_sig_change:
        for i,link in enumerate(linked_paras_with_sig_change[v]):
            total_prev_links += 1
            t = None
            get_fm = get_future_mappings_indefinite(v,link[0],n_into_future_min)
            get_fm2 = get_future_mappings_indefinite(v,link[1],n_into_future_min)
            if not (get_fm and get_fm2):
                validation_dictionary[v][i] = (linked_paras_with_sig_change[v][i],False)
            else:
                looked_at+=1
                min_len = min(len(get_fm), len(get_fm2), n_into_future_max+1)
                get_fm = get_fm[:min_len]
                get_fm2 = get_fm2[:min_len]
                t = (get_fm[-1],get_fm2[-1])
                
                linked = False
                #calculate if still linked (2 dimensional??)
                change_scores1 = []
                change_scores2 = []
                #version is v
                #get_fmX is indices      
                for t, para_index in enumerate(get_fm):
                    if t< len(get_fm)-1: 
                        change_scores1.append(Levenshtein.ratio(current_paratexts[v+t][para_index], current_paratexts[v+t+1][get_fm[t+1]]))
                for t, para_index in enumerate(get_fm2):
                    if t< len(get_fm2)-1: 
                        change_scores2.append(Levenshtein.ratio(current_paratexts[v+t][para_index], current_paratexts[v+t+1][get_fm2[t+1]]))
                
                num_first_changes = 0
                for change in change_scores1:
                    if change < change_occured_threshold:
                        num_first_changes += 1
                num_second_changes = 0
                for t, change in enumerate(change_scores2):
                    if change < change_occured_threshold and change_scores1[t] < change_occured_threshold:
                            num_second_changes += 1
                avg = 0
                if num_first_changes == 0 and num_second_changes == 0:
                    avg = 1.0
                elif num_first_changes > 0:
                    avg = float(num_second_changes)/num_first_changes
                        
                #only do from current version to this n versions away
                if avg > link_threshold:
                    validation_dictionary[v][i] = (linked_paras_with_sig_change[v][i],avg)
                    found_link +=1
                else: 
                    validation_dictionary[v][i] = (linked_paras_with_sig_change[v][i],False, avg)
    return validation_dictionary, (found_link, looked_at, total_prev_links)
def karsilastir(a, b):
	if a == u'None' and b == u'None': return '24'
	elif a == u'None': return '22'
	elif b == u'None': return '21'
	elif Levenshtein.ratio(a,b) < minratio:
		if len(a) > len(b): return '25'
		elif len(b) > len(a): return '26'
		else: return '23'
	else: return str(Levenshtein.ratio(a,b))
Example #31
def are_barcodes_equivalent(bc1, bc2, threshold):
    if lev.distance(bc1, bc2) <= threshold:
        return 1
    else:
        return 0
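
For fixed-length barcodes, Levenshtein.hamming (substitutions only, equal lengths required) is a common alternative to the general edit distance used in are_barcodes_equivalent. A short contrast on made-up barcodes:

import Levenshtein as lev

bc1, bc2 = 'ACGTAC', 'ACGTAA'
print(lev.distance(bc1, bc2) <= 1)  # True
print(lev.hamming(bc1, bc2))        # 1 (valid only because the lengths match)
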
        if child.tag == 'abstract' and child.attrib == {"lang": "tr"}:
            ozet4 = unicode(child.text)

    for keywords in root1.findall('keywords'):
        if keywords.attrib == {"lang": "en"}:
            for keyword in keywords.findall('keyword'):
                ahmet = ET.SubElement(keywords_en, 'keyword')
                anahtar = unicode(keyword.text)
                liste1 = ['20']
                for keys in root2.findall('keywords'):
                    if keys.attrib == {"lang": "en"}:
                        if liste1 == ['20']: liste1 = []
                        for key in keys.findall('keyword'):
                            anahtar2 = unicode(key.text)
                            liste1.append(
                                str(Levenshtein.ratio(anahtar, anahtar2)))
                    ahmet.text = max(liste1)
        elif keywords.attrib == {"lang": "tr"}:
            for keyword in keywords.findall('keyword'):
                ahmet = ET.SubElement(keywords_tr, 'keyword')
                anahtar = unicode(keyword.text)
                liste2 = ['20']
                for keys in root2.findall('keywords'):
                    if keys.attrib == {"lang": "tr"}:
                        if liste2 == ['20']: liste2 = []
                        for key in keys.findall('keyword'):
                            anahtar2 = unicode(key.text)
                            liste2.append(
                                str(Levenshtein.ratio(anahtar, anahtar2)))
                    ahmet.text = max(liste2)
Example #33
def cred_management_action():

    supplied = request.args.get('term')
    action = request.args.get('action')
    section = request.args.get('section')
    extensive = request.args.get('extensive')
    extensive = True if extensive == "true" else False

    if extensive:
        #collectDico
        AllUsernameInRedis = r_serv_cred.hgetall(REDIS_KEY_ALL_CRED_SET).keys()
    uniq_num_set = set()
    if action == "seek":
        possibilities = mixUserName(supplied, extensive)
        for poss in possibilities:
            num = r_serv_cred.hget(REDIS_KEY_ALL_CRED_SET, poss)
            if num is not None:
                uniq_num_set.add(num)
            for num in r_serv_cred.smembers(poss):
                uniq_num_set.add(num)
        #Extensive /!\
        if extensive:
            iter_num = 0
            tot_iter = len(AllUsernameInRedis) * len(possibilities)
            for tempUsername in AllUsernameInRedis:
                for poss in possibilities:
                    #FIXME print progress
                    if (iter_num % int(tot_iter / 20) == 0):
                        #print("searching: {}% done".format(int(iter_num/tot_iter*100)), sep=' ', end='\r', flush=True)
                        print("searching: {}% done".format(
                            float(iter_num) / float(tot_iter) * 100))
                    iter_num += 1

                    if poss in tempUsername:
                        num = (r_serv_cred.hget(REDIS_KEY_ALL_CRED_SET,
                                                tempUsername))
                        if num is not None:
                            uniq_num_set.add(num)
                        for num in r_serv_cred.smembers(tempUsername):
                            uniq_num_set.add(num)

    data = {'usr': [], 'path': [], 'numPaste': [], 'simil': []}
    for Unum in uniq_num_set:
        levenRatio = 2.0
        username = (r_serv_cred.hget(REDIS_KEY_ALL_CRED_SET_REV, Unum))

        # Calculate Levenshtein distance, ignore negative ratio
        supp_splitted = supplied.split()
        supp_mixed = supplied.replace(' ', '')
        supp_splitted.append(supp_mixed)
        for indiv_supplied in supp_splitted:
            levenRatio = float(Levenshtein.ratio(indiv_supplied, username))
            levenRatioStr = "{:.1%}".format(levenRatio)

        data['usr'].append(username)

        allPathNum = list(
            r_serv_cred.smembers(REDIS_KEY_MAP_CRED_TO_PATH + '_' + Unum))

        data['path'].append(allPathNum)
        data['numPaste'].append(len(allPathNum))
        data['simil'].append(levenRatioStr)

    to_return = {}
    to_return["section"] = section
    to_return["action"] = action
    to_return["term"] = supplied
    to_return["data"] = data

    return jsonify(to_return)
Example #34
def are_similar(name1, name2):
    name1, name2 = (asciipunct(s.strip().lower()) for s in (name1, name2))
    ratio = Levenshtein.jaro_winkler(name1, name2)
    return ratio >= 0.8 or name1 in name2 or name2 in name1
            company_type	WANT
            date_of_creation	MAYBE
            description	
            description_identifier	
            kind	
            links	
            matches	
            snippet	
            title (name of company) WANT
            '''
        api_ext= a[['company_number', 'title', 'address_snippet','company_type', 'company_status']].loc[0]
        api_ext=api_ext.append(pd.Series(company, index=['subcontractor_name']))
        api_ext= api_ext.str.lower()
        api_ext= api_ext.map(lambda x: x.replace('ltd','').replace('limited',''))
        api_ext= api_ext.str.strip()
        api_ext=api_ext.append(pd.Series(Levenshtein.distance(str(company), str(api_ext.loc['title'])), index=['similarity_distance']))
        api_ext=api_ext.append(pd.Series(Levenshtein.ratio(str(company), str(api_ext.loc['title'])), index=['similarity_ratio']))
        api_ext=api_ext.append(pd.Series(Levenshtein.jaro(str(company), str(api_ext.loc['title'])), index=['similarity_jaro']))
        company_data.append(api_ext)
        '''JSONDecodeError: Expecting value'''
    except Exception as e: print(e, company), missed_subs.append(company)
    time.sleep(0.5)
    

    '''a lot of companies look like they should be in companies house. why aren't they fetching results? that is the issue.
    Companies house has a request limit of 600 per 5 minutes. timing repeat operations every 0.5 seconds means I'm completing 600 requests per 5 minutes.
    Binning a list of missed subcontractors showed that companies were being extracted in each instance, the problem was the request limit.
    Also noted on API forums that some clients simply aren't on the companies house api, so they may be on the CH system but they're not visible under the API.'''

df_cd=pd.DataFrame(company_data)
Example #36
def main():
    USEANSWER = True
    random.seed(0)
    # np.random.seed(0)
    questions, tables, table_idx = load_data()
    random.shuffle(questions)
    table_name = list(table_idx)

    # test = ['regents-02', 'regents-03', 'regents-08', 'regents-13', 'regents-17', 'regents-19', 'regents-22',
    #         'regents-25&26', 'regents-42', 'monarch-44', 'monarch-47', 'monarch-50', 'monarch-53', 'monarch-57',
    #         'monarch-62', 'monarch-64']
    # train = [t for t in table_idx if t not in test]
    # train_questions = [q for q in questions if q[7] in train]
    # test_questions = [q for q in questions if q[7] in test]

    punc_table = str.maketrans({key: None for key in string.punctuation})

    trainx, trainy = [], []

    cap_corpus = {}
    header_corpus = {}
    cell_corpus = {}
    for t in table_name:
        table = tables[t]
        cap_corpus[t] = nltk.word_tokenize(table_idx[t].translate(punc_table).lower())
        header = list(table)
        temp = []
        for h in header:
            if not h.startswith('Unnamed'):
                temp += nltk.word_tokenize(h.translate(punc_table).lower())
        header_corpus[t] = temp
        cells = table.applymap(str).values
        body = ""
        for row in cells:
            body += " ".join(row) + " "
        cell_corpus[t] = nltk.word_tokenize(body.translate(punc_table).lower())

    vocab = []
    for t in table_name:
        vocab += cap_corpus[t]
        vocab += header_corpus[t]
        vocab += cell_corpus[t]
    vocab = set(vocab)

    if USEANSWER:
        queries = [nltk.word_tokenize(
            q[0].translate(punc_table).lower() + " " + " ".join(q[2:6]).translate(punc_table).lower()) for q in
                         questions]
    else:
        queries = [nltk.word_tokenize(q[0].translate(punc_table).lower()) for q in questions]

    # Compute bm25 scores and idf scores
    cap_bm25 = QueryProcessor(queries, cap_corpus)
    cap_results = cap_bm25.run()
    cap_idf = cap_bm25.idf()

    header_bm25 = QueryProcessor(queries, header_corpus)
    header_results = header_bm25.run()
    header_idf = header_bm25.idf()

    cell_bm25 = QueryProcessor(queries, cell_corpus)
    cell_results = cell_bm25.run()
    cell_idf = cell_bm25.idf()

    for i in range(8200):
        q = questions[i]
        q_tok = queries[i]
        tab = q[7]
        table = tables[tab]

        cap = cap_corpus[tab]
        header = header_corpus[tab]
        body = cell_corpus[tab]

        capc = collections.Counter(cap)
        headerc = collections.Counter(header)
        bodyc = collections.Counter(body)

        x = list()
        # Query length
        x.append(len(q_tok))

        # Sum of idf scores
        x.append(sum(cap_idf[i]))
        x.append(sum(header_idf[i]))
        x.append(sum(cell_idf[i]))

        # # Max of idf scores
        # x.append(max(cap_idf[i]))
        # x.append(max(header_idf[i]))
        # x.append(max(cell_idf[i]))
        #
        # # Average of idf scores
        # x.append(mean(cap_idf[i]))
        # x.append(mean(header_idf[i]))
        # x.append(mean(cell_idf[i]))

        # Num of columns
        x.append(len(list(table)))

        # LCS normalized by length of query
        que = " ".join(q_tok)
        cap = " ".join(cap)
        header = " ".join(header)
        body = " ".join(body)

        x.append(SequenceMatcher(None, que, cap).find_longest_match(0, len(que), 0, len(cap)).size / len(que))
        x.append(SequenceMatcher(None, que, header).find_longest_match(0, len(que), 0, len(header)).size / len(que))
        x.append(SequenceMatcher(None, que, body).find_longest_match(0, len(que), 0, len(body)).size / len(que))

        # Term frequency
        cap_tf = [capc[tok] / sum(capc.values()) for tok in q_tok]
        header_tf = [headerc[tok] / sum(headerc.values()) for tok in q_tok]
        body_tf = [bodyc[tok] / sum(bodyc.values()) for tok in q_tok]

        # Sum of term frequency
        x.append(sum(cap_tf))
        x.append(sum(header_tf))
        x.append(sum(body_tf))

        # # Max of term frequency
        # x.append(max(cap_tf))
        # x.append(max(header_tf))
        # x.append(max(body_tf))
        #
        # # Average of term frequency
        # x.append(mean(cap_tf))
        # x.append(mean(header_tf))
        # x.append(mean(body_tf))

        # BM25 scores
        x.append(cap_results[i][tab])
        x.append(header_results[i][tab])
        x.append(cell_results[i][tab])

        # Fix typo
        cap_typo = []
        header_typo = []
        cell_typo = []
        for tok in q_tok:
            if tok not in vocab:
                cap_typo.append(max(Levenshtein.ratio(tok, cc) for cc in capc))
                header_typo.append(max(Levenshtein.ratio(tok, cc) for cc in headerc))
                cell_typo.append(max(Levenshtein.ratio(tok, cc) for cc in bodyc))

        if not cap_typo:
            # x += [0, 0, 0, 0, 0, 0, 0, 0, 0]
            x += [0, 0, 0]
        else:
            x.append(sum(cap_typo))
            x.append(sum(header_typo))
            x.append(sum(cell_typo))

            # x.append(max(cap_typo))
            # x.append(max(header_typo))
            # x.append(max(cell_typo))
            #
            # x.append(mean(cap_typo))
            # x.append(mean(header_typo))
            # x.append(mean(cell_typo))

        trainx.append(x)
        trainy.append(1)

        # Negative samples
        neg_samp = random.sample(table_name, 2)
        while neg_samp[0] == tab or neg_samp[1] == tab:
            neg_samp = random.sample(table_name, 2)
        for tab in neg_samp:
            table = tables[tab]
            cap = cap_corpus[tab]
            header = header_corpus[tab]
            body = cell_corpus[tab]

            capc = collections.Counter(cap)
            headerc = collections.Counter(header)
            bodyc = collections.Counter(body)

            x = list()
            # Query length
            x.append(len(q_tok))

            # Sum of idf scores
            x.append(sum(cap_idf[i]))
            x.append(sum(header_idf[i]))
            x.append(sum(cell_idf[i]))

            # # Max of idf scores
            # x.append(max(cap_idf[i]))
            # x.append(max(header_idf[i]))
            # x.append(max(cell_idf[i]))
            #
            # # Average of idf scores
            # x.append(mean(cap_idf[i]))
            # x.append(mean(header_idf[i]))
            # x.append(mean(cell_idf[i]))

            # Num of columns
            x.append(len(list(table)))

            # LCS normalized by length of query
            que = " ".join(q_tok)
            cap = " ".join(cap)
            header = " ".join(header)
            body = " ".join(body)

            x.append(SequenceMatcher(None, que, cap).find_longest_match(0, len(que), 0, len(cap)).size / len(que))
            x.append(SequenceMatcher(None, que, header).find_longest_match(0, len(que), 0, len(header)).size / len(que))
            x.append(SequenceMatcher(None, que, body).find_longest_match(0, len(que), 0, len(body)).size / len(que))

            # Term frequency
            cap_tf = [capc[tok] / sum(capc.values()) for tok in q_tok]
            header_tf = [headerc[tok] / sum(headerc.values()) for tok in q_tok]
            body_tf = [bodyc[tok] / sum(bodyc.values()) for tok in q_tok]

            # Sum of term frequency
            x.append(sum(cap_tf))
            x.append(sum(header_tf))
            x.append(sum(body_tf))

            # # Max of term frequency
            # x.append(max(cap_tf))
            # x.append(max(header_tf))
            # x.append(max(body_tf))
            #
            # # Average of term frequency
            # x.append(mean(cap_tf))
            # x.append(mean(header_tf))
            # x.append(mean(body_tf))

            # BM25 scores
            x.append(cap_results[i][tab])
            x.append(header_results[i][tab])
            x.append(cell_results[i][tab])

            # Fix typo
            cap_typo = []
            header_typo = []
            cell_typo = []
            for tok in q_tok:
                if tok not in vocab:
                    cap_typo.append(max(Levenshtein.ratio(tok, cc) for cc in capc))
                    header_typo.append(max(Levenshtein.ratio(tok, cc) for cc in headerc))
                    cell_typo.append(max(Levenshtein.ratio(tok, cc) for cc in bodyc))

            if not cap_typo:
                # x += [0, 0, 0, 0, 0, 0, 0, 0, 0]
                x += [0, 0, 0]
            else:
                x.append(sum(cap_typo))
                x.append(sum(header_typo))
                x.append(sum(cell_typo))

                # x.append(max(cap_typo))
                # x.append(max(header_typo))
                # x.append(max(cell_typo))
                #
                # x.append(mean(cap_typo))
                # x.append(mean(header_typo))
                # x.append(mean(cell_typo))

            trainx.append(x)
            trainy.append(0)
    inplen = len(trainx[0])
    trainx = np.array(trainx)
    trainy = np.array(trainy)
    # print(trainx[:3])
    # print(trainy[:3])

    Xtest = []
    Ytest = []
    for i in range(8200, len(questions)):
        q = questions[i]
        q_tok = queries[i]
        testx = []
        for tab in table_name:
            table = tables[tab]
            cap = cap_corpus[tab]
            header = header_corpus[tab]
            body = cell_corpus[tab]

            capc = collections.Counter(cap)
            headerc = collections.Counter(header)
            bodyc = collections.Counter(body)

            x = list()
            # Query length
            x.append(len(q_tok))

            # Sum of idf scores
            x.append(sum(cap_idf[i]))
            x.append(sum(header_idf[i]))
            x.append(sum(cell_idf[i]))

            # # Max of idf scores
            # x.append(max(cap_idf[i]))
            # x.append(max(header_idf[i]))
            # x.append(max(cell_idf[i]))
            #
            # # Average of idf scores
            # x.append(mean(cap_idf[i]))
            # x.append(mean(header_idf[i]))
            # x.append(mean(cell_idf[i]))

            # Num of columns
            x.append(len(list(table)))

            # LCS normalized by length of query
            que = " ".join(q_tok)
            cap = " ".join(cap)
            header = " ".join(header)
            body = " ".join(body)

            x.append(SequenceMatcher(None, que, cap).find_longest_match(0, len(que), 0, len(cap)).size / len(que))
            x.append(SequenceMatcher(None, que, header).find_longest_match(0, len(que), 0, len(header)).size / len(que))
            x.append(SequenceMatcher(None, que, body).find_longest_match(0, len(que), 0, len(body)).size / len(que))

            # Term frequency
            cap_tf = [capc[tok] / sum(capc.values()) for tok in q_tok]
            header_tf = [headerc[tok] / sum(headerc.values()) for tok in q_tok]
            body_tf = [bodyc[tok] / sum(bodyc.values()) for tok in q_tok]

            # Sum of term frequency
            x.append(sum(cap_tf))
            x.append(sum(header_tf))
            x.append(sum(body_tf))

            # # Max of term frequency
            # x.append(max(cap_tf))
            # x.append(max(header_tf))
            # x.append(max(body_tf))
            #
            # # Average of term frequency
            # x.append(mean(cap_tf))
            # x.append(mean(header_tf))
            # x.append(mean(body_tf))

            # BM25 scores
            x.append(cap_results[i][tab])
            x.append(header_results[i][tab])
            x.append(cell_results[i][tab])

            # Typo features: best fuzzy-match ratio for out-of-vocabulary query tokens
            cap_typo = []
            header_typo = []
            cell_typo = []
            for tok in q_tok:
                if tok not in vocab:
                    cap_typo.append(max(Levenshtein.ratio(tok, cc) for cc in capc))
                    header_typo.append(max(Levenshtein.ratio(tok, cc) for cc in headerc))
                    cell_typo.append(max(Levenshtein.ratio(tok, cc) for cc in bodyc))

            if not cap_typo:
                #x += [0, 0, 0, 0, 0, 0, 0, 0, 0]
                x += [0, 0, 0]
            else:
                x.append(sum(cap_typo))
                x.append(sum(header_typo))
                x.append(sum(cell_typo))

                # x.append(max(cap_typo))
                # x.append(max(header_typo))
                # x.append(max(cell_typo))
                #
                # x.append(mean(cap_typo))
                # x.append(mean(header_typo))
                # x.append(mean(cell_typo))

            testx.append(x)
        testx = np.array(testx)
        Xtest.append(testx)
        Ytest.append(q[7])

    # Test
    for _ in range(5):
        # lm = RandomForestRegressor(n_estimators=1000)
        # lm.fit(trainx, trainy)
        model = Sequential()
        model.add(Dense(32, input_shape=(inplen,), activation='tanh'))
        model.add(Dense(1, activation='sigmoid'))
        model.summary()
        model.compile(optimizer='adadelta', loss='binary_crossentropy')
        model.fit(trainx, trainy, batch_size=100, epochs=40)

        ap1 = 0
        ap2 = 0
        ap3 = 0
        saveresult = []
        for i in range(len(Xtest)):
            q = questions[8200 + i]
            pre = model.predict(Xtest[i]).reshape(1, -1)
            predictions = pre[0].argsort()[::-1]
            # pre = lm.predict(Xtest[i])
            # predictions = pre.argsort()[::-1]
            saveresult.append(pre[0])
            if table_name[predictions[0]] == Ytest[i]:
                ap1 += 1
                ap2 += 1
                ap3 += 1
            elif table_name[predictions[1]] == Ytest[i]:
                ap2 += 0.5
                ap3 += 0.5
            elif table_name[predictions[2]] == Ytest[i]:
                ap3 += 1/3
            # else:
            #     print(q[0])
            #     print(q[2:6])
            #     print("Pre:", table_name[predictions[0]], ", gold:", q[7])
        # np.save('ft', np.array(saveresult))
        print(ap1 / len(Xtest))
        print(ap2 / len(Xtest))
        print(ap3 / len(Xtest))
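# --- Illustrative sketch (not part of the original example) ---
# A minimal, self-contained demo of two of the hand-crafted features built in
# the loops above: the longest-common-substring size normalized by query
# length, and the "typo" feature that takes the best Levenshtein.ratio for an
# out-of-vocabulary query token. The toy query, header and vocab are made up.
from difflib import SequenceMatcher
import Levenshtein

q_tok = ["average", "prie", "of", "laptops"]          # "prie" is a typo
header = ["product", "price", "average", "rating"]
vocab = {"average", "of", "laptops", "price", "product", "rating"}

que = " ".join(q_tok)
head = " ".join(header)

# LCS size normalized by query length (same formula as above)
lcs_norm = SequenceMatcher(None, que, head).find_longest_match(
    0, len(que), 0, len(head)).size / len(que)

# Best fuzzy match for each OOV token, as in the "typo" feature
typo_scores = [max(Levenshtein.ratio(tok, h) for h in header)
               for tok in q_tok if tok not in vocab]

print(lcs_norm)            # fraction of the query covered by the longest match
print(sum(typo_scores))    # e.g. ratio("prie", "price") is roughly 0.889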
    version1 = row1.version
    for index2, row2 in sampled_sequence_df.iterrows():
        version2 = row2.version
        if version1 != version2:

            country_code1 = row1["country_code"]
            country_code2 = row2["country_code"]

            region_name1 = row1["region_name"]
            region_name2 = row2["region_name"]

            covid_nucleic_acid_sequence1 = row1["sequence"]
            covid_nucleic_acid_sequence2 = row2["sequence"]

            lev_dist = Levenshtein.distance(covid_nucleic_acid_sequence1,
                                            covid_nucleic_acid_sequence2)

            divider = max(len(covid_nucleic_acid_sequence1),
                          len(covid_nucleic_acid_sequence2))
            lev_dist = lev_dist / divider
            lev_dist = 1 - lev_dist
            print(str(index) + " " + str(index2) + " " + str(lev_dist))
            s = pd.Series([
                version1, country_code1, region_name1, version2, country_code2,
                region_name2, lev_dist
            ],
                          index=dist_df.columns)
            dist_df = pd.DataFrame([s])
            if i == 0:
Beispiel #38
0
    def parser_list(self):
        result = Result([], True)
        self.browser.save_page_screenshot('./images/n.png')
        page_sources = self.get_all_html()
        response_status = self.get_response_status()
        is_all_parser = False

        if self.task.get('parser_class'):
            parser = self.__parsers.get(self.task.get('parser_class'))
            result = self._result_pipeline(parser.parser_main,
                                           page_sources=page_sources,
                                           task=self.task,
                                           logger=self.logger)
            result.parser_class = parser.parser_class
            if len(result.get_result()) < 3:
                is_all_parser = True
                self.task['count'] = 0
        else:
            is_all_parser = True
            self.task['count'] = 0

        if is_all_parser:
            for parser in self.__parsers.values():
                if not result.is_next:
                    break
                result = self._result_pipeline(parser.parser_main,
                                               page_sources=page_sources,
                                               task=self.task,
                                               logger=self.logger)
                result.parser_class = parser.parser_class
                self.task['parser_class'] = parser.parser_class
                self.task['count'] = len(result.get_result())
                self.task['status'] = str(response_status)

        # Compute the similarity of the extracted results via the Levenshtein ratio: (sum of both text lengths - edit distance) / sum of both text lengths
        val = self.clear("".join(
            [res.get('title') for res in result.get_result()]))
        if self.first_result_val:
            first_similarity = Levenshtein.ratio(self.first_result_val, val)
            if first_similarity > 0.92:
                self.logger.info(
                    f'Results on the current page are similar to the first page, stopping: page {self.page_num}, start_url: {self.task.get("start_url")}'
                )
                return False
        else:
            self.first_result_val = val

        if self.last_result_val:
            last_similarity = Levenshtein.ratio(self.last_result_val, val)
            if last_similarity > 0.92:
                self.logger.info(
                    f'Results on the current page are similar to the previous page, stopping: page {self.page_num}, start_url: {self.task.get("start_url")}'
                )
                return False
        self.last_result_val = val

        if not val:
            self.logger.info(
                f'No results extracted, stopping: page {self.page_num}, start_url: {self.task.get("start_url")}'
            )
            return False

        turn_page_conf = self.turn_page(page_sources)
        result.cookies = self.browser.browser.get_cookies()
        result.browser = self.browser
        for result_process in self.__results.values():
            result_process.result_main(result=result,
                                       task=self.task,
                                       logger=self.logger,
                                       response_status=response_status,
                                       turn_page_conf=turn_page_conf)

        dates = [x.get('send_date') for x in result.get_result()]
        min_date = self.get_min_date(dates)
        if min_date and datetime.datetime.now(
        ) > min_date > self.min_date and self.task.get('end_send_date'):
            if min_date < datetime.datetime.utcfromtimestamp(
                    self.task.get('end_send_date')):
                self.logger.info(
                    f'The fetched post date is earlier than the maximum date of the previous crawl, stopping, start_url: {self.task.get("start_url")}'
                )
                return False

        if self.page_num >= TURN_PAGE_COUNT:
            self.logger.info(
                f'Reached the configured page limit, stopping: page {self.page_num}, start_url: {self.task.get("start_url")}'
            )
            return False

        # Perform the page turn and check whether it succeeded
        if self.__turn_page_action(turn_page_conf):
            self.logger.info(f'Page turn succeeded, now on page {self.page_num}')
            return True
        else:
            return False
	def _levenshteinDistance(self, str1, str2):
		return Levenshtein.distance(str1, str2)
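# --- Illustrative sketch (not part of the original example) ---
# parser_list() above treats two result pages as near-duplicates when the
# Levenshtein ratio of their concatenated titles exceeds 0.92, i.e.
# (sum of both text lengths - edit distance) / sum of both text lengths.
# A stand-alone check with made-up page titles:
import Levenshtein

page1_titles = "Notice on water supply|Road repair schedule|Community meeting"
page2_titles = "Notice on water supply|Road repair schedule|Community meetings"

similarity = Levenshtein.ratio(page1_titles, page2_titles)
if similarity > 0.92:
    print("pages look identical, stop paging:", similarity)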
Beispiel #40
0
    ".lettr (language encoded and typed text with \"raspa\") file with all the information about the processed files (.lett file is also valid)",
    dest="lettr",
    required=True)
options = oparser.parse_args()

if options.ridx == None:
    reader = sys.stdin
else:
    reader = open(options.ridx, "r")

index = {}
documents = {}
readLETT(options.lettr, documents)

for i in reader:
    fields = i.strip().split("\t")
    #The document must have at least one candidate
    if len(fields) > 1:
        len_s = len(documents[int(fields[0])])
        sys.stdout.write(str(fields[0]))
        for j in range(1, len(fields)):
            candidate = fields[j]
            candidateid = int(fields[j].split(":")[0])
            len_t = len(documents[candidateid])
            dist = Levenshtein.distance(documents[int(fields[0])],
                                        documents[candidateid])
            port = 1 - (dist / float(max(len_s, len_t)))
            candidate += ":" + str(port)
            sys.stdout.write("\t" + candidate)
        sys.stdout.write("\n")
def evaluation(submit: models.Submit):
    syntax_error_msg = ""

    type_ = None

    schema = submit.Question.Schema
    question = submit.Question
    answers = models.Answer.query.filter_by(idQuestion=question.id)
    student = submit.Student

    recover_schema(schema)

    path = os.path.join(config.save_db_path, student.id)
    if not os.path.exists(path):
        os.makedirs(path)
    path = os.path.join(path, schema.name)
    shutil.copyfile(schema.path, path)

    correct_sql, count_spelling_err, answer, correct = correct_spelling(
        submit.answer,
        list(answers),
        schema,
    )
    submit.correct = correct_sql
    submit.spelling = count_spelling_err
    submit.idAnswer = answer.id
    submit.Answer = answer

    conn = sqlite3.connect(path)
    cur = conn.cursor()
    try:
        cur.execute(correct_sql)
        values = cur.fetchall()
        result = {'data': values, 'len': len(values)}
        submit.result = json.dumps(result)
        result = json.loads(submit.result)
        if question.result is None:
            abort(500)
        else:
            origin = json.loads(question.result)
            if origin == result:
                submit.score = question.score - count_spelling_err
                submit.info = ' '.join(map(str,
                                           correct)) + '\n' + syntax_error_msg
                type_ = type_submit.all_right if count_spelling_err == 0 else type_submit.error_spelling
            else:
                type_ = type_submit.error_result
    except Exception as e:
        syntax_error_msg = str(e)
        submit.result = str(e)
        type_ = type_submit.error_syntax
    finally:
        cur.close()
        conn.close()

    submit.segmentJson = json.dumps({'compare': []})
    if type_ != type_submit.error_spelling and type_ != type_submit.all_right:
        submit.info = ' '.join(map(str, correct)) + '\n' + syntax_error_msg
        if type_ == type_submit.error_result:
            submit.score = question.score - count_spelling_err
            stu_segments = Segment(submit.correct)
            segments = models.Segmentation.query.filter_by(
                idAnswer=submit.Answer.id).order_by(models.Segmentation.rank)
            segments = [s for s in segments]
            submit.segmentJson = {'compare': []}
            idx_student_segment = 0
            idx_segment = 0
            while idx_segment < len(segments):
                compare = {'right_segment': segments[idx_segment].data}
                if idx_student_segment < len(
                        stu_segments.segment_str
                ) and Segment.filter_segment_punctuation(
                        segments[idx_segment].data
                ) == Segment.filter_segment_punctuation(
                        stu_segments.segment_str[idx_student_segment]):
                    compare['student_segment'] = stu_segments.segment_str[
                        idx_student_segment]
                    compare['deduction'] = 0
                    idx_student_segment += 1
                else:
                    tmp_idx = idx_student_segment
                    max_score = 0
                    max_idx = tmp_idx
                    while tmp_idx < len(stu_segments.segment_str):
                        score = Levenshtein.ratio(
                            Segment.filter_segment_punctuation(
                                segments[idx_segment].data),
                            Segment.filter_segment_punctuation(
                                stu_segments.segment_str[tmp_idx]))
                        if score > max_score:
                            max_idx = tmp_idx
                            max_score = score
                        tmp_idx += 1
                    if max_score < 0.6:
                        compare['student_segment'] = ''
                        compare['deduction'] = segments[idx_segment].score
                        submit.score -= segments[idx_segment].score
                    else:
                        compare['student_segment'] = stu_segments.segment_str[
                            max_idx]
                        while idx_student_segment < max_idx:
                            submit.segmentJson['compare'].append({
                                'student_segment':
                                stu_segments.segment_str[idx_student_segment],
                                'right_segment':
                                '',
                                'deduction':
                                2
                            })
                            submit.score -= 2
                            idx_student_segment += 1
                        idx_student_segment = max_idx + 1
                        if max_score == 1:
                            compare['deduction'] = 0
                        else:
                            compare['deduction'] = segments[idx_segment].score
                            submit.score -= segments[idx_segment].score

                idx_segment += 1
                submit.segmentJson['compare'].append(compare)
            while idx_student_segment < len(stu_segments.segment_str):
                submit.segmentJson['compare'].append({
                    'student_segment':
                    stu_segments.segment_str[idx_student_segment],
                    'right_segment':
                    '',
                    'deduction':
                    2
                })
                submit.score -= 2
                idx_student_segment += 1

        elif type_ == type_submit.error_syntax:
            submit.score = 0
    submit.score = 0 if submit.score < 0 else submit.score
    submit.segmentJson = json.dumps(submit.segmentJson)
    submit.type = type_.value
    os.remove(path)
def correct_spelling(stem, answers, schema):
    keywords_schema = schema.keywords.split(' ')
    keywords_schema.append('*')
    format_sql = sqlparse.format(stem, keyword_case='upper')
    correct = [0] * len(format_sql)
    format_sql += '\0'
    # segment_sql = re.split('[. \t\n]', format_sql)
    correct_sql = ''
    start_word_idx = 0
    count_spelling_err = 0
    keywords = [
        keywords_schema,
        list(sqlparse.keywords.KEYWORDS.keys()),
        list(sqlparse.keywords.KEYWORDS_COMMON.keys())
    ]
    for i in range(0, len(format_sql)):
        if format_sql[i] in (' ', '.', '\0', '=', '<', '>', '!', ',', ')',
                             '(') or format_sql[i].isdigit():
            word = format_sql[start_word_idx:i]

            if word.strip() != '' and word not in keywords[0] and word.upper(
            ) not in keywords[1] and word.upper() not in keywords[2]:
                count_spelling_err += 1
                max_word = ''
                max_value = 0
                done = False
                for idx in range(0, len(keywords)):
                    if idx > 1:
                        word = word.upper()
                    for j in range(0, len(keywords[idx])):
                        ratio = Levenshtein.ratio(word, keywords[idx][j])
                        if ratio > replace_threshold:
                            done = True
                            max_word = keywords[idx][j]
                            max_value = ratio
                            break
                        elif ratio > max_value:
                            max_word = keywords[idx][j]
                            max_value = ratio
                    if done:
                        break
                correct_sql += max_word
                correct = list(
                    map(
                        lambda idx, x: round(max_value, 3)
                        if start_word_idx <= idx < i else x,
                        range(0, len(correct)), correct))
            else:
                correct_sql += word

            if format_sql[i] != '\0':
                correct_sql += format_sql[i]
            start_word_idx = i + 1
        else:
            correct[i] = 1
    max_answer = answers[0]
    max_value = Levenshtein.ratio(answers[0].sql, correct_sql)
    for i in range(1, len(answers)):
        ratio = Levenshtein.ratio(answers[i].sql, correct_sql)
        if ratio > max_value:
            max_answer = answers[i]
            max_value = ratio

    return correct_sql, count_spelling_err, max_answer, correct
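# --- Illustrative sketch (not part of the original example) ---
# The core move in correct_spelling() above: an unrecognized word is replaced
# by the keyword with the highest Levenshtein ratio, short-circuiting as soon
# as a candidate clears replace_threshold. The keyword list and the threshold
# value below are assumptions made purely for illustration.
import Levenshtein

replace_threshold = 0.8
keywords = ["SELECT", "FROM", "WHERE", "GROUP BY", "ORDER BY"]

def best_keyword(word):
    best_word, best_ratio = "", 0.0
    for kw in keywords:
        ratio = Levenshtein.ratio(word.upper(), kw)
        if ratio > replace_threshold:
            return kw, ratio              # early exit, as in the original
        if ratio > best_ratio:
            best_word, best_ratio = kw, ratio
    return best_word, best_ratio

print(best_keyword("SELCT"))   # ('SELECT', 0.909...)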
    def ambiguous_df_look_up(self, tag, recipe_ingredient, recipe_key):
        temp_nutritional_df = self.nutrition_init.NDB_NO_lookup(
            tag, filter_list=['Measure', 'Weight(g)'])

        filtered_ambiguous_df = self.ambiguous_df[
            self.ambiguous_df['NDB_NO'] == "\"{}\"".format(tag.strip('"'))]
        filtered_ambiguous_df = filtered_ambiguous_df.reset_index()

        recipe_ingredient_unit_dict = self.extact_unit_from_recipe(
            recipe_ingredient)
        amount_recipe, unit_recip = self.extact_number_from_recipe(
            recipe_ingredient, recipe_ingredient_unit_dict)
        unit_recip = recipe_key

        temp_recipe_ingredient = str(
            recipe_ingredient.replace(str(amount_recipe), "").lstrip(" "))

        itr = 0
        matching_levenstein_ratio_list = []

        if len(filtered_ambiguous_df) > 1:
            while itr < len(filtered_ambiguous_df):
                temp_ambiguos_ingredient = filtered_ambiguous_df.loc[
                    itr, 'Ingredient']
                temp_ambiguos_ingredient = re.sub(
                    r"([/.0-9]*)", "", temp_ambiguos_ingredient).lstrip(" ")

                matching_levenstein_ratio_list.append(
                    Levenshtein.ratio(temp_ambiguos_ingredient,
                                      temp_recipe_ingredient))
                itr += 1

            levenstein_index = matching_levenstein_ratio_list.index(
                max(matching_levenstein_ratio_list))
        elif len(filtered_ambiguous_df) == 1:
            levenstein_index = 0
        else:
            print("\t\tERROR", recipe_ingredient, tag, unit_recip,
                  self.food_unit_standard_dictionary[unit_recip]['type'])
            levenstein_index = 0

        if self.food_unit_standard_dictionary[unit_recip]['type'] == 'weight':
            temp_recipe_g = float(amount_recipe) * float(
                self.weight_unit_df['gram'])
            conversion_factor = float(temp_recipe_g) / float(
                temp_nutritional_df['Weight(g)'].get_values()[0])

        elif self.food_unit_standard_dictionary[unit_recip][
                'type'] == 'volume':
            temp_recipe_g = (float(amount_recipe) * float(
                self.volume_unit_df['cup'].get_values()[0])) / (
                    float(filtered_ambiguous_df.loc[levenstein_index, 'cups'])
                    * float(self.volume_unit_df[unit_recip].get_values()[0]))
            conversion_factor = (float(temp_recipe_g) * float(
                filtered_ambiguous_df.loc[levenstein_index, 'grams'])) / float(
                    temp_nutritional_df['Weight(g)'].get_values()[0])

        elif self.food_unit_standard_dictionary[unit_recip]['type'] == 'unit':
            temp_recipe_g = float(amount_recipe) * float(
                filtered_ambiguous_df.loc[levenstein_index, 'grams'])
            conversion_factor = float(temp_recipe_g) / float(
                temp_nutritional_df['Weight(g)'].get_values()[0])

        return conversion_factor
#res = {'relevance': [], 'pmid': [], 'title': [], 'url': [], 'year': [], 'cost': []}
res = {'relevance': [], 'pmid': [], 'title': [], 'cit_count': [], 'year': []}

# use Levenshtein distance to determine each article's relevance
#   to the search phrase. This could be replaced by something smarter.
it = 0
num_pmids = len(pmids)
for pmid in pmids:
    art = get_article(pmid)
    cits = get_citedby(pmid)

    it += 1
    latest_iteration.text(f"Getting PMID {pmid} (article {it}/{num_pmids})")
    bar.progress(int(100 * it / num_pmids))

    res['relevance'].append(Levenshtein.distance(search, art.title))
    res['pmid'].append(pmid)
    res['title'].append(art.title)

    # lower cit count by 2 to compensate for self-mentions in the eutils XML return.
    res['cit_count'].append(len(cits) - 2)
    res['year'].append(art.year)
    #res['cost'].append(0)
    #res['url'].append(art.url)

if res:
    latest_iteration.text("Done! See results below.")
    # start already sorted on relevance
    df = pd.DataFrame(res)
    #st.dataframe(df)
    st.dataframe(df.style.highlight_max())
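# --- Illustrative sketch (not part of the original example) ---
# The comment above notes that raw Levenshtein distance to the search phrase
# "could be replaced by something smarter". One cheap improvement is a
# length-normalized, case-insensitive similarity, so long titles are not
# penalized merely for being long. A suggestion only, with made-up strings:
import Levenshtein

search = "covid vaccine efficacy"
titles = [
    "Efficacy of COVID-19 vaccines: a meta-analysis",
    "A short note",
]
for title in titles:
    raw_distance = Levenshtein.distance(search, title)
    normalized_similarity = Levenshtein.ratio(search.lower(), title.lower())
    print(raw_distance, round(normalized_similarity, 3), title)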
Beispiel #45
0
def _distance(str1, str2):
    distance = Levenshtein.distance(str1, str2)
    if len(str1) > len(str2):
        return 1 - float(distance) / len(str1)
    else:
        return 1 - float(distance) / len(str2)
Beispiel #46
0
def phoneme_error_rate(p_seq1, p_seq2):
    p_vocab = set(p_seq1 + p_seq2)
    p2c = dict(zip(p_vocab, range(len(p_vocab))))
    c_seq1 = [chr(p2c[p]) for p in p_seq1]
    c_seq2 = [chr(p2c[p]) for p in p_seq2]
    return Levenshtein.distance(''.join(c_seq1), ''.join(c_seq2)) / len(c_seq2)
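# --- Illustrative sketch (not part of the original example) ---
# phoneme_error_rate() maps each distinct phoneme to a single character so the
# string-based Levenshtein.distance can operate on phoneme sequences, then
# normalizes by the length of the reference (p_seq2). Usage, assuming the
# function above and `import Levenshtein` are in scope:
reference = ["HH", "AH", "L", "OW"]          # "hello"
hypothesis = ["HH", "EH", "L", "OW"]         # one substituted phoneme
print(phoneme_error_rate(hypothesis, reference))  # 1 error / 4 phonemes = 0.25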
Beispiel #47
0
import Levenshtein

a1 = '尊敬的客户,您好!建议您可以选择考虑我司康爱卫士老年防癌疾病有关产品。您可以登陆 http://baoxian.cntaiping.com/(太平网上商城)或拨打 400-868-8888(太平电话销售);选择您需要的产品购买。祝您太平幸福!'
a2 = '尊敬的客户,您好!建议您可以选择考虑我司康爱卫士老年防癌疾病有关产品。您可以登陆 http://baoxian.cntaiping.com/(太平网上商城)或拨打 400-868-8888(太平电话销售);选择您需要的产品购买。祝您太平幸福!'
print(Levenshtein.distance(a1, a2))  # a1 and a2 are identical strings, so this prints 0
Beispiel #48
0
 def _lev_tok_similarity(self, source_tok, target_tok):
     if len(source_tok) == 0 or len(target_tok) == 0:
         return 0
     return 1 - (py_lev.distance(source_tok, target_tok) /
                 max([len(source_tok), len(target_tok)]))
    def spell_correctness2(self, ask, dictionary):
        final_candidate_str = []
        final_candidate_ratio = []

        for word in ask.split(' '):
            candidate_str = []
            candidate_ratio = []
            for dict in dictionary:
                if word in dict:
                    candidate_str.append(str(dict))
                    candidate_ratio.append(Levenshtein.ratio(word, str(dict)))
            #print "1 ",candidate_str
            #print "1 ",candidate_ratio

            temp_candidate_str = candidate_str
            temp_candidate_ratio = candidate_ratio
            candidate_str = []
            candidate_ratio = []
            i = 0
            for item in temp_candidate_str:
                if (word + " ") in item or (" " + word) in item:
                    candidate_str.append(item)
                    candidate_ratio.append(temp_candidate_ratio[i])
                i = i + 1

            #print "2 ",candidate_str
            #print "2 ",candidate_ratio

            if len(candidate_str) == 1:
                final_candidate_str.append(candidate_str[0])
                final_candidate_ratio.append(candidate_ratio[0])
            elif len(candidate_str) > 1:
                x = 0
                j = 0
                i = 0
                for item in candidate_str:
                    if x < candidate_ratio[i]:
                        x = candidate_ratio[i]
                        j = i
                    i = i + 1
                final_candidate_str.append(candidate_str[j])
                final_candidate_ratio.append(candidate_ratio[j])
            elif len(candidate_str) == 0:
                if len(word) > 3 and len(temp_candidate_str) > 0:
                    x = 0
                    j = 0
                    i = 0
                    for item in temp_candidate_str:
                        if x < temp_candidate_ratio[i]:
                            x = temp_candidate_ratio[i]
                            j = i
                        i = i + 1
                    final_candidate_str.append(temp_candidate_str[j])
                    final_candidate_ratio.append(temp_candidate_ratio[j])
                else:
                    final_candidate_str.append(word)
                    final_candidate_ratio.append(0)

        #print ">>",final_candidate_str
        #print ">>",final_candidate_ratio

        s = ''
        x = ''
        for item in final_candidate_str:
            if x != item:
                s = s + item + ' '
            x = item
        return s
Beispiel #50
0
def levenshtein_distance(str1, str2):
    dist = Levenshtein.distance(str1, str2)
    longer_str = str1 if len(str1) > len(str2) else str2
    score = float(dist) * 100 / len(longer_str)
    return score
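# --- Illustrative sketch (not part of the original example) ---
# Note the scaling above: levenshtein_distance() returns a *dissimilarity*
# percentage (0.0 for identical strings, up to 100 when every character
# differs), not a similarity score. For example:
print(levenshtein_distance("kitten", "kitten"))   # 0.0
print(levenshtein_distance("kitten", "sitting"))  # 3 edits / 7 chars * 100 = 42.86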
    def spell_correctness(self, ask):
        fuzzy_str = {}
        fuzzy_ratio = {}
        new_ask = {}
        fuzzy_str2 = {}
        fuzzy_ratio2 = {}
        new_ask2 = {}

        i = 0
        for word in ask.split(' '):
            ratio = 0
            for pool in self.pool_xtrans:

                if pool.find(word) != -1 and len(word) > 3:
                    x = 0.71
                else:
                    x = Levenshtein.ratio(word, pool)

                if ratio < x:
                    ratio = x
                    fuzzy_str[i] = pool
                    fuzzy_ratio[i] = x
            new_ask[i] = word
            if fuzzy_ratio[i] > 0.7:
                new_ask[i] = fuzzy_str[i]

            i = i + 1
            #print "     answer", fuzzy_str
        #print "     ratio", fuzzy_ratio
        #print "     new ask", new_ask

        #print "-------------------------------"

        #try using bigrams
        bigram = ngrams(ask.split(), 2)
        fuzzy_str2 = {}
        fuzzy_ratio2 = {}
        new_ask2 = {}
        i = 0
        for grams in bigram:
            token = ' '.join(grams)

            ratio = 0
            for pool in self.pool_xtrans:
                x = Levenshtein.ratio(token, pool)
                if ratio < x:
                    ratio = x
                    fuzzy_str2[i] = pool
                    fuzzy_ratio2[i] = x
            new_ask2[i] = token
            if fuzzy_ratio2[i] > 0.7:
                new_ask2[i] = fuzzy_str2[i]

            i = i + 1
        #print "     answer", fuzzy_str2
        #print "     ratio", fuzzy_ratio2
        #print "     new ask", new_ask2

        j = 0
        i = 0
        ask_correction = {}
        ask_array = ask.split(' ')
        words_count = len(ask.split(' '))
        while (i < words_count):
            if i < (words_count - 1):
                if fuzzy_ratio[i] <= 0.7 and fuzzy_ratio2[i] <= 0.7:
                    ask_correction[j] = ask_array[i]
                elif fuzzy_ratio[i] <= 0.7 and fuzzy_ratio2[i] > 0.7:
                    ask_correction[j] = fuzzy_str2[i]
                    i = i + 1
                elif fuzzy_ratio[i] > 0.7 and fuzzy_ratio2[i] <= 0.7:
                    ask_correction[j] = fuzzy_str[i]
                elif fuzzy_ratio[i] > 0.7 and fuzzy_ratio2[i] > 0.7:
                    ask_correction[j] = fuzzy_str[i]
                    if fuzzy_ratio2[i] > fuzzy_ratio[i]:
                        ask_correction[j] = fuzzy_str2[i]
            else:
                if fuzzy_ratio[i] <= 0.7:
                    ask_correction[j] = ask_array[i]
                else:
                    ask_correction[j] = fuzzy_str[i]
            i = i + 1
            j = j + 1

        #print ask_correction
        s = ''
        for key, value in ask_correction.iteritems():
            s = s + value + ' '
        #print s
        return s
Beispiel #52
0
def analysisQuestion(question, phrase):
    if question[-1] == '?':
        question = question[:-1]
    #print('analyzing...')
    print(phrase)
    candidateResource = []
    candidateProperty = []
    p = phrase[-1]
    if p in enResourceDic:
        candidateResource.append(ResourceData(p, 1, 1, 0))
    if p in zhResourceDic:
        candidateResource.append(ResourceData(p, 1, 1, 1))
    # if p in frResourceDic:
    #     candidateResource.append(ResourceData(p, 1, 1, 2))
    #print(candidateResource)
    resource = []
    ph = phrase[:-1]
    for w in ph:
        resource.append(w.replace(' ', ''))
    #En->Zh
    translist = transList(set(ph), 'en', 'cht')
    parse = list(set(resource + translist + ph))
    print(parse)
    #print('trans Done')
    dis = []
    for p in parse:
        f = 0
        for d in dis:
            if d == p:
                f = 1
        if f == 0:
            dis.append(p)
    parse = dis
    # print(candidateResource)
    # print(parse)

    samePharse = []
    for p in parse:
        num = 0
        same = []
        if p == '':
            continue
        # print(p)
        for pro in enPropertyList:
            con = Levenshtein.ratio(pro, p)
            if con > ThresHold:
                new = ResourceData(pro, con, 1, 0)
                candidateProperty.append(new)
                same.append(new)
                num += 1
        for pro in zhPropertyList:
            con = Levenshtein.ratio(pro, p)
            if con > zhThresHold:
                new = ResourceData(pro, con, 1, 1)
                candidateProperty.append(new)
                same.append(new)
                num += 1
        # for pro in frPropertyList:
        #     con = Levenshtein.ratio(pro, p)
        #     if con > ThresHold:
        #         new = ResourceData(pro, con, 1, 2)
        #         candidateProperty.append(new)
        #         same.append(new)
        #         num += 1
        # for res in frKG:
        #     con0=Levenshtein.ratio(res[0], p)
        #     if con0>ThresHold:
        #         new=ResourceData(res[0], con0, 1, 2)
        #         candidateResource.append(new)
        #         same.append(new)
        #         num += 1
        #     con1=Levenshtein.ratio(res[1], p)
        #     if con1>ThresHold:
        #         new=ResourceData(res[1], con1, 1, 2)
        #         candidateProperty.append(new)
        #         same.append(new)
        #         num += 1
        #     con2 = Levenshtein.ratio(res[2], p)
        #     if con2>ThresHold:
        #         new=ResourceData(res[2], con2, 1, 2)
        #         candidateResource.append(new)
        #         same.append(new)
        #         num += 1
        if num > 1:
            samePharse.append(same)
    cr = []
    cp = []

    for c in candidateResource:
        f = 0
        for i in cr:
            if i == c:
                f = 1
        if f == 1:
            continue
        else:
            cr.append(c)
    for c in candidateProperty:
        f = 0
        for i in cp:
            if i == c:
                f = 1
        if f == 1:
            continue
        else:
            cp.append(c)
    candidate = [cr, cp]
    sameP = []
    for s in samePharse:
        sset = []
        for c in s:
            f = 0
            for i in sset:
                if i == c:
                    f = 1
            if f == 1:
                continue
            else:
                sset.append(c)
        sameP.append(sset)
    #print('------resource analyze!------')
    # for i in cr:
    #     i.print()
    #print('-----property----')
    # for i in cp:
    #     i.print()
    # print('analyzing done!')
    # print('---------same phrase--------')
    # for i in sameP:
    #     print('same:')
    #     for s in i:
    #         print(str(s))
    return sameP, candidate
def calculate_compile_distance(shopname):
    distances = []
    for meb in data:
        distances.append(Levenshtein.distance(shopname, meb))
    return distances
Beispiel #54
0
 def calculate_levenshtein_distance(self, s1, s2):
     return Levenshtein.ratio(s1, s2)
def main(dirname):
    df = pd.read_excel(
        '/Users/nakamurasatoru/git/d_genji/kouigenjimonogatari.github.io/src/data/metadata.xlsx',
        header=None,
        index_col=None,
        engine="openpyxl")

    configs = {}

    for i in range(len(df.index)):

        uri = df.iloc[i, 0]
        if not pd.isnull(uri):
            row_num = df.iloc[i, 2]
            if int(row_num) == 1:
                title = df.iloc[i, 3]
                vol = df.iloc[i, 6]
                page = df.iloc[i, 1]

                if vol not in configs:
                    configs[vol] = {"data": {}}

                configs[vol]["data"][title] = page

    for vol in configs:

        config = configs[vol]

        koui = config["data"]

        VOL = str(vol).zfill(2)
        '''
        if VOL != "51" and False:
            continue
        '''

        print(VOL)

        path = '../../docs/iiif/' + dirname + '/' + VOL + '.json'

        if not os.path.exists(path):
            continue

        with open(path) as f:
            df = json.load(f)
            members = df["selections"][0]["members"]

        ################## Matching

        map = {}

        indexedObj = {}

        for line in koui:
            map[line] = []
            for i in range(len(members)):
                label = ""

                # Previous line (-1)
                if i - 1 >= 0:
                    label += members[i - 1]["label"] + "/"

                # Current line
                member = members[i]
                label += member["label"]

                # Next line (+1)
                '''
                if i + 1 <= len(members) - 1:
                    label += "/" + members[i+1]["label"]
                '''

                score = Levenshtein.distance(line, label.replace("/", ""))
                score = score / max(len(line), len(label.replace("/", "")))  # normalize by the longer length

                obj = {
                    "label": label,
                    "main": member["label"],
                    "score": score,
                    "member_id": member["@id"],
                    "index": i
                }

                map[line].append(obj)

                indexedObj[i] = obj

        ################## Aggregation

        prev_index = 0

        # For each line of the collation (kōi) text
        for line in map:
            print(str(koui[line]) + "\t" + line)

            obj = map[line]

            # Partial slice (disabled)
            # obj = obj[prev_index:]

            # Sort by ascending score
            score_sorted = sorted(obj, key=lambda x: x["score"])

            flg = True

            for i in range(len(score_sorted)):

                data = score_sorted[i]

                index = data["index"]
                '''
                if i < 10:
                    print(i, data["index"], data["score"], data["member_id"].split("/canvas/")[1], data["label"])
                '''

                # if index - prev_index < 50:

                if flg:
                    # print("******:")
                    prev_index = index + 1

                    # if prev_index - 1 < len(obj):
                    #    data = obj[prev_index - 1]

                    index = data["index"]
                    if index > 0:
                        data = indexedObj[index - 1]

                    table = '''
                    <table class="table">
                        <tr>
                            <th>項目</th>
                            <th>値</th>
                        </tr>
                        <tr>
                            <td>大成番号</td>
                            <td>''' + str(koui[line]) + '''</td>
                        </tr>
                        <tr>
                            <td>校異源氏テキスト</td>
                            <td>''' + line + '''</td>
                        </tr>
                        <tr>
                            <td>KuroNet翻刻</td>
                            <td>''' + data["main"] + '''</td>
                        </tr>
                        <tr>
                            <td>KuroNet翻刻(前後を含む3行)</td>
                            <td>''' + data["label"] + '''</td>
                        </tr>
                    </table>
                    '''

                    ########### Build the ID used for the marker

                    member_id = data["member_id"]

                    # member_id = member["@id"]
                    sss = member_id.split("#xywh=")

                    canvas_id = sss[0]
                    xywh = sss[1].split(",")

                    d = 5

                    y = int(int(xywh[1]) * d / (d + 1))

                    if y == 0:
                        y = 800

                    w = 1

                    x = int(xywh[0]) + int(int(xywh[2]) / 2)

                    member_id = canvas_id + "#xywh=" + str(x) + "," + str(
                        y) + "," + str(w) + ",1"

                    ###########

                    members.append({
                        "@id":
                        member_id,
                        "@type":
                        "sc:Canvas",
                        "description":
                        "",
                        "label":
                        "[" + str(len(members) + 1) + "]",
                        "metadata": [{
                            "label": "p",
                            "value": koui[line]
                        }, {
                            "label": "校異源氏テキスト",
                            "value": line
                        }, {
                            "label": "KuroNet翻刻",
                            "value": data["main"]
                        }, {
                            "label": "KuroNet翻刻(前行を含む)",
                            "value": data["label"]
                        }, {
                            "label":
                            "Annotation",
                            "value": [{
                                "@id": member_id,
                                "@type": "oa:Annotation",
                                "motivation": "sc:painting",
                                "resource": {
                                    "@type": "cnt:ContentAsText",
                                    "chars": table,
                                    "format": "text/html",
                                    "marker": {
                                        "border-color":
                                        "red",
                                        "@type":
                                        "dctypes:Image",
                                        "@id":
                                        "https://nakamura196.github.io/genji_curation/icon/red.png#xy=16,16"
                                    }
                                },
                                "on": member_id
                            }]
                        }]
                    })

                    flg = False

            print("----------------")

        curation = {
            "@context": [
                "http://iiif.io/api/presentation/2/context.json",
                "http://codh.rois.ac.jp/iiif/curation/1/context.json"
            ],
            "@id":
            df["@id"],
            "@type":
            "cr:Curation",
            "label":
            "Character List",
            "selections": [{
                "@id": df["@id"] + "/range1",
                "@type": "sc:Range",
                "label": "Characters",
                "members": members,
                "within": df["selections"][0]["within"]
            }]
        }

        path = path.replace("_kuronet/", "_kuronet_taisei_all/")
        dirpath = os.path.dirname(path)
        os.makedirs(dirpath, exist_ok=True)

        f2 = open(path, 'w')
        json.dump(curation,
                  f2,
                  ensure_ascii=False,
                  indent=4,
                  sort_keys=True,
                  separators=(',', ': '))
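# --- Illustrative sketch (not part of the original example) ---
# The matching step in main() scores every KuroNet transcription window with
# Levenshtein.distance(line, label) / max(len(line), len(label)) and keeps the
# window with the smallest normalized score. The same scoring in isolation,
# with made-up strings:
import Levenshtein

def normalized_distance(a, b):
    return Levenshtein.distance(a, b) / max(len(a), len(b))

target = "いづれの御時にか女御更衣あまたさぶらひたまひける中に"
candidates = [
    "いつれの御時にか女御更衣あまたさふらひたまひけるなかに",
    "めづらしき御よろこびにも",
]
best = min(candidates, key=lambda c: normalized_distance(target, c))
print(best, normalized_distance(target, best))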
Beispiel #56
0

def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)


data = open('0643/SHEFFIELDDAT.643').read()

corrections = [
    re.sub('\s+', ' ', item).split(' ') for item in data.split('\n')
][:-1]

num = 0
correct = 0
for correction in corrections:
    if lev.distance(correction[0], correction[1]) > 2:
        print 'Distance greater than 2', correction
        continue
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.connect(('127.0.0.1', 11000))
    s.sendall(correction[1] + '<EOF>')
    res = s.recv(8000)
    num += 1
    if res == 'Not Found':
        res = ''

    corrected1 = set(res.split())
    corrected2 = known_edits2(correction[1].lower())

    difference = corrected1 - corrected2
    if len(difference) > 0:
x_all['omsk_dist'] = omsk_dist
x_all['khabarovsk_dist'] = khabarovsk_dist
x_all['klyuchi_dist'] = klyuchi_dist
x_all['norilsk_dist'] = norilsk_dist
a.print_elapsed(start)

## Levenshtein features
print(a.c.BOLD + 'Finding levenshtein features ...' + a.c.END)

start = time.time()
print('Finding Levenshtein distance between titles ... ', end='', flush=True)
vals = []
data = df[['title_1', 'title_2']].values.tolist()
for x in data:
    #vals.append(levenshtein(str(x[0]), str(x[1])))
    vals.append(Levenshtein.distance(str(x[0]), str(x[1])))
x_all['title_lev'] = vals
a.print_elapsed(start)

start = time.time()
print('Finding total title length ... ', end='', flush=True)
vals = []
data = df[['title_1', 'title_2']].values.tolist()
for x in data:
    vals.append(len(str(x[0])) + len(str(x[1])))
title_tot_len = vals
a.print_elapsed(start)

start = time.time()
print('Finding normalised Levenshtein distance ...', end='', flush=True)
x_all['title_lev_norm'] = x_all['title_lev'] / title_tot_len
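# --- Illustrative sketch (not part of the original example) ---
# title_lev_norm above divides the raw edit distance by the combined length of
# the two titles, keeping the feature in [0, 1] regardless of title length;
# Levenshtein.ratio is a closely related built-in alternative. Toy comparison:
import Levenshtein

t1 = "Apple iPhone 12 64GB black"
t2 = "iPhone 12 64 GB, black, Apple"
dist = Levenshtein.distance(t1, t2)
print(dist, dist / (len(t1) + len(t2)), Levenshtein.ratio(t1, t2))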
    def fit(self, dataset, knowledge_base):
        train_set = []
        train_labels = []
        for mention in dataset.mentions:
            context_1 = [x.lower().strip() for x in mention.contexts[0]]
            context_2 = [x.lower().strip() for x in mention.contexts[1]]

            word_emb = np.zeros((300, ))
            for word in mention.contexts[0]:
                word_emb += self.word2embed.get(word, np.zeros(300, ))
            for word in mention.contexts[1]:
                word_emb += self.word2embed.get(word, np.zeros(300, ))

            for candidate in mention.candidates:
                feat = [
                    candidate.prob,
                    math.exp(
                        -Levenshtein.distance(candidate.name, mention.surface))
                ]

                words = candidate.name.lower().split()
                context_sim_1 = sum(
                    [1 if x.strip() in context_1 else 0 for x in words])
                context_sim_2 = sum(
                    [1 if x.strip() in context_2 else 0 for x in words])
                feat.append(context_sim_1 / len(words))
                feat.append(context_sim_2 / len(words))

                ent = '_'.join(candidate.name.split(' '))
                ent_emb = self.ent2embed[ent]
                feat.extend(ent_emb)
                feat.extend(word_emb)
                feat.append(cosine_sim_vectors(word_emb, ent_emb))

                max_sim_1 = 0
                max_sim_2 = 0
                if candidate.id in knowledge_base.documents:
                    for section in knowledge_base.documents[
                            candidate.id].sections:
                        for sentence in section:
                            s = sentence.lower()
                            tmp_sim_1 = sum([
                                1 if x.strip() in s else 0 for x in context_1
                            ])
                            tmp_sim_2 = sum([
                                1 if x.strip() in s else 0 for x in context_2
                            ])
                            if tmp_sim_1 > max_sim_1:
                                max_sim_1 = tmp_sim_1
                            if tmp_sim_2 > max_sim_2:
                                max_sim_2 = tmp_sim_2
                feat.append(max_sim_1 /
                            len(context_1) if len(context_1) > 0 else 0)
                feat.append(max_sim_2 /
                            len(context_2) if len(context_2) > 0 else 0)

                train_set.append(feat)
                train_labels.append(1 if mention.gt.id == candidate.id else 0)

        train_set = np.array(train_set)
        train_labels = np.array(train_labels)

        self.net.fit(train_set,
                     train_labels,
                     epochs=10,
                     batch_size=100,
                     verbose=False)
    def predict(self, dataset, knowledge_base):
        pred_cids = []
        for mention in dataset.mentions:
            context_1 = [x.lower().strip() for x in mention.contexts[0]]
            context_2 = [x.lower().strip() for x in mention.contexts[1]]

            word_emb = np.zeros(300, )
            for word in mention.contexts[0]:
                word_emb += self.word2embed.get(word, np.zeros(300, ))
            for word in mention.contexts[1]:
                word_emb += self.word2embed.get(word, np.zeros(300, ))

            dev_set = []
            for candidate in mention.candidates:
                feat = [
                    candidate.prob,
                    math.exp(
                        -Levenshtein.distance(candidate.name, mention.surface))
                ]

                words = candidate.name.lower().split()
                context_sim_1 = sum(
                    [1 if x.strip() in context_1 else 0 for x in words])
                context_sim_2 = sum(
                    [1 if x.strip() in context_2 else 0 for x in words])
                feat.append(context_sim_1 / len(words))
                feat.append(context_sim_2 / len(words))

                ent = '_'.join(candidate.name.split(' '))
                ent_emb = self.ent2embed.get(ent, np.zeros(300, ))
                feat.extend(ent_emb)
                feat.extend(word_emb)
                feat.append(cosine_sim_vectors(word_emb, ent_emb))

                max_sim_1 = 0
                max_sim_2 = 0
                if candidate.id in knowledge_base.documents:
                    for section in knowledge_base.documents[
                            candidate.id].sections:
                        for sentence in section:
                            s = sentence.lower()
                            tmp_sim_1 = sum([
                                1 if x.strip() in s else 0 for x in context_1
                            ])
                            tmp_sim_2 = sum([
                                1 if x.strip() in s else 0 for x in context_2
                            ])
                            if tmp_sim_1 > max_sim_1:
                                max_sim_1 = tmp_sim_1
                            if tmp_sim_2 > max_sim_2:
                                max_sim_2 = tmp_sim_2
                feat.append(max_sim_1 /
                            len(context_1) if len(context_1) > 0 else 0)
                feat.append(max_sim_2 /
                            len(context_2) if len(context_2) > 0 else 0)

                dev_set.append(feat)

            dev_set = np.array(dev_set)
            if mention.candidates:
                pred = self.net.predict_proba(dev_set)
                pred_cids.append(mention.candidates[np.argmax(pred)].id)
            else:
                pred_cids.append('NIL')
        return pred_cids
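# --- Illustrative sketch (not part of the original example) ---
# Both fit() and predict() above turn the surface-form mismatch into a bounded
# feature with math.exp(-Levenshtein.distance(candidate.name, mention.surface)):
# an exact match gives 1.0 and the value decays quickly toward 0 as the edit
# distance grows. A quick check with made-up mention/candidate strings:
import math
import Levenshtein

surface = "NYC"
for candidate_name in ["NYC", "New York City", "Newark"]:
    print(candidate_name, math.exp(-Levenshtein.distance(candidate_name, surface)))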
Beispiel #60
0
    b = (norm(ans[i]-A[i]) + norm(ans[i]-B[i]) + norm(ans[i]-C[i]) + norm(ans[i]-D[i])) / 3.0
    cos_ans.append(a)
    euc_dis.append(b)

cos_ans = list(np.array(cos_ans, dtype = 'float64'))
euc_dis = list(np.array(euc_dis, dtype = 'float64'))
data["cos_sim"] = cos_ans
data["euc_dis"] = euc_dis
del(cos_ans, euc_dis, a, b)
#-----------------------------------------------------------------------------#


#-----------------------------------------------------------------------------#
# get levenshtein distance
import Levenshtein as lv
LdistA = [lv.distance(data.iloc[i,1], data.iloc[i,2]) for i in range(data.shape[0])]
LdistB = [lv.distance(data.iloc[i,1], data.iloc[i,3]) for i in range(data.shape[0])]
LdistC = [lv.distance(data.iloc[i,1], data.iloc[i,4]) for i in range(data.shape[0])]
LdistD = [lv.distance(data.iloc[i,1], data.iloc[i,5]) for i in range(data.shape[0])]
data['Ldist'] = np.sum(np.transpose(np.array([(LdistA), (LdistB), (LdistC), (LdistD)])), axis=1)/4
del(LdistA, LdistB, LdistC, LdistD)
#-----------------------------------------------------------------------------#


#-----------------------------------------------------------------------------#
# 답과 보기의 태그를 비교하자
tags_diff = [] # 0 = false (tags are the same)
for i in range(data.shape[0]):
    # create sentences with choices
    ans_s = data.iloc[i,0].replace('_____', data.iloc[i,1])
    A_s = data.iloc[i,0].replace('_____', data.iloc[i,2])