Esempio n. 1
0
    def get(self, snapa, snapb):
        """Return a JSON diff of the cleaned HTML of two Snap documents.

        :param snapa: id of the first snapshot
        :param snapb: id of the second snapshot
        :return: flask JSON response with insert/replace/delete fragments
                 and the overall similarity ratio
        """
        sa = Snap.objects.get_or_404(id=snapa)
        sb = Snap.objects.get_or_404(id=snapb)

        fstcleanhtml = cleanhtml(sa.html)
        sndcleanhtml = cleanhtml(sb.html)
        # The matcher's first sequence (a) is sndcleanhtml, its second (b)
        # is fstcleanhtml.
        sm = SequenceMatcher(None,
                             sndcleanhtml,
                             fstcleanhtml)
        txtinsert = []
        txtdel = []
        txtreplace = []
        for tag, i1, i2, j1, j2 in sm.get_opcodes():
            # BUG FIX: i1/i2 index the matcher's first sequence
            # (sndcleanhtml) and j1/j2 its second (fstcleanhtml); the
            # original sliced the wrong strings with these indices,
            # producing garbage fragments (e.g. for "insert" opcodes
            # i1 == i2, so the non-empty fragment is in j1:j2).
            if tag == "replace":
                txtreplace.append(
                    ("%s <-> %s" % ("".join(sndcleanhtml[i1:i2]), "".join(fstcleanhtml[j1:j2]))).strip())
            if tag == "insert":
                txtinsert.append(("%s %s" % ("".join(sndcleanhtml[i1:i2]), "".join(fstcleanhtml[j1:j2]))).strip())
            if tag == "delete":
                txtdel.append(("%s %s" % ("".join(sndcleanhtml[i1:i2]), "".join(fstcleanhtml[j1:j2]))).strip())
        return jsonify({
            'diff': {'fst': {'id': str(sa.id), 'dthr': sa.dthr},
                     'snd': {'id': str(sb.id), 'dthr': sb.dthr},
                     'ratio': sm.ratio(),
                     'insert': txtinsert,
                     'replace': txtreplace,
                     'delete': txtdel}
        })
Esempio n. 2
0
def partial_ratio(s1, s2):
    """"Return the ratio of the most similar substring
    as a number between 0 and 100."""
    s1, s2 = utils.make_type_consistent(s1, s2)

    if len(s1) > len(s2):
        shorter, longer = s2, s1
    else:
        shorter, longer = s1, s2

    matcher = SequenceMatcher(None, shorter, longer)

    # Each matching block (i, j, size) aligns shorter[i:] with longer[j:];
    # the best partial match starts at one of those alignments.
    #   e.g. shorter = "abcd", longer = "XXXbcdeEEE"
    #   block = (1, 3, 3) -> candidate score is ratio("abcd", "Xbcd")
    scores = []
    for i, j, _size in matcher.get_matching_blocks():
        offset = max(j - i, 0)
        candidate = longer[offset:offset + len(shorter)]

        score = SequenceMatcher(None, shorter, candidate).ratio()
        if score > .995:
            # Effectively a perfect substring match: short-circuit.
            return 100
        scores.append(score)

    return utils.intr(100 * max(scores))
Esempio n. 3
0
def colordiff(a, b, highlight='red'):
    """Given two strings, return the same pair of strings except with
    their differences highlighted in the specified color.
    """
    left_parts = []
    right_parts = []

    matcher = SequenceMatcher(lambda x: False, a, b)
    for op, a_start, a_end, b_start, b_end in matcher.get_opcodes():
        a_piece = a[a_start:a_end]
        b_piece = b[b_start:b_end]
        if op == 'equal':
            # Common to both strings: no highlighting.
            left_parts.append(a_piece)
            right_parts.append(b_piece)
        elif op == 'insert':
            # Present only in the right string.
            right_parts.append(colorize(highlight, b_piece))
        elif op == 'delete':
            # Present only in the left string.
            left_parts.append(colorize(highlight, a_piece))
        elif op == 'replace':
            # The two sides differ: highlight both.
            left_parts.append(colorize(highlight, a_piece))
            right_parts.append(colorize(highlight, b_piece))
        else:
            assert(False)

    return ''.join(left_parts), ''.join(right_parts)
Esempio n. 4
0
def ratio(s1, s2):
    """Return the similarity of *s1* and *s2* as an int in [0, 100].

    :raises TypeError: if either argument is None.
    """
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")

    return int(100 * SequenceMatcher(None, s1, s2).ratio())
Esempio n. 5
0
def dif_html(a, b, lower_criteria = 0.5, upper_criteria = 0.6,
             link_criteria = 0.4, img_criteria = 0.2, script_criteria = 0.2):
    from urllib2 import urlopen 
    from difflib import SequenceMatcher
    from my_stats import jaccard as jac

    #詳細にページをみる
    try:
        html_a, html_b = urlopen(a).read(), urlopen(b).read()
        matcher = SequenceMatcher()
        matcher.set_seq1(html_a)
        matcher.set_seq2(html_b)
        if matcher.ratio() >= lower_criteria:
            #lower_criteria以上だけどupper_criteria以下の場合にはリンク構造など詳しく調査する
            if matcher.ratio() >= upper_criteria:
                print "white", matcher.ratio(), upper_criteria, lower_criteria, a, b
                return True
            else:
                #ここにnon-negative-matrix-factorizationを入れてより精緻に分析する予定だったけど、まだあまりうまく行かないのでペンディング(T.B.D.)
                print "grey", matcher.ratio(), upper_criteria, lower_criteria, a, b
                links_a, titles_a, imgs_a, scripts_a = analyze_html(html_a)
                links_b, titles_b, imgs_b, scripts_b = analyze_html(html_b)
                j_links, j_imgs, j_scripts = jac(links_a, links_b), jac(imgs_a, imgs_b), jac(imgs_a, imgs_b)
                if j_links >= link_criteria and j_imgs >= img_criteria and j_scripts >= script_criteria:
                    return True
                else:
                    return False
        #lower_criteria以上に似てない部分がある場合には似てないと判断する
        else:
            print "black", matcher.ratio(), upper_criteria, lower_criteria, a, b
            return False
    except:
        #クローリングできない場合には異なるページと見なす
        return False
Esempio n. 6
0
    def controlled_vocab_lookup(self, controlled_vocab, search_term):
        """ Performs a semi-fuzzy search for a term match in specified vocabulary

        Returns ``search_term`` itself on an exact or (case-insensitive)
        substring match, the most similar vocabulary term when its
        similarity ratio is at least 0.8, and ``None`` otherwise.
        """
        # Defect fixed: removed the dead no-op `search_term = search_term`.
        best_ratio = 0
        best_term = None
        minimum_ratio = 0.8  # below this, no fuzzy match is accepted

        for term in controlled_vocab:
            # Exact match - exit with value
            if search_term == term:
                return search_term
            # Substring match in either direction, case-insensitively.
            elif term.lower() in search_term.lower() or search_term.lower() in term.lower():
                return search_term

            # Let's see how similar the strings are
            s = SequenceMatcher(None, search_term.lower(), term.lower())
            ratio = s.ratio()
            if ratio > best_ratio:
                best_ratio = ratio
                best_term = term

        # Examine ratio/term and see if we have anything reasonable
        if best_ratio >= minimum_ratio:
            return best_term

        return None
def _test_xml_diff(result, expected):
    """Compare two XML strings by using python's ``difflib.SequenceMatcher``.

    This is a character-by-character comparison and does not take into account
    the semantic meaning of XML elements and attributes.

    Parameters
    ----------
    result: str
        The result of running the test.
    expected: str
        The expected outcome.

    Returns
    -------
    bool
        Whether the result matches the expectations or not.

    """
    similarity = SequenceMatcher(None, result, expected).ratio()
    matches = similarity == pytest.approx(1.0)
    if not matches:
        # Show a line diff to make the mismatch easy to inspect.
        print("Result does not match expected.")
        delta = unified_diff(result.splitlines(), expected.splitlines())
        print("\n".join(list(delta)))
    return matches
Esempio n. 8
0
        def _blobs_similarity(removed_blob, added):
            # Rename detection: find, among the `added` paths, the blob whose
            # text is most similar to `removed_blob`.  Returns a dict with the
            # new name, the similarity ratio and a unified diff, or implicitly
            # None when nothing exceeds DIFF_SIMILARITY_THRESHOLD.
            best = dict(ratio=0, name='', blob=None)
            for added_name in added:
                added_blob = self.tree.get_obj_by_path(added_name)
                if not isinstance(added_blob, Blob):
                    continue
                diff = SequenceMatcher(None, removed_blob.text,
                                       added_blob.text)
                # quick_ratio() is an upper bound on ratio() and is cheaper,
                # which matters when scanning many candidates.
                ratio = diff.quick_ratio()
                if ratio > best['ratio']:
                    best['ratio'] = ratio
                    best['name'] = added_name
                    best['blob'] = added_blob

                if ratio == 1:
                    break  # we won't find better similarity than 100% :)

            if best['ratio'] > DIFF_SIMILARITY_THRESHOLD:
                diff = ''
                if best['ratio'] < 1:
                    # Only produce a textual diff when the contents differ.
                    added_blob = best['blob']
                    rpath = ('a' + removed_blob.path()).encode('utf-8')
                    apath = ('b' + added_blob.path()).encode('utf-8')
                    diff = ''.join(unified_diff(list(removed_blob),
                                                list(added_blob),
                                                rpath, apath))
                return dict(new=best['name'],
                            ratio=best['ratio'], diff=diff)
Esempio n. 9
0
def calc_similarity(s_standard, s_candidate):
    """Similarity of the candidate to the standard, in [0, 1].

    Returns 0 when either argument is None.  When the candidate is longer
    than the standard, the best-matching substring of the candidate is
    scored instead of the whole string.
    """
    if s_standard is None or s_candidate is None:
        return 0

    matcher = SequenceMatcher(None, s_standard, s_candidate)
    if len(s_standard) >= len(s_candidate):
        return matcher.ratio()

    # Each matching block (i, j, size) aligns s_standard[i:] with
    # s_candidate[j:]; the best partial match starts at one of those
    # alignments.
    #   e.g. standard = "abcd", candidate = "XXXbcdeEEE"
    #   block = (1, 3, 3) -> candidate score is ratio("abcd", "Xbcd")
    scores = []
    for i, j, _size in matcher.get_matching_blocks():
        start = max(j - i, 0)
        substring = s_candidate[start:start + len(s_standard)]
        scores.append(SequenceMatcher(None, s_standard, substring).ratio())

    return max(scores)
Esempio n. 10
0
def highlighted_ndiff(a, b):
    """Return a highlighted diff string, with bold characters where
    the paired lines differ."""
    parts = []
    sm = SequenceMatcher()
    sm.set_seqs(a, b)
    linesm = SequenceMatcher()
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == REPLACE:
            # Pair up replaced lines; unpaired lines render as pure
            # removals/additions.
            for aline, bline in zip_longest(a[i1:i2], b[j1:j2]):
                if bline is None:
                    parts.append(redline(aline))
                elif aline is None:
                    parts.append(greenline(bline))
                else:
                    parts.append(bold_str_diff(aline, bline, sm=linesm))
        elif tag == DELETE:
            parts.extend(redline(aline) for aline in a[i1:i2])
        elif tag == INSERT:
            parts.extend(greenline(bline) for bline in b[j1:j2])
        elif tag == EQUAL:
            parts.extend('  ' + aline + '\n' for aline in a[i1:i2])
        else:
            raise RuntimeError('tag not understood')
    return ''.join(parts)
Esempio n. 11
0
def get_music(a, b, key='C', mode='major'):
    """Render the diff between sequences *a* and *b* as MIDI data,
    returned in a StringIO buffer."""
    midi_out = StringIO()

    scale = build_scale(key, mode, octaves=1)
    matcher = SequenceMatcher(None, a, b)

    tone = key.lower()
    melodies = [tone]
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'replace':
            next_note = 'r'          # rest
        elif tag == 'equal':
            next_note = tone         # hold the current tone
        elif tag == 'delete':
            tone = tone_down(tone, scale)
            next_note = tone
        elif tag == 'insert':
            tone = tone_up(tone, scale)
            next_note = tone
        else:
            next_note = None
        # Repeat the note once per element covered by the opcode
        # (at least once, even for zero-width opcodes).
        melodies.extend([next_note] * ((i2 - i1) or 1))
    s = SMF([parse(" ".join(melodies))])

    s.write(midi_out)
    return midi_out
Esempio n. 12
0
def handle_redirects(queued, target):
    """ This call is used to determine if a suggested redirect is valid.
    if it happens to be, we change the url entry with the redirected location and add it back
    to the call stack. """
    retry_count = queued.get("retries")
    # Give up after more than one retry; otherwise initialize the counter.
    if retry_count and retry_count > 1:
        return
    elif not retry_count:
        queued["retries"] = 0

    target_path = urlparse(target).path

    source_path = conf.target_base_path + queued.get("url")
    textutils.output_debug("Handling redirect from: " + source_path + " to " + target_path)

    matcher = SequenceMatcher(isjunk=None, a=target_path, b=source_path, autojunk=False)
    similarity = matcher.ratio()
    # Follow the redirect only when the old and new paths are very similar.
    if similarity > 0.8:
        queued["url"] = target_path
        queued["retries"] += 1
        # Add back the timed-out item
        textutils.output_debug("Following redirect! " + str(similarity))
        database.fetch_queue.put(queued)
    else:
        textutils.output_debug("Bad redirect! " + str(similarity))
Esempio n. 13
0
    def remove_duplicates_stable(movies):
        """Drop near-duplicate movies, keeping first-seen order.

        Two entries are duplicates when their names are more than 95%
        similar or one name is a prefix of the other, and the years agree
        (or the candidate has no year).  When the later entry has the
        longer name, it replaces the earlier one.
        """
        deduped = []
        for candidate in movies:
            duplicate_of = None
            for kept in deduped:
                name_ratio = SequenceMatcher(a=candidate["name"], b=kept["name"]).ratio()
                similar_names = name_ratio > 0.95

                prefix_related = (candidate["name"].startswith(kept["name"])
                                  or kept["name"].startswith(candidate["name"]))

                if similar_names or prefix_related:

                    # Same movie unless the years explicitly differ.
                    if not candidate["year"] or candidate["year"] == kept["year"]:
                        duplicate_of = kept
                        break

            if duplicate_of is None:
                deduped.append(candidate)

            elif len(candidate["name"]) > len(duplicate_of["name"]):
                # Prefer the longer (more specific) title.
                deduped.remove(duplicate_of)
                deduped.append(candidate)

        return deduped
Esempio n. 14
0
def test_valid_result(content):
    """Return False when *content* is nearly identical to one of the
    crafted soft-404 fingerprints, True otherwise."""
    # Encoding edge case
    # Must be a string to be compared to the 404 fingerprint
    if not isinstance(content, str):
        content = content.decode('utf-8', 'ignore')

    # Truncate to the sample length the fingerprints were taken with.
    if not len(content):
        content = ""  # empty file, still a forged 404
    elif len(content) < conf.file_sample_len:
        content = content[0:len(content) - 1]
    else:
        content = content[0:conf.file_sample_len - 1]

    # False positive cleanup for some edge cases
    content = content.strip('\r\n ')

    # Test signatures
    for fingerprint in database.crafted_404s:
        textutils.output_debug("Testing [" + content + "]" + " against Fingerprint: [" + fingerprint + "]")
        sm = SequenceMatcher(isjunk=None, a=fingerprint, b=content, autojunk=False)

        textutils.output_debug("Ratio " + str(sm.ratio()))

        # This content is almost similar to a generated 404, therefore it's a 404.
        if sm.ratio() > 0.8:
            textutils.output_debug("False positive detected!")
            return False

    return True
Esempio n. 15
0
def test_valid_result(content):
    """Return False when *content* looks like a crafted soft-404 page.

    A truncated, stripped sample of the content is compared against each
    fingerprint in ``database.crafted_404s``; a similarity ratio above
    0.8 marks the result as a false positive.  (Python 2 code: uses
    ``str.encode("hex")`` for debug output.)
    """
    is_valid_result = True

    # Tweak the content len
    if len(content) > conf.file_sample_len:
        content = content[0 : conf.file_sample_len - 1]

    # False positive cleanup for some edge cases
    content = content.strip("\r\n ")

    # Test signatures
    for fingerprint in database.crafted_404s:
        textutils.output_debug(
            "Testing [" + content.encode("hex") + "]" + " against Fingerprint: [" + fingerprint.encode("hex") + "]"
        )
        matcher = SequenceMatcher(isjunk=None, a=fingerprint, b=content, autojunk=False)

        textutils.output_debug("Ratio " + str(matcher.ratio()))

        # This content is almost similar to a generated 404, therefore it's a 404.
        if matcher.ratio() > 0.8:
            textutils.output_debug("False positive detected!")
            is_valid_result = False
            break

    return is_valid_result
Esempio n. 16
0
def html_diff(str1, str2, max_lenght=80, html_same_class="blue", html_diff_class="red"):
    """Render two aligned texts as HTML with matching and differing runs
    wrapped in colored <span> elements.

    :param str1: first text (aligned via align_strings)
    :param str2: second text
    :param max_lenght: maximum line length passed to align_strings
                       (name kept, typo and all, for compatibility)
    :param html_same_class: color used for matching runs
    :param html_diff_class: color used for differing runs
    :return: two lists of HTML lines, one per input
    """
    from difflib import SequenceMatcher
    str1, str2 = align_strings(str1, str2, max_lenght)

    sm = SequenceMatcher(lambda x: x in " ")

    same_span = "<span style='color: %s'>" % html_same_class
    diff_span = "<span style='color: %s'>" % html_diff_class
    clos_span = "</span>"

    ret_str1 = []
    ret_str2 = []

    for line1, line2 in zip(str1, str2):
        html1 = ""
        html2 = ""

        # BUG FIX: the original tracked a single `finished` offset (the
        # position in line1) and used it to slice *both* lines; the two
        # sides of a matching block advance independently, so each line
        # needs its own cursor.
        done1 = 0
        done2 = 0
        sm.set_seqs(line1, line2)

        for a_idx, b_idx, size in sm.get_matching_blocks():
            # Differing run since the last block, then the matching run.
            html1 += diff_span + line1[done1:a_idx] + clos_span
            html1 += same_span + line1[a_idx:a_idx + size] + clos_span
            html2 += diff_span + line2[done2:b_idx] + clos_span
            html2 += same_span + line2[b_idx:b_idx + size] + clos_span

            done1 = a_idx + size
            done2 = b_idx + size

        ret_str1 += [html1]
        ret_str2 += [html2]

    return ret_str1, ret_str2
Esempio n. 17
0
    def compare(self, statement_a, statement_b):
        """
        Compare the two input statements.

        :return: The percent of similarity between the text of the statements.
        :rtype: float
        """
        # Return 0 if either statement has a falsy text value
        if not statement_a.text or not statement_b.text:
            return 0

        # Work on the lowercase version of both texts.
        text_a = str(statement_a.text.lower())
        text_b = str(statement_b.text.lower())

        matcher = SequenceMatcher(None, text_a, text_b)

        # Similarity ratio rounded to two decimal places.
        return round(matcher.ratio(), 2)
Esempio n. 18
0
def diff_stat(old, new):
    """Return [added, removed] element counts for the old -> new diff,
    indexed by the module-level ADDED / REMOVED constants."""
    stats = [0, 0]

    def _on_insert(i1, i2, j1, j2):
        stats[ADDED] += j2 - j1

    def _on_delete(i1, i2, j1, j2):
        stats[REMOVED] += i2 - i1

    def _on_replace(i1, i2, j1, j2):
        # A replace both removes old elements and adds new ones.
        stats[REMOVED] += i2 - i1
        stats[ADDED] += j2 - j1

    handlers = {
        'insert': _on_insert,
        'delete': _on_delete,
        'replace': _on_replace,
        'equal': None,
    }

    for tag, i1, i2, j1, j2 in SequenceMatcher(None, old, new).get_opcodes():
        handler = handlers[tag]
        if callable(handler):
            handler(i1, i2, j1, j2)

    return stats
Esempio n. 19
0
def versioning(lists):
    """
    Compute the lifetime of every element from an iterable of sequences. It 
    returns an iterable of :class:`Versioned` classes to indicate the lifetimes.
    
    The computation is backed by the built-in :mod:`difflib` module. As such,
    every element must be hashable. 
    """
    
    ci = chain.from_iterable
    
    sm = SequenceMatcher()
    # NOTE(review): index 0 of each versions list appears to be a sentinel
    # bucket that accumulates records of elements no longer alive; live
    # entries start at index 1 -- confirm against Versioned/_pairwise.
    oldVersions = [ [] ]
    
    for newName, (oldList, newList) in enumerate(_pairwise(lists, [])):
        sm.set_seqs(oldList, newList)
        
        newVersions = [ oldVersions[0] ]

        for op, oldStart, oldEnd, newStart, newEnd in sm.get_opcodes():
            if op == 'equal':
                # Surviving elements: extend their lifetime to this version
                # (+1 offsets skip the sentinel bucket at index 0).
                for i in range(oldStart+1, oldEnd+1):
                    oldVersions[i][0].high = newName
                newVersions.extend(oldVersions[oldStart+1:oldEnd+1])

            if op == 'delete' or op == 'replace':
                # Removed elements: fold their records into the preceding bucket.
                newVersions[-1].extend(ci(oldVersions[oldStart+1:oldEnd+1]))
            
            if op == 'insert' or op == 'replace':
                # Newly appearing elements start a lifetime at this version.
                newVersions.extend([Versioned(x, newName, newName)] for x in newList[newStart:newEnd])
        
        oldVersions = newVersions
    
    return ci(oldVersions)
Esempio n. 20
0
    def items(self):
        """Yield up to 15 result brains for this tag/source, skipping
        entries whose title is >60% similar to the previously yielded one."""
        source = self.context.__parent__.__name__
        source = Eq('stiam.ro.source', source)

        tags = [self.context.__name__]
        tags = AnyOf('stiam.ro.tags', tags)

        query = SearchQuery(source).And(tags)
        brains = query.searchResults(sort_index='stiam.ro.effective',
                                     reverse=True, limit=30)

        duplicate = ""
        index = 0
        for brain in brains:
            if index >= 15:
                # BUG FIX: `raise StopIteration` inside a generator is a
                # RuntimeError since PEP 479 (Python 3.7+); end the
                # generator with a plain return instead.
                return

            title = getattr(brain, "title", "")
            # Skip near-duplicate consecutive titles.
            s = SequenceMatcher(lambda x: x == "", title, duplicate)
            if s.ratio() > 0.6:
                continue

            duplicate = title
            index += 1
            yield brain
Esempio n. 21
0
    def drawFigureMono(self):
        """Compute the pairwise similarity matrix of self.listResp on a
        single core and write it to ./sortieR_<name> (space-separated)."""
        print("Monocoeur")
        debutTime = time.time()
        # Create an empty square list-of-lists matrix.
        arr = []
        for n1 in range(0,len(self.listResp)):
            arr.append([])
            for n2 in range(0,len(self.listResp)):
                arr[n1].append([])
    
        # For every pair of responses (upper triangle only, then mirrored).
        longueur = len(self.listResp)
        for n1 in range(0,longueur):
            print("Image R : ligne "+str(n1)+" sur "+str(longueur))
            for n2 in range(n1,longueur):
                # Responses at index n1 and n2.
                d1 = self.listResp[n1]
                d2 = self.listResp[n2]
                # Similarity ratio (spaces treated as junk characters).
                s = SequenceMatcher(lambda x: x == " ",d1,d2)
                ratio = s.ratio()
                arr[n1][n2] = ratio
                arr[n2][n1] = ratio

        finTime = time.time()
        diffTime = finTime - debutTime
        difftuple = time.gmtime(diffTime)
        print("Executé en "+str(difftuple.tm_min)+" min et "+str(difftuple.tm_sec)+" sec")

        # Dump the matrix, one row per line.
        f = open("./sortieR_"+str(self.name), "w")
        for n1 in range(0,len(arr)):
            for n2 in range(0,len(arr)):
                f.write(str(arr[n1][n2])+" ")
            f.write("\n")
        f.close()
    def WordByWord(self, str1, str2, bestRatio):
      '''Score str1 against str2 word by word and decide whether str1 is a
      good title match.  (Python 2 code; indentation mixes spaces/tabs.)'''
      try:
	# Getting best score word-by-word
	word1 = str1.split()
	word2 = str2.split()
	
	listing = []
	for w in word1:
	  if len(w)>1:
	    w = self.TextCleanup(w)
	    highest = 0.0
	    curr_word = [w, '', highest]
	    for v in word2:
	      if len(v)>1:
		v = self.TextCleanup(v)
		s = SequenceMatcher(None, w, v)
		ratio = s.ratio()
		#print "   - comparing: [",w,"/", v, "]:", ratio
		if ratio >= highest:
		  highest = ratio
		  curr_word[1] = v
		  curr_word[2] = ratio
	    if curr_word[2]>0.0:
	      #print "   ",curr_word
	      listing.append(curr_word)
	    #print "="*20
	
	# Checking average of matches
	sumed = 0.0
	hits = 0.0
	length = len(listing)
	for word in listing:
	  sumed += word[2]
	  if word[2]>=0.8:
	    hits+=1
	average = (sumed/length)
	hitsPercentage = (hits/length)
	#print "Length:", length
	#print "Avg:", average
	#print "Hits:", hitsPercentage
	
	msg = "  Best match is:\n\t ratio:\t\t"+str(bestRatio)+ 	\
			       "\n\t best:\t\t"+str1+		\
			       "\n\t original:\t"+str2+		\
			       "\n\t average:\t"+str(average)+		\
			       "\n\t hits:\t\t"+str(hitsPercentage)
	self.logger.debug(msg)
	# NOTE(review): `ratio` below is the leftover value from the last loop
	# iteration, not `bestRatio` -- possibly a bug; confirm intent.
	isGoodTitle = average >= ratio or hitsPercentage >= 0.7
	return isGoodTitle
      except Exception as inst:
	exc_type, exc_obj, exc_tb = sys.exc_info()
	exception_fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
	exception_line = str(exc_tb.tb_lineno) 
	exception_type = str(type(inst))
	exception_desc = str(inst)
	self.logger.debug( "  %s: %s in %s:%s"%(exception_type, 
					  exception_desc, 
					  exception_fname,  
					  exception_line ))
Esempio n. 23
0
def ensure_files_present(original_file_dict, modified_file_dict):
    """
    Ensures that all files are available as keys in both dicts.

    :param original_file_dict: Dict of lists of file contents before  changes
    :param modified_file_dict: Dict of lists of file contents after changes
    :return:                   Return a dictionary of renamed files.
    """
    original_files = set(original_file_dict)
    modified_files = set(modified_file_dict)
    all_files = original_files | modified_files
    removed_files = all_files - modified_files
    renamed_files_dict = {}

    # Files present only after the change may be renames of files present
    # only before it.
    for added in (f for f in all_files if f not in original_files):
        for removed in removed_files:
            matcher = SequenceMatcher(
                None,
                ''.join(modified_file_dict[added]),
                ''.join(original_file_dict[removed]))
            # real_quick_ratio() is a cheap upper bound on ratio().
            if matcher.real_quick_ratio() >= 0.5 and matcher.ratio() > 0.5:
                renamed_files_dict[removed] = added
                break
        else:
            # No similar removed file: treat as a brand-new file.
            original_file_dict[added] = []

    # Removed files get an empty entry on the modified side.
    for missing in (f for f in all_files if f not in modified_files):
        modified_file_dict[missing] = []

    return renamed_files_dict
Esempio n. 24
0
def scan_company_names(name_list, name1, results=0, ro_thresold=None):
    """Scan a list of company names, searching for best matches against
    the given name.  Notice that this function takes a list of
    strings, and not a list of dictionaries."""
    # Default similarity threshold; the parameter keeps the original
    # misspelling ("thresold") for backward compatibility.  (Python 2
    # code: `unicode`, dict.has_key, list.sort without key.)
    if ro_thresold is not None: RO_THRESHOLD = ro_thresold
    else: RO_THRESHOLD = 0.6
    sm1 = SequenceMatcher()
    sm1.set_seq1(name1.lower())
    resd = {}
    # If the query has no trailing "[country]" tag, candidate names are
    # compared without theirs, with a small score penalty.
    withoutCountry = not name1.endswith(']')
    for i, n in name_list:
        # XXX: on Symbian, here we get a str; not sure this is the
        #      right place to fix it.
        if isinstance(n, str):
            n = unicode(n, 'latin1', 'ignore')
        o_name = n
        var = 0.0
        if withoutCountry and n.endswith(']'):
            cidx = n.rfind('[')
            if cidx != -1:
                n = n[:cidx].rstrip()
                var = -0.05
        # Distance with the company name.
        ratio = ratcliff(name1, n, sm1) + var
        if ratio >= RO_THRESHOLD:
            # Keep only the best-scoring entry per id.
            if resd.has_key(i):
                if ratio > resd[i][0]: resd[i] = (ratio,
                                            (i, analyze_company_name(o_name)))
            else:
                resd[i] = (ratio, (i, analyze_company_name(o_name)))
    res = resd.values()
    res.sort()
    res.reverse()
    # Best matches first; optionally truncate to `results` entries.
    if results > 0: res[:] = res[:results]
    return res
def WikiDocument(out, user_from, user_to, timestamp, subject, text):
    """Write one revision to `out` as a tab-separated record, emitting only
    the text added since the previous revision (tracked in the module-level
    `previous`).  (Python 2 code: `print >>` syntax.)"""
    global previous
    ###url = get_url(id, prefix)
    ###header = '<doc id="%s" url="%s" title="%s">\n' % (id, url, title)
    ##############text = clean(text)
    subject = clean(subject)
    header = '%s\t%s\t%s\t%s\t' % (user_to, user_from, timestamp, subject)
    header = header.encode('utf-8')

    text = clean(text)
    ### Diff the cleaned text against the previous revision.
    s = SequenceMatcher(None, previous, text)
    opcodes = s.get_opcodes()

    diff = []
    for i in opcodes:
        # Keep only the spans that are new (or changed) in `text`;
        # i[3]:i[4] is the opcode's range in the second sequence.
        if i[0] == 'insert' or i[0] == 'replace':
            j1 = i[3]
            j2 = i[4]
            diff.append(text[j1:j2])

    diff = "".join(diff)
    ###diff = clean(diff)
    ###
    out.reserve(len(header) + len(subject) + len(diff))
    print >> out, header,
    print >> out, diff.encode('utf-8')
    previous = text
Esempio n. 26
0
 def get_from_text(self, old, new):
     """
     Gets the differences between `old` text and `new` text and returns a changeset
     :param old: old Text object
     :param new: new text string
     :return: changeset dict (old_len/new_len/ops/char_bank) or None when
              the texts are identical.  (Python 2 code; debug prints left in.)
     """
     olds = str(old)
     if olds == new:
         return None
     print repr(olds), repr(new)
     sm = SequenceMatcher(None, olds, new)
     print "    CS LENS  ", len(olds), len(new)
     csd = dict(old_len=len(olds),
                 new_len=len(new),
                 ops="",
                 char_bank="")
     opcodes = [opcode_tup for opcode_tup in sm.get_opcodes()]
     last_op = 0
     print "    CS OPC 1", opcodes
     # Find the last non-"equal" opcode: trailing equal runs are not
     # encoded into the changeset.
     for i in range(0, len(opcodes)):
         if opcodes[i][0] != "equal":
             last_op = i
     print "    CS OPC 2", opcodes[:last_op+1]
     for opcode_tup in opcodes[:last_op+1]:
         op_code_match(*opcode_tup, changeset=csd, sm=sm, text=old)
     print "    CS CSD  ", csd
     return csd
Esempio n. 27
0
File: diff.py Progetto: fpt/webtoys
    def _compare_lines(self, la, lb):
        """Diff two lists of lines character-wise and return both sides as
        lists of HTML lines with differing runs wrapped in
        <em class="str-diff"> elements.

        :param la: lines for the left side
        :param lb: lines for the right side
        :return: tuple (left_html_lines, right_html_lines)
        """
        sa = '\n'.join(la)
        sb = '\n'.join(lb)
        ta_result = ''
        tb_result = ''

        str_diff_start = '<em class="str-diff">'
        str_diff_end = '</em>'

        s = SequenceMatcher(None, sa, sb)
        # NOTE(review): Counter here is a project-local cursor class (it has
        # progress()/slice_diff()/slice_match()/next()), NOT
        # collections.Counter -- confirm its module of origin.
        cnt_a = Counter()
        cnt_b = Counter()
        for block in s.get_matching_blocks():
            (a_idx, b_idx, nmatch) = block
            print("a[%d] and b[%d] match for %d elements" % block)
            cnt_a.progress(a_idx, nmatch)
            cnt_b.progress(b_idx, nmatch)

            # Split each side into the run that differs (since the previous
            # block) and the run that matches.
            diff_a = cnt_a.slice_diff(sa)
            same_a = cnt_a.slice_match(sa)
            diff_b = cnt_b.slice_diff(sb)
            same_b = cnt_b.slice_match(sb)

            if diff_a or diff_b:
                ta_result += self._enclose(str_diff_start, diff_a, str_diff_end, consider_newline = True)
            ta_result += same_a
            if diff_a or diff_b:
                tb_result += self._enclose(str_diff_start, diff_b, str_diff_end, consider_newline = True)
            tb_result += same_b

            cnt_a.next()
            cnt_b.next()
        return (ta_result.split('\n'), tb_result.split('\n'))
def sanitizedStrCheck(str1, str2, location):
    """Return True when str1 and str2 are "the same" text after one of
    several normalizations (printable-only filtering, whitespace
    stripping, NFKD-to-ASCII, space removal, URL percent-encoding), or
    when their similarity ratio exceeds 0.8 for non-URL strings.

    :param location: currently unused in the visible code.
    (Python 2 code: `unicode`, `urllib.quote_plus`.)
    """
    if (isinstance(str1, str) and isinstance(str2, str)) or (isinstance(str1, unicode) and isinstance(str2, unicode)):
        m = SequenceMatcher(None, str1, str2)
        rat = m.ratio()
        str1 = str1.replace("\n", " ")
        str2 = str2.replace("\n", " ")
        # Equal after dropping non-printable characters?
        if ''.join(filter(lambda c: c in string.printable, str1)) == ''.join(filter(lambda c: c in string.printable, str2)):
            return True
        elif str1.strip() == str2.strip():
            return True
        # Equal after Unicode NFKD normalization to ASCII?
        elif unicodedata.normalize('NFKD', str1).encode('ascii','ignore') == unicodedata.normalize('NFKD', str2).encode('ascii','ignore'):
            return True
        elif str1.replace(" ", "") == str2.replace(" ", ""):
            return True
        # Fuzzily equal (never applied to URLs, which must match exactly)?
        elif rat > .8 and not str1.__contains__('http://'):
            #print rat
            #print "SERVER:"
            #print repr(str1)
            #print "CLIENT:"
            #print repr(str2)
            return True
        # URLs: equal after percent-encoding normalization?
        elif str1.__contains__('http://') and str2.__contains__('http://') and urllib.quote_plus(str2.replace(" ", "%20")) == urllib.quote_plus(str1):
            #print str1
            #print str2
            #print urllib.quote_plus(str1)[:150]
            #print urllib.quote_plus(str2.replace(" ", "%20"))[:150]
            return True
    return False
Esempio n. 29
0
def partial_ratio(s1, s2):
    """Return the ratio of the most similar substring of the longer string
    to the shorter string, as an int between 0 and 100.

    :raises TypeError: if either argument is None.
    """
    if s1 is None: raise TypeError("s1 is None")
    if s2 is None: raise TypeError("s2 is None")

    if len(s1) <= len(s2):
        shorter = s1; longer = s2
    else:
        shorter = s2; longer = s1

    m = SequenceMatcher(None, shorter, longer)
    blocks = m.get_matching_blocks()

    # each block represents a sequence of matching characters in a string
    # of the form (idx_1, idx_2, len)
    # the best partial match will block align with at least one of those blocks
    #   e.g. shorter = "abcd", longer = XXXbcdeEEE
    #   block = (1,3,3)
    #   best score === ratio("abcd", "Xbcd")
    scores = []
    for block in blocks:
        # BUG FIX: block[1] - block[0] can be negative when the matching
        # region starts earlier in `longer` than in `shorter`; a negative
        # slice start wraps around and scores the wrong substring, so
        # clamp it at 0 (matching the canonical implementation).
        long_start = max(block[1] - block[0], 0)
        long_end = long_start + len(shorter)
        long_substr = longer[long_start:long_end]

        m2 = SequenceMatcher(None, shorter, long_substr)
        r = m2.ratio()
        if r > .995:
            return 100
        scores.append(r)

    return int(100 * max(scores))
Esempio n. 30
0
def checkNear(hash1, hash2):
    """Return True when the two hashes are at least 96% similar."""
    similarity = SequenceMatcher(None, hash1, hash2).ratio()
    return similarity >= 0.96
Esempio n. 31
0
def similar(a, b):
    """Return difflib's similarity ratio of *a* and *b* (0.0-1.0)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
Esempio n. 32
0
 def match(self, a, b):
     """Return the difflib similarity ratio of *a* and *b* (0.0-1.0)."""
     sm = SequenceMatcher(None, a, b)
     return sm.ratio()
Esempio n. 33
0
        # get negated part of the sentence
        with open(data_dir + 'ntree_tmp', 'w') as fw:
            fw.write(t)
        s = re.sub('\([A-Z]*\$? |\(-[A-Z]+- |\)|\)|\(, |\(. ', '', t)
        print('neg part: ' + s)

        # find what neg term is matched and use its neg type
        try:
            m = ''
            for neg in [
                    x for x in sorted(
                        neg_list['ITEM'].tolist(), key=len, reverse=True)
            ]:
                #for neg in ['negative for']:
                match = SequenceMatcher(None, s, neg).find_longest_match(
                    0, len(s), 0, len(neg))
                matched_string = s[match.a:match.a + match.size]
                try:  # if next char might be different, means partial match
                    if s[match.a + match.size + 1] == neg[match.b + match.size + 1] and \
                       s[match.a + match.size + 2] == neg[match.b + match.size + 2]:
                        if (len(matched_string) > len(m)) and \
                            ((matched_string[0] == s[0] and matched_string[1] == s[1]) or \
                             (matched_string[len(matched_string)-1] == s[len(s)-1] and matched_string[len(matched_string)-2] == s[len(s)-2])): # either match from the beginning or laast
                            m = matched_string
                            matched_neg_item = neg[match.b:match.b +
                                                   match.size]
                            if matched_neg_item[len(matched_neg_item) -
                                                1] == ' ':
                                matched_neg_item = matched_neg_item[
                                    0:len(matched_neg_item) - 1]
                    else:
Esempio n. 34
0
def compare_map(first_id, second_id):
    """Return how similar the two identifiers are, as a ratio in [0, 1]."""
    matcher = SequenceMatcher(None, first_id, second_id)
    return matcher.ratio()
Esempio n. 35
0
def strdistance(a, b):
    """Return the difflib similarity ratio of the two strings (0.0-1.0)."""
    sm = SequenceMatcher(None, a, b)
    return sm.ratio()
Esempio n. 36
0
 def check(message: discord.Message):
     """True when *message* loosely matches the expected answer and arrives
     in the right channel.

     NOTE(review): `answer` and `chat_channel` are captured from the
     enclosing scope — confirm against the surrounding code.
     """
     expected = str(answer).lower()
     received = str(message.content).lower()
     close_enough = SequenceMatcher(None, expected, received).ratio() > 0.5
     return close_enough and message.channel.id == chat_channel
    def test_diff(self):
        """Check SequenceMatcher opcodes on two token lists, then render the
        diff to images with pygame (skipped where no video device exists)."""
        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")

        # Two token sequences differing in their tail; exactly 4 opcodes expected.
        seq1 = "ab ab2 abc3 abcd abc4".split()
        seq2 = "ab ab2 abc3 abc4 abc adb".split()
        diff = SequenceMatcher(a=seq1, b=seq2)
        nb = 0
        for opcode in diff.get_opcodes():
            fLOG(opcode)
            nb += 1
        self.assertEqual(nb, 4)

        # Rendering parameters: row height, window size, background color.
        h = 20
        size = 500, 500
        white = 255, 255, 255

        if is_travis_or_appveyor() in ("travis",):
            # pygame.error: No available video device
            return
        import pygame
        if is_travis_or_appveyor() == "circleci":
            # os.environ["SDL_VIDEODRIVER"] = "x11"
            flags = pygame.NOFRAME
        else:
            flags = 0

        pygame, screen, fonts = get_pygame_screen_font(h, size, flags=flags)

        from ensae_teaching_cs.helpers.pygame_helper import wait_event

        # One random bar height per token of seq2, normalized to [0.02, 1.0].
        bars = [random.randint(10, 500) / 500.0 for s in seq2]
        screen.fill(white)
        build_diff_image(pygame, screen, h=h, maxw=size[1], seq1=seq1, seq2=seq2, diff=diff,
                         fonts=fonts, bars=bars)
        pygame.display.flip()
        temp = get_temp_folder(__file__, "temp_video_diff")

        # Render 21 animation frames (progress 0.0 -> 1.0) and save each as PNG.
        for i in range(0, 21):
            screen.fill(white)
            build_diff_image(pygame, screen, h=h, maxw=size[0], seq1=seq1, seq2=seq2, diff=diff,
                             fonts=fonts, bars=bars, progress=i / 20.0, prev_bars=None)
            pygame.time.wait(60)
            pygame.display.flip()
            pygame.image.save(screen, os.path.join(temp, "diff%d.png" % i))

        # Only when run as a script: assemble the frames into a video and wait.
        if __name__ == "__main__":

            from ensae_teaching_cs.helpers.video_helper import make_video
            png = [os.path.join(temp, _)
                   for _ in os.listdir(temp) if ".png" in _]
            out = os.path.join(temp, "diff.avi")
            make_video(png, out, size=(350, 250), format="XVID", fps=5)

            wait_event(pygame)

        for font in fonts.values():
            del font
        pygame.quit()
Esempio n. 38
0
def similar(str_a, str_b):
    """Return the similarity ratio of the two strings as a float in [0, 1]."""
    return SequenceMatcher(None, str_a, str_b).ratio()
def string_sim(sent_pairs):
    """Create a matrix where every row is a pair of sentences and every
    column is a feature; feature (column) order is not important to the
    algorithm.

    Features: symmetric NIST, symmetric BLEU, word error rate, longest
    common substring length, and Levenshtein distance.
    """
    feature_names = [
        "NIST", "BLEU", "Word Error Rate", "Longest common substring",
        "Levenshtein distance"
    ]
    rows = []
    for pair in sent_pairs:
        first = pair[0]
        second = pair[1]
        first_tokens = word_tokenize(first)
        second_tokens = word_tokenize(second)

        # Symmetric NIST; the metric can raise on degenerate input.
        try:
            nist = (nist_score.sentence_nist([second_tokens], first_tokens) +
                    nist_score.sentence_nist([first_tokens], second_tokens))
        except ZeroDivisionError:
            nist = 0

        # Symmetric BLEU.
        bleu = (bleu_score.sentence_bleu([first_tokens], second_tokens) +
                bleu_score.sentence_bleu([second_tokens], first_tokens))

        # Length of the longest common substring (character level).
        matcher = SequenceMatcher(None, first, second)
        lcs_len = matcher.find_longest_match(0, len(first), 0, len(second))[2]

        # Levenshtein edit distance (character level).
        lev = edit_distance(first, second)

        # Word error rate, symmetrized over both token sequences.
        token_dist = edit_distance(first_tokens, second_tokens)
        wer = token_dist / len(first_tokens) + token_dist / len(second_tokens)

        rows.append((nist, bleu, wer, lcs_len, lev))

    X = np.zeros((len(sent_pairs), len(feature_names)))
    for row_idx, row in enumerate(rows):
        X[row_idx, :] = row

    return X
Esempio n. 40
0
def similar_strings(string1, string2):
    """Return the similarity ratio of the two strings, rounded to 2 decimals."""
    ratio = SequenceMatcher(None, string1, string2).ratio()
    return round(ratio, 2)
Esempio n. 41
0
        SubStationDict[subName].append(BusNumber)
#print subName
"""
with open('subList.txt','w') as f:
	for name in list(subNameSet):
		f.write(name)
		f.write('\n')
"""

for planningBusName in NameMatchDictPlanning.keys():
    similarityDict = {}
    for CAPEsubName in list(subNameSet):
        planningBusNameCompact = getCompactString(planningBusName)
        CAPEsubNameCompact = getCompactString(CAPEsubName)
        similarity = SequenceMatcher(None, planningBusNameCompact,
                                     CAPEsubNameCompact).ratio()
        if similarity > similarityThreshold:
            similarityDict[CAPEsubName] = similarity

    similarityDictSorted = sorted(
        similarityDict, key=similarityDict.get, reverse=True
    )  # gets the dictionary keys in descending order of the values

    NameMatchDictPlanning[planningBusName] = similarityDictSorted

# output the sorted name match list
with open(NameMatchSorted, 'w') as f:
    for planningBusName in NameMatchDictPlanning.keys():
        string = planningBusName + ' -> '
        for CAPEsubName in NameMatchDictPlanning[planningBusName]:
            string += str(CAPEsubName)
Esempio n. 42
0
    loweredToLink = {
        key.lower(): value
        for (key, value) in current.links.items()
    }
    lowered = loweredToLink

    for bad_word in bad_words:
        lowered = list(filter(lambda x: not bad_word in x, lowered))

    while not guess in lowered:
        guess = input("> ").lower()
        if guess == 'help':
            print('\n'.join(lowered))
        else:
            candidates = list(
                map(lambda x: (x, SequenceMatcher(None, x, guess).ratio()),
                    lowered))
            candidates = list(filter(lambda x: x[1] > 0.8, candidates))
            candidates.sort(reverse=True, key=lambda x: x[1])
            if len(candidates) == 1:
                guess = candidates[0][0]
                print('-->', guess)
            else:
                print('\n'.join([c[0] for c in candidates]))

    current = loweredToLink[guess]
    counter += 1
    print('page', counter)
    printPageInfo(current)
Esempio n. 43
0
def get_similarity(a, b):
    """Return the difflib ratio of *a* and *b* (1.0 means identical)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
Esempio n. 44
0
 def _get_product_distance_to_query_str(product):
     """Inverse-similarity distance between the product name and the
     enclosing query_str (0 when there is no similarity at all)."""
     similarity = SequenceMatcher(None, product.name,
                                  query_str).quick_ratio()
     if not similarity:
         return 0
     return 1 / similarity
Esempio n. 45
0
 def longest_match_ratio(str1, str2):
     """Ratio of the longest matching block's length to the shorter
     string's length, treating spaces as junk characters."""
     matcher = SequenceMatcher(lambda ch: ch == " ", str1, str2)
     longest = matcher.find_longest_match(0, len(str1), 0, len(str2))
     shorter_len = min(len(str1), len(str2))
     return MathUtil.try_divide(longest.size, shorter_len)
Esempio n. 46
0
 def __init__(self, buis):
     """Keep a title-cased copy of *buis* and an empty SequenceMatcher
     for later comparisons."""
     self.seq = SequenceMatcher()
     self.buis = buis.title()
Esempio n. 47
0
 def __similar(self, a, b):
     """Return the difflib ratio of *a* and *b* (0.0 disjoint, 1.0 identical)."""
     matcher = SequenceMatcher(None, a, b)
     return matcher.ratio()
Esempio n. 48
0
 def compare_string(self, item1, item2):
     """
     Compare two strings and return their similarity ratio (0.0-1.0).
     """
     from difflib import SequenceMatcher
     sm = SequenceMatcher(None, item1, item2)
     return sm.ratio()
Esempio n. 49
0
 def match_sequence(cls, a, b):
     """Ratio between the space-joined forms of token sequences *a* and *b*."""
     joined_a = ' '.join(a)
     joined_b = ' '.join(b)
     return SequenceMatcher(a=joined_a, b=joined_b).ratio()
Esempio n. 50
0
 def longest_match_size(str1, str2):
     """Length of the longest matching block, treating spaces as junk."""
     matcher = SequenceMatcher(lambda ch: ch == " ", str1, str2)
     best = matcher.find_longest_match(0, len(str1), 0, len(str2))
     return best.size
Esempio n. 51
0
def ratio(a, b):
    """Return the similarity of *a* and *b* as an integer percentage 0-100."""
    score = SequenceMatcher(None, a, b).ratio()
    return int(round(100 * score))
Esempio n. 52
0
def similar(lhs, rhs):
    """Return the difflib similarity ratio of *lhs* and *rhs* (0.0-1.0)."""
    matcher = SequenceMatcher(None, lhs, rhs)
    return matcher.ratio()
Esempio n. 53
0
        help='Simplified intermediate delta (unstable)')
    group.add_argument('-c', '--compare', action='store_true',
        help='HTML comparison of tokenized diff to char diffs')
    data = parser.parse_args()

    lexer = pygments.lexers.get_lexer_by_name(data.lexername)

    a = data.file1.read()
    b = data.file2.read()

    data.unidiff = not data.verbose and not data.delta and not data.compare

    if data.verbose:
        lexa = list(pygments.lex(a, lexer))
        lexb = list(pygments.lex(b, lexer))
        sm = SequenceMatcher(None, lexa, lexb)
        for op, a1, a2, b1, b2 in sm.get_opcodes():
            if op == 'equal':
                for item in lexa[a1:a2]:
                    data.out.write("  %s: %s\n" % item)
            elif op == 'replace':
                data.out.write("~~~\n")
                for item in lexa[a1:a2]:
                    data.out.write("- %s: %s\n" % item)
                for item in lexb[b1:b2]:
                    data.out.write("+ %s: %s\n" % item)
                data.out.write("~~~\n")
            elif op == 'insert':
                for item in lexb[b1:b2]:
                    data.out.write("+ %s: %s\n" % item)
            elif op == 'delete':
Esempio n. 54
0
def get_font(name: FontType, size: int) -> '__font.Font':
    """
    Return a :py:class:`pygame.font.Font` object from a name or file.

    :param name: Font name or path
    :param size: Font size (px)
    :return: Font object
    :raises ValueError: If the name is empty, the size is not positive, or the
        requested system font is unknown (the error text suggests the most
        similar installed font plus a few random examples)
    :raises IOError: If the font file cannot be loaded
    """
    assert_font(name)
    assert isinstance(size, int)

    font: Optional['__font.Font']
    if isinstance(name, __font.Font):
        # Already a Font object; return it unchanged
        font = name
        return font

    else:
        name = str(name)

        if name == '':
            raise ValueError('font name cannot be empty')

        if size <= 0:
            raise ValueError('font size cannot be lower or equal than zero')

        # Font is not a file, then use a system font
        if not path.isfile(name):
            font_name = name
            name = __font.match_font(font_name)

            if name is None:  # Show system available fonts
                from difflib import SequenceMatcher
                from random import randrange
                system_fonts = __font.get_fonts()

                # Get the most similar example
                most_similar = 0
                most_similar_index = 0
                for i in range(len(system_fonts)):
                    # noinspection PyArgumentEqualDefault
                    sim = SequenceMatcher(None, system_fonts[i],
                                          font_name).ratio()  # Similarity
                    if sim > most_similar:
                        most_similar = sim
                        most_similar_index = i
                sys_font_sim = system_fonts[most_similar_index]
                sys_suggestion = 'system font "{0}" unknown, use "{1}" instead'.format(
                    font_name, sys_font_sim)
                sys_message = 'check system fonts with pygame.font.get_fonts() function'

                # Get examples
                examples_number = 3
                examples = []
                j = 0
                for i in range(len(system_fonts)):
                    font_random = system_fonts[randrange(0, len(system_fonts))]
                    if font_random not in examples:
                        examples.append(font_random)
                        j += 1
                    if j >= examples_number:
                        break
                examples.sort()
                fonts_random = ', '.join(examples)
                sys_message_2 = 'some examples: {0}'.format(fonts_random)

                # Raise the exception
                raise ValueError('{0}\n{1}\n{2}'.format(
                    sys_suggestion, sys_message, sys_message_2))

        # Try to load the font (cache hit returns the already-built object)
        font = None
        if (name, size) in _cache:
            return _cache[(name, size)]
        try:
            font = __font.Font(name, size)
        except IOError:
            pass

        # If font was not loaded throw an exception.
        # BUGFIX: report the requested font path (`name`), not the `font`
        # variable, which is always None on this branch.
        if font is None:
            raise IOError('font file "{0}" cannot be loaded'.format(name))
        _cache[(name, size)] = font
        return font
Esempio n. 55
0
 def similar(self, choice):
     """Return the first stored option whose similarity to *choice*
     exceeds 0.6, or False when none is close enough."""
     for candidate in self.options:
         score = SequenceMatcher(None, choice, candidate).ratio()
         if score > 0.6:
             return candidate
     return False
Esempio n. 56
0
class Deduplicator:
    """Groups near-duplicate headlines.

    Each incoming headline is compared (difflib ratio on its tokenized
    form, boosted by the number of named entities shared with a group's
    representative headline) against every existing group; it joins the
    best-scoring group at or above ``threshold``, otherwise it starts a
    new group of its own.
    """

    logger = util.get_logger("deduplicator.Deduplicator")
    # Minimum (boosted) similarity ratio required to join an existing group.
    threshold = 0.50
    # Score added per named entity shared with the group representative.
    boost = 0.10

    def __init__(self):
        # Reusable matcher; sequences are set per comparison via set_seqs().
        self.sm = SequenceMatcher()
        self.tokenizer = Tokenizer()
        self.ner = NER()
        self.headlines = dict()   # id -> original headline text
        self._headlines = dict()  # id -> tokenized/normalized headline
        self.parents = dict()     # id -> id of its group representative
        self.groups = dict()      # representative id -> list of member ids

    def accept(self, _id: str, headline: str) -> str:
        """Register *headline* under *_id* and return its group's id."""
        self.headlines[_id] = headline
        tokens = self.tokenizer.tokenize(headline)
        _headline = ' '.join(tokens)
        self._headlines[_id] = _headline

        # The very first headline starts its own group.
        if len(self.groups) == 0:
            self.logger.debug("[%s] %s - first item", _id, headline)
            self.parents[_id] = _id
            self.groups[_id] = []
            return _id

        matches = []
        a = _headline
        doc1 = self.ner.doc(headline)
        ents1 = util.lowercase(self.ner.entities(doc1))
        for group_id in self.groups:
            b = self._headlines[group_id]
            self.sm.set_seqs(a, b)
            ratio = self.sm.ratio()
            # Check if there are any named entities in common
            doc2 = self.ner.doc(self.headlines[group_id])
            ents2 = util.lowercase(self.ner.entities(doc2))
            ncommon = len(set(ents1) & set(ents2))
            boost = ncommon * self.boost
            ratio += boost
            self.logger.debug("[%s] %s <-> [%s] %s ==> %.2f (+%.2f)", _id, a,
                              group_id, b, ratio, boost)
            if ratio >= self.threshold:
                matches.append((ratio, group_id))

        if not matches:
            self.logger.debug("[%s] %s - no matches found", _id, headline)
            self.parents[_id] = _id
            self.groups[_id] = []
            return _id

        # Highest-scoring group wins: sort ascending by ratio, pop the last.
        matches.sort(key=lambda x: x[0])
        highest_ratio, group_id = matches.pop()
        b = self._headlines[group_id]
        self.logger.debug("[%s] %s <-> [%s] %s ==> %.2f was the high score",
                          _id, a, group_id, b, highest_ratio)
        self.parents[_id] = group_id
        self.groups[group_id].append(_id)
        return group_id

    def print_tree(self, original=True):
        """Print each group and its members; *original* selects the raw
        headline text over the tokenized form."""
        headlines = self.headlines if original else self._headlines
        print("")
        for group_id in self.groups:
            print("[%s] %s" % (group_id, headlines[group_id]))
            if self.groups[group_id]:
                print(" |")
            for _id in self.groups[group_id]:
                print(" |-- [%s] %s" % (_id, headlines[_id]))
            if self.groups[group_id]:
                print("")
        print("")

    def export(self):
        """Return the deduplicator's internal state as a plain dict."""
        return {
            'headlines': self.headlines,
            '_headlines': self._headlines,
            'parents': self.parents,
            'groups': self.groups
        }
Esempio n. 57
0
def cacl_similarRatio(a, b):
    """
    Return the similarity ratio between two words (0.0-1.0).

    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
Esempio n. 58
0
def get_winner(year):
    """Return a dict mapping each official award to its guessed winner for *year*.

    The result is cached in the module-level GG_RESULT dict. Person awards are
    guessed from capitalized-name bigrams in award-matched tweets; the remaining
    awards from frequent word bigrams, optionally cross-checked against IMDb.
    """
    make_year(year)
    global GG_RESULT
    # Return the cached answer when this year was already processed.
    if "winner" in GG_RESULT:
        return GG_RESULT["winner"]

    # Lowercased words that disqualify a capitalized bigram from being a name.
    stop_list_people = [
        'asian', 'series', 'the', 'best', '-', 'award', 'for', 'or', 'made',
        'in', 'a', 'by', 'performance', 'an', 'golden', 'globes', 'role',
        'motion', 'picture', 'best', 'supporting'
    ]
    #stop_list_people =['Motion Picture','Best Actor','Best Supporting']

    tweets_by_awards(year)

    winners = {}
    for award in OFFICIAL_AWARDS:
        winners[award] = []

    # print(tweet_award_dict)

    # "Firstname Lastname" pattern: two capitalized words.
    name_pattern = re.compile(r'[A-Z][a-z]+\s[A-Z][a-z]+')
    # Awards given to a person rather than a work.
    award_list_person = []
    for award in OFFICIAL_AWARDS:
        for person in ["actor", "actress", "demille", "director"]:
            if person in award:
                award_list_person.append(award)

    # Collect candidate names for each person award from its tweets.
    for award in award_list_person:
        for tweet in TWEET_BY_AWARD_DICT[award]:
            names = re.findall(name_pattern, tweet)
            for name in names:
                flag = False
                for name_item in name.lower().split():
                    if name_item in stop_list_people:
                        flag = True
                if flag == False:
                    winners[award] = winners[award] + [name]

    freq = {}
    for award in award_list_person:
        freq[award] = nltk.FreqDist(winners[award])

    # winner list for the rest
    award_list_not_person = []
    for award in OFFICIAL_AWARDS:
        if award not in award_list_person:
            award_list_not_person.append(award)

    for award in award_list_not_person:

        winner_stoplist = [
            'globes', 'at', 'and', 'Motion', 'Picture', 'Best', 'Supporting',
            '-', 'animated', 'best', 'comedy', 'drama', 'feature', 'film',
            'foreign', 'globe', 'goes', 'golden', 'motion', 'movie', 'musical',
            'or', 'original', 'picture', 'rt', 'series', 'song', 'television',
            'to', 'tv', 'movies'
        ]
        bigrams_list = []
        ignore_list = ["@", "#"]
        # NOTE(review): post_process is bound inside this loop but read by the
        # final loop below; if award_list_not_person were empty this would
        # raise NameError — confirm the award lists are never empty.
        post_process = ['wins', 'goldenglobes']

        for tweet in TWEET_BY_AWARD_DICT[award]:

            # Strip punctuation and skip retweets.
            tweet = re.sub(r'[^\w\s]', '', tweet)
            if tweet[0:2] == "RT":
                #print (tweet)
                continue

            bigram = nltk.bigrams(tweet.split())

            # Keep bigrams whose both words are outside the stoplist.
            temp = []
            for item in bigram:
                if item[0].lower() not in winner_stoplist and item[1].lower(
                ) not in winner_stoplist:
                    temp.append(item)

            # Drop bigrams starting with mentions or hashtags.
            for item in temp:
                if item[0][0] not in ignore_list and item[1][
                        0] not in ignore_list:
                    bigrams_list.append(item)


#         print(bigrams_list)

        freq[award] = nltk.FreqDist([' '.join(item) for item in bigrams_list])

    # Pick the most frequent candidate per award, with optional IMDb check.
    for award in OFFICIAL_AWARDS:
        #print(freq[award].most_common(1))
        temp_winner = freq[award].most_common(1)[0][0]
        imdb_flag = True
        for word in post_process:
            if word in temp_winner.lower().split():
                #print ('check')
                #print (temp_winner)
                temp_winner = temp_winner.lower().replace(word, '').strip()
                #print (temp_winner)
                #print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
                imdb_flag = False
                break
        # winners[award] = temp_winner.lower()
        if award in award_list_person:
            winners[award] = temp_winner.lower()
        else:
            if imdb_flag == False:
                winners[award] = temp_winner.lower()
            else:
                if award != 'best original song - motion picture':
                    # Look up the candidate on IMDb and prefer the title of a
                    # movie released the year before the ceremony.
                    movies = ia.search_movie(temp_winner)
                    ss = ''
                    #print (movies)
                    #print ('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
                    for item in movies:
                        try:
                            if item['year'] == int(year) - 1:
                                ss = item['title']
                                break
                        except KeyError:
                            continue
                    if ss == '':
                        winners[award] = temp_winner.lower()
                    else:
                        print(ss)
                        print(temp_winner)
                        print(SequenceMatcher(None, ss, temp_winner).ratio())
                        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
                        # Keep the tweet-derived title only when it closely
                        # matches the IMDb title; otherwise trust IMDb.
                        if SequenceMatcher(None, ss,
                                           temp_winner).ratio() > 0.85:
                            winners[award] = temp_winner.lower()
                        else:
                            winners[award] = ss.lower()
                else:
                    winners[award] = temp_winner.lower()
    GG_RESULT["winner"] = winners
    return winners
Esempio n. 59
0
def compare_output(s1, s2):
    """ Compare stdout strings s1 and s2.
        s1 is from readelf, s2 from elftools readelf.py
        Return pair success, errmsg. If comparison succeeds, success is True
        and errmsg is empty. Otherwise success is False and errmsg holds a
        description of the mismatch.

        Note: this function contains some rather horrible hacks to ignore
        differences which are not important for the verification of pyelftools.
        This is due to some intricacies of binutils's readelf which pyelftools
        doesn't currently implement, features that binutils doesn't support,
        or silly inconsistencies in the output of readelf, which I was reluctant
        to replicate. Read the documentation for more details.
    """
    def prepare_lines(s):
        """Lowercase, split into lines, and drop blank lines."""
        return [line for line in s.lower().splitlines() if line.strip() != '']

    def filter_readelf_lines(lines):
        """Drop the .eh_frame section dump and 'unknown: length' lines
        from readelf's output (features pyelftools does not replicate)."""
        filter_out = False
        for line in lines:
            if 'of the .eh_frame section' in line:
                filter_out = True
            elif 'of the .debug_frame section' in line or \
                'of the .zdebug_frame section' in line:
                filter_out = False
            if not filter_out:
                if not line.startswith('unknown: length'):
                    yield line

    lines1 = prepare_lines(s1)
    lines2 = prepare_lines(s2)

    lines1 = list(filter_readelf_lines(lines1))

    # Set once we pass the symbol table; enables the '@' version-suffix hack.
    flag_after_symtable = False

    if len(lines1) != len(lines2):
        return False, 'Number of lines different: %s vs %s' % (len(lines1),
                                                               len(lines2))

    for i in range(len(lines1)):
        if 'symbol table' in lines1[i]:
            flag_after_symtable = True

        # Compare ignoring whitespace
        lines1_parts = lines1[i].split()
        lines2_parts = lines2[i].split()

        if ''.join(lines1_parts) != ''.join(lines2_parts):
            ok = False

            try:
                # Ignore difference in precision of hex representation in the
                # last part (i.e. 008f3b vs 8f3b)
                if (''.join(lines1_parts[:-1]) == ''.join(lines2_parts[:-1])
                        and int(lines1_parts[-1], 16) == int(
                            lines2_parts[-1], 16)):
                    ok = True
            except ValueError:
                pass

            # Character-level diff used by the whitelisted mismatch checks below.
            sm = SequenceMatcher()
            sm.set_seqs(lines1[i], lines2[i])
            changes = sm.get_opcodes()
            if flag_after_symtable:
                # Detect readelf's adding @ with lib and version after
                # symbol name.
                if (len(changes) == 2 and changes[1][0] == 'delete'
                        and lines1[i][changes[1][1]] == '@'):
                    ok = True
            elif 'at_const_value' in lines1[i]:
                # On 32-bit machines, readelf doesn't correctly represent
                # some boundary LEB128 numbers
                val = lines2_parts[-1]
                num2 = int(val, 16 if val.startswith('0x') else 10)
                if num2 <= -2**31 and '32' in platform.architecture()[0]:
                    ok = True
            elif 'os/abi' in lines1[i]:
                # Naming difference for the same ABI value.
                if 'unix - gnu' in lines1[i] and 'unix - linux' in lines2[i]:
                    ok = True
            elif ('unknown at value' in lines1[i]
                  and 'dw_at_apple' in lines2[i]):
                # pyelftools knows Apple DWARF attributes readelf does not.
                ok = True
            else:
                # Section-flag spellings readelf and pyelftools render differently.
                for s in ('t (tls)', 'l (large)'):
                    if s in lines1[i] or s in lines2[i]:
                        ok = True
                        break
            if not ok:
                errmsg = 'Mismatch on line #%s:\n>>%s<<\n>>%s<<\n (%r)' % (
                    i, lines1[i], lines2[i], changes)
                return False, errmsg
    return True, ''
Esempio n. 60
0
def get_diff(job_id, v1, v2, n1=None, n2=None):
    """Return a JSON payload diffing two stored log versions of a job.

    The payload holds both versions split into lines plus the difflib
    opcodes describing how to turn the first into the second.
    """
    job = fetch(Job, id=job_id)
    lines_a = str_dict(job.logs[v1]).splitlines()
    lines_b = str_dict(job.logs[v2]).splitlines()
    ops = SequenceMatcher(None, lines_a, lines_b).get_opcodes()
    return jsonify({'first': lines_a, 'second': lines_b, 'opcodes': ops})