Example #1
def get_difference_between_two_pages(page1, page2):
    s = difflib.SequenceMatcher(None, page1, page2)
    return s.ratio()
Example #2
def cse(exprs, symbols=None, optimizations=None, postprocess=None):
    """ Perform common subexpression elimination on an expression.

    Parameters
    ==========

    exprs : list of sympy expressions, or a single sympy expression
        The expressions to reduce.
    symbols : infinite iterator yielding unique Symbols
        The symbols used to label the common subexpressions which are pulled
        out. The ``numbered_symbols`` generator is useful. The default is a stream
        of symbols of the form "x0", "x1", etc. This must be an infinite
        iterator.
    optimizations : list of (callable, callable) pairs, optional
        The (preprocessor, postprocessor) pairs. If not provided,
        ``sympy.simplify.cse.cse_optimizations`` is used.
    postprocess : a function which accepts the two return values of cse and returns
        the desired form of output from cse, e.g. if you want the replacements
        reversed the function might be ``lambda r, e: (list(reversed(r)), e)``

    Returns
    =======

    replacements : list of (Symbol, expression) pairs
        All of the common subexpressions that were replaced. Subexpressions
        earlier in this list might show up in subexpressions later in this list.
    reduced_exprs : list of sympy expressions
        The reduced expressions with all of the replacements above.
    """
    from sympy.matrices import Matrix
    from sympy.simplify.simplify import fraction

    if symbols is None:
        symbols = numbered_symbols()
    else:
        # In case we get passed an iterable with an __iter__ method instead of
        # an actual iterator.
        symbols = iter(symbols)
    seen_subexp = set()
    muls = set()
    adds = set()
    to_eliminate = []
    to_eliminate_ops_count = []

    if optimizations is None:
        # Pull out the default here just in case there are some weird
        # manipulations of the module-level list in some other thread.
        optimizations = list(cse_optimizations)

    # Handle the case if just one expression was passed.
    if isinstance(exprs, Basic):
        exprs = [exprs]

    # Preprocess the expressions to give us better optimization opportunities.
    reduced_exprs = [preprocess_for_cse(e, optimizations) for e in exprs]

    # Find all of the repeated subexpressions.
    def insert(subtree):
        '''This helper will insert the subtree into to_eliminate while
        maintaining the ordering by op count and will skip the insertion
        if subtree is already present.'''
        ops_count = (subtree.count_ops(), subtree.is_Mul
                     )  # prefer non-Mul to Mul
        index_to_insert = bisect.bisect(to_eliminate_ops_count, ops_count)
        # all i up to this index have op count <= the current op count
        # so check that subtree is not yet present from this index down
        # (if necessary) to zero.
        for i in xrange(index_to_insert - 1, -1, -1):
            if to_eliminate_ops_count[i] == ops_count and \
               subtree == to_eliminate[i]:
                return  # already have it
        to_eliminate_ops_count.insert(index_to_insert, ops_count)
        to_eliminate.insert(index_to_insert, subtree)

    for expr in reduced_exprs:
        if not isinstance(expr, Basic):
            continue
        pt = preorder_traversal(expr)
        for subtree in pt:

            inv = 1 / subtree if subtree.is_Pow else None

            if subtree.is_Atom or iterable(subtree) or inv and inv.is_Atom:
                # Exclude atoms, since there is no point in renaming them.
                continue

            if subtree in seen_subexp:
                if inv and _coeff_isneg(subtree.exp):
                    # save the form with positive exponent
                    subtree = inv
                insert(subtree)
                pt.skip()
                continue

            if inv and inv in seen_subexp:
                if _coeff_isneg(subtree.exp):
                    # save the form with positive exponent
                    subtree = inv
                insert(subtree)
                pt.skip()
                continue
            elif subtree.is_Mul:
                muls.add(subtree)
            elif subtree.is_Add:
                adds.add(subtree)

            seen_subexp.add(subtree)

    # process adds - any adds that weren't repeated might contain
    # subpatterns that are repeated, e.g. x+y+z and x+y have x+y in common
    adds = [set(a.args) for a in adds]
    for i in xrange(len(adds)):
        for j in xrange(i + 1, len(adds)):
            com = adds[i].intersection(adds[j])
            if len(com) > 1:
                insert(Add(*com))

                # remove this set of symbols so it doesn't appear again
                adds[i] = adds[i].difference(com)
                adds[j] = adds[j].difference(com)
                for k in xrange(j + 1, len(adds)):
                    if not com.difference(adds[k]):
                        adds[k] = adds[k].difference(com)

    # process muls - any muls that weren't repeated might contain
    # subpatterns that are repeated, e.g. x*y*z and x*y have x*y in common

    # use SequenceMatcher on the nc part to find the longest common expression
    # in common between the two nc parts
    sm = difflib.SequenceMatcher()

    muls = [a.args_cnc(cset=True) for a in muls]
    for i in xrange(len(muls)):
        if muls[i][1]:
            sm.set_seq1(muls[i][1])
        for j in xrange(i + 1, len(muls)):
            # the commutative part in common
            ccom = muls[i][0].intersection(muls[j][0])

            # the non-commutative part in common
            if muls[i][1] and muls[j][1]:
                # see if there is any chance of an nc match
                ncom = set(muls[i][1]).intersection(set(muls[j][1]))
                if len(ccom) + len(ncom) < 2:
                    continue

                # now work harder to find the match
                sm.set_seq2(muls[j][1])
                i1, _, n = sm.find_longest_match(0, len(muls[i][1]), 0,
                                                 len(muls[j][1]))
                ncom = muls[i][1][i1:i1 + n]
            else:
                ncom = []

            com = list(ccom) + ncom
            if len(com) < 2:
                continue

            insert(Mul(*com))

            # remove ccom from all if there was no ncom; to update the nc part
            # would require finding the subexpr and then replacing it with a
            # dummy to keep bounding nc symbols from being identified as a
            # subexpr, e.g. removing B*C from A*B*C*D might allow A*D to be
            # identified as a subexpr which would not be right.
            if not ncom:
                muls[i][0] = muls[i][0].difference(ccom)
                for k in xrange(j, len(muls)):
                    if not ccom.difference(muls[k][0]):
                        muls[k][0] = muls[k][0].difference(ccom)

    # Substitute symbols for all of the repeated subexpressions.
    replacements = []
    reduced_exprs = list(reduced_exprs)
    hit = True
    for i, subtree in enumerate(to_eliminate):
        if hit:
            sym = symbols.next()
        hit = False
        if subtree.is_Pow and subtree.exp.is_Rational:
            update = lambda x: x.xreplace({subtree: sym, 1 / subtree: 1 / sym})
        else:
            update = lambda x: x.subs(subtree, sym)
        # Make the substitution in all of the target expressions.
        for j, expr in enumerate(reduced_exprs):
            old = reduced_exprs[j]
            reduced_exprs[j] = update(expr)
            hit = hit or (old != reduced_exprs[j])
        # Make the substitution in all of the subsequent substitutions.
        for j in range(i + 1, len(to_eliminate)):
            old = to_eliminate[j]
            to_eliminate[j] = update(to_eliminate[j])
            hit = hit or (old != to_eliminate[j])
        if hit:
            replacements.append((sym, subtree))

    # Postprocess the expressions to return the expressions to canonical form.
    for i, (sym, subtree) in enumerate(replacements):
        subtree = postprocess_for_cse(subtree, optimizations)
        replacements[i] = (sym, subtree)
    reduced_exprs = [
        postprocess_for_cse(e, optimizations) for e in reduced_exprs
    ]

    if isinstance(exprs, Matrix):
        reduced_exprs = [Matrix(exprs.rows, exprs.cols, reduced_exprs)]
    if postprocess is None:
        return replacements, reduced_exprs
    return postprocess(replacements, reduced_exprs)
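A minimal usage sketch of cse (assumes a working sympy installation; x0 is the first symbol yielded by the default numbered_symbols stream):

from sympy import symbols, sqrt, cse

x, y = symbols('x y')
replacements, reduced = cse([sqrt(x + y) + (x + y)**2, (x + y)**3])
# replacements is roughly [(x0, x + y)]
# reduced is roughly [sqrt(x0) + x0**2, x0**3]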
Example #3
    try:
        wiki = parse_wiki(text)
    except Exception as e:
        print()
        print('PARSE ERROR')
        print(text)
        print()
        wiki = ''
    red = reduce_wiki(wiki)
    return red.split()

# set up files
fin = open(args.source, encoding='utf-8')
fout = open(args.output, 'w', encoding='utf-8')

# create differ
sm = difflib.SequenceMatcher()

# this parser is bad and wrong
in_art = None
n_art = 0
text = None
for (i, line) in enumerate(fin):
    if i % 1000000 == 0:
        print(i)

    ret = re.match('( *)<([^>]*?)>', line)
    if ret:
        (ind, tag) = ret.groups()
        ind = len(ind)
        body = line[ret.end():]
        ret = re.match('([^<]*?)</[^>]*?>', body)
Example #4
def similar(seq1, seq2):
    return difflib.SequenceMatcher(a=seq1.lower(), b=seq2.lower()).ratio() > 0.9
Example #5
def compare_samples(base_dir,
                    trg_dir,
                    trg_to_base_name=lambda x: x,
                    opts=None):
    """Report on differences between samples in base and target directories.
  The trg_to_base_name fn takes a target file name and returns the source
  file name to use in the comparisons."""

    if not os.path.isdir(base_dir):
        print('Original sample dir \'%s\' does not exist' % base_dir)
        return
    if not os.path.isdir(trg_dir):
        print('New sample dir \'%s\' does not exist' % trg_dir)
        return

    print('Base (current) dir: %s' % base_dir)
    print('Target (new) dir: %s' % trg_dir)
    print('[a/b] means "a" in base is replaced with "b" in target')

    show_missing = opts and 'missing' in opts
    show_diffs = opts and 'diffs' in opts

    for trg_name in os.listdir(trg_dir):
        if trg_name == 'attributions.txt':
            continue

        trg_path = os.path.join(trg_dir, trg_name)
        if not (os.path.isfile(trg_path) and trg_name.endswith('.txt')):
            continue

        base_name = trg_to_base_name(trg_name)
        base_path = os.path.join(base_dir, base_name)
        if not os.path.exists(base_path):
            if show_missing:
                print('base does not exist: %s' % base_name)
            continue

        base_text = None
        trg_text = None
        with codecs.open(base_path, 'r', 'utf8') as f:
            base_text = f.read()
        with codecs.open(trg_path, 'r', 'utf8') as f:
            trg_text = f.read()
        if not base_text:
            print('base text (%s) is empty' % base_name)
            continue
        if not trg_text:
            print('target text is empty: %s' % trg_path)
            continue
        if base_text.find(trg_text) == -1:
            print('target (%s) text not in base (%s)' % (base_name, trg_name))
            if show_diffs:
                # In scripts that use space for word break it might be better to compare
                # word by word, but this suffices.
                sm = difflib.SequenceMatcher(None,
                                             base_text,
                                             trg_text,
                                             autojunk=False)
                lines = []
                for tag, i1, i2, j1, j2 in sm.get_opcodes():
                    if tag == 'delete':
                        lines.append('[%s/]' % base_text[i1:i2])
                    elif tag == 'equal':
                        lines.append(base_text[i1:i2])
                    elif tag == 'insert':
                        lines.append('[/%s]' % trg_text[j1:j2])
                    else:
                        lines.append('[%s/%s]' %
                                     (base_text[i1:i2], trg_text[j1:j2]))
                print(''.join(lines))
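A self-contained sketch of the opcode-to-marker scheme used above, with hypothetical strings (the real function reads its texts from files):

import difflib

def inline_diff(base_text, trg_text):
    # Render '[a/b]' where "a" in base is replaced with "b" in target,
    # '[a/]' for deletions and '[/b]' for insertions.
    sm = difflib.SequenceMatcher(None, base_text, trg_text, autojunk=False)
    parts = []
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == 'equal':
            parts.append(base_text[i1:i2])
        elif tag == 'delete':
            parts.append('[%s/]' % base_text[i1:i2])
        elif tag == 'insert':
            parts.append('[/%s]' % trg_text[j1:j2])
        else:  # 'replace'
            parts.append('[%s/%s]' % (base_text[i1:i2], trg_text[j1:j2]))
    return ''.join(parts)

# inline_diff('color maps', 'colour map') -> 'colo[/u]r map[s/]'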
Example #6
    def create(cls, full_byline, story, initials=''):
        """
        Creates new user or tries to find existing name in db
        args:
            full_byline: string of byline and creditline
            story: Story object (must be saved)
            initials: string
        returns:
            Byline object
        """
        byline_pattern = re.compile(
            # single word credit with colon. Person's name, Person's job title
            # or similar description.
            # Example:
            # text: Jane Doe, Just a regular person
            r'^(?P<credit>[^:]+): (?P<full_name>[^,]+)\s*(, (?P<title>.+))?$',
            flags=re.UNICODE,
        )

        match = byline_pattern.match(full_byline)
        full_name = None
        try:
            d = match.groupdict()
            full_name = d['full_name'].title()
            title = d['title'] or ''
            credit = d['credit'].lower()
            initials = ''.join(
                letters[0] for letters in full_name.replace('-', ' ').split()
            )
            assert initials == initials.upper(), 'All names should be capitalised'
            assert len(initials) <= 5, 'Five names probably means something is wrong.'
            if len(initials) == 1:
                initials = full_name.upper()

        except (
            AssertionError,
            AttributeError,
        ) as e:
            # Malformed byline
            p_org = w_org = ' -- '
            if story.legacy_prodsys_source:
                dump = story.legacy_prodsys_source
                tekst = json.loads(dump)[0]['fields']['tekst']
                p_org = needle_in_haystack(full_byline, tekst)
            if story.legacy_html_source:
                dump = story.legacy_html_source
                w_org = json.loads(dump)[0]['fields']['byline']

            warning = ((
                'Malformed byline: "{byline}" error: {error} id: {id}'
                ' p_id: {p_id}\n{p_org} | {w_org} '
            ).format(
                id=story.id,
                p_id=story.prodsak_id,
                # story=story,
                byline=full_byline,
                error=e,
                p_org=p_org,
                w_org=w_org,
            ))
            logger.warn(warning)
            story.comment += warning
            story.publication_status = story.STATUS_ERROR

            full_name = 'Nomen Nescio'
            title = full_byline
            initials = 'XX'
            credit = '???'

        for choice in cls.CREDIT_CHOICES:
            # Find correct credit.
            ratio = difflib.SequenceMatcher(
                None,
                choice[0],
                credit[:],
            ).ratio()
            if .4 < ratio < .8:
                logger.debug(choice[0], credit, ratio)
            if ratio > .8:
                credit = choice[0]
                break
        else:
            credit = cls.DEFAULT_CREDIT

        try:
            contributor, __ = Contributor.get_or_create(full_name, initials)
        except ValueError:  # multiple contributors found
            # TODO: reimplement this shit
            return False

        new_byline = cls(
            story=story,
            credit=credit,
            title=title[:200],
            contributor=contributor,
        )
        new_byline.save()
Example #7
def WordDiff(line1, line2, diff_params):
    """Returns blocks with positions indiciating word level diffs.

  Args:
    line1: string representing the left part of the diff
    line2: string representing the right part of the diff
    diff_params: return value of GetDiffParams

  Returns:
    A tuple (blocks, ratio) where:
      blocks: [(offset1, offset2, size), ...] such that
              line1[offset1:offset1+size] == line2[offset2:offset2+size]
              and the last block is always (len(line1), len(line2), 0)
      ratio: a float giving the diff ratio computed by SequenceMatcher.
  """
    match_expr, min_match_ratio, min_match_size, _ = diff_params
    exp = EXPRS[match_expr]
    # Strings may have been left undecoded up to now. Assume UTF-8.
    line1 = TryDecode(line1)
    line2 = TryDecode(line2)

    a = re.findall(exp, line1, re.U)
    b = re.findall(exp, line2, re.U)
    s = difflib.SequenceMatcher(None, a, b)
    matching_blocks = s.get_matching_blocks()
    ratio = s.ratio()
    # Don't show intra region diffs if both lines are too different and there is
    # more than one block of difference. If there is only one change then we
    # still show the intra region diff regardless of how different the blocks
    # are.
    # Note: We compare len(matching_blocks) with 3 because one block of change
    # results in 2 matching blocks. We add the one special block and we get 3
    # matching blocks per one block of change.
    if ratio < min_match_ratio and len(matching_blocks) > 3:
        return ([(0, 0, 0)], ratio)
    # For now convert to character level blocks because we already have
    # the code to deal with folding across lines for character blocks.
    # Create arrays lena and lenb which have cumulative word lengths
    # corresponding to word positions in a and b
    lena = []
    last = 0
    for w in a:
        lena.append(last)
        last += len(w)
    lenb = []
    last = 0
    for w in b:
        lenb.append(last)
        last += len(w)
    lena.append(len(line1))
    lenb.append(len(line2))
    # Convert to character blocks
    blocks = []
    for s1, s2, blen in matching_blocks[:-1]:
        apos = lena[s1]
        bpos = lenb[s2]
        block_len = lena[s1 + blen] - apos
        blocks.append((apos, bpos, block_len))
    # Recreate the special block.
    blocks.append((len(line1), len(line2), 0))
    # Filter any matching blocks which are smaller than the desired threshold.
    # We don't remove matching blocks with only a newline character as doing so
    # results in showing the matching newline character as non matching which
    # doesn't look good.
    blocks = FilterBlocks(
        blocks, lambda b:
        (b[2] >= min_match_size or line1[b[0]:b[0] + b[2]] == '\n'))
    return (blocks, ratio)
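A small illustration of the word-level matching that WordDiff builds on; the regular expression below is an assumption standing in for EXPRS[match_expr], not the module's actual pattern:

import difflib
import re

line1 = 'the quick brown fox'
line2 = 'the quick red fox'
a = re.findall(r'\w+|\W+', line1)  # words and separator runs become tokens
b = re.findall(r'\w+|\W+', line2)
s = difflib.SequenceMatcher(None, a, b)
print(s.get_matching_blocks())  # word-index blocks; the last is the (len(a), len(b), 0) sentinel
print(round(s.ratio(), 2))      # similarity over tokens, not characters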
Example #8
def get_equal_rate_1(str1, str2):
    return difflib.SequenceMatcher(None, str1, str2).quick_ratio()
Example #9
def compare_str(str1, str2):
    """ Return similarity (0.0 to 1.0) between the two strings """
    return difflib.SequenceMatcher(None, str1, str2).ratio()
precision_counter = 0.0
predictions_counter = 0.0
recall_counter = 0.0
correct_count = 0.0

for line_a in fileinput.input():
	file_name = os.path.splitext(fileinput.filename())[0]
	
original = open(file_name + " (2).a2", "r")

for line_origin in original:
	if re.search(r'OntoBiotope',line_origin):
		continue
	stripped_line_origin = line_origin.replace(re.search(r'(T\d*\s*)', line_origin).group(1),'')
	recall_counter += 1
	prediction = open(file_name + ".a2", "r")
	
	for line_pred in prediction:
		stripped_line_pred = line_pred.replace(re.search(r'(T\d*\s*)', line_pred).group(1),'')
		predictions_counter += 1

		if difflib.SequenceMatcher(None, stripped_line_origin, stripped_line_pred).ratio() >= 0.6:
			correct_count += 1

precision_counter = predictions_counter / recall_counter
recall = correct_count / recall_counter
print(recall)
precision = correct_count / precision_counter
print(precision)
f_one = 2*((precision*recall)/(precision+recall))
			
keeping_score = open("Scoretable.txt", "a")
keeping_score.write(file_name + "\t\t" + str(correct_count) + "\t\t" + str(round(recall,3)) + "\t\t" + str(round(precision,3)) + "\t\t" + str(round(f_one,3)) + "\n")
def messenger_export_processing(file_name, my_name, special_removal=['']):
    list_of_lists=input_into_list(file_name)
    list_of_lists=[x for x in list_of_lists if not x in special_removal]
    date_format=determine_right_dateformat(list_of_lists)
    #try go find the first line with the date on
    print ("date_format for the file is ",date_format)
    date_flag=False
    for i in range(0, int(len(list_of_lists)/2)):
        try:                
            date_object=datetime.strptime(list_of_lists[i], date_format)
            date_flag=True
            break
        except:
            pass  # not a date line; keep scanning
            
    if date_flag==True and i < int(len(list_of_lists)/2)-1:
        list_of_lists=list_of_lists[i:len(list_of_lists)]
    else:
        print ("something is wrong with the file")

    #first find the partner name
    ctr = collections.Counter(list_of_lists[0:min(2000, len(list_of_lists))])
    names=ctr.most_common(2)
    potential_names=[x[0] for x in names]
    scoring_similarity=[difflib.SequenceMatcher(None,my_name,x).ratio() for x in potential_names]
    outgoing_name=potential_names[scoring_similarity.index(max(scoring_similarity))]
    incoming_name=[x for x in potential_names if not x==outgoing_name][0]
    
    cleaned_list_of_list=[]
    for i in range(0, len(list_of_lists)):
        check_item=list_of_lists[i]
        if check_item==outgoing_name:
            cleaned_list_of_list.append("Outgoing")
        elif check_item==incoming_name:
            cleaned_list_of_list.append("Incoming")
        else:
            try:
                datetime_object=datetime.strptime(check_item, date_format)
                cleaned_list_of_list.append(datetime_object)
            except:
                cleaned_list_of_list.append(check_item)
    
    i=0
    pd_conv=pd.DataFrame(columns=['Message Date','Type','Text'])
    print ("cleaning messanger data")
    while i < len(cleaned_list_of_list):
        x=cleaned_list_of_list[i]
        if isinstance(x, datetime):
            inserted_date=x.strftime('%Y-%m-%d %H:%M:%S')
            i=i+1
            if i<len(cleaned_list_of_list):
                message_type=cleaned_list_of_list[i]
                i=i+1
                text=''
                while i<len(cleaned_list_of_list) and isinstance(cleaned_list_of_list[i], datetime)==False:
                    text=text+cleaned_list_of_list[i]
                    i=i+1
                pd_conv=pd_conv.append({'Message Date':inserted_date, 'Type': message_type, 'Text': text}, ignore_index=True)
        else:
            i=i+1
            #print ("first line: ",cleaned_list_of_list[i], " is not a date")
           
    pd_conv=pd_conv.sort_values(by=['Message Date'])
    pd_conv=pd_conv.reset_index(drop=True)
    file_name=file_name.replace(".txt", ".csv")
    pd_conv.to_csv(file_name)
    return pd_conv
Example #12
def string_similar(s1, s2):
    return difflib.SequenceMatcher(None, s1, s2).quick_ratio()
Example #13
def _getsimilar(symbols, value):
    sim = lambda x: difflib.SequenceMatcher(None, value, x).ratio()
    # The cutoff for similarity here is pretty arbitrary. It should
    # probably be investigated and tweaked.
    return [s for s in symbols if sim(s) > 0.6]
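For comparison, the standard library's difflib.get_close_matches applies the same kind of ratio cutoff, but also sorts the hits and caps their number:

import difflib

difflib.get_close_matches('pint', ['print', 'point', 'paint', 'split'], n=3, cutoff=0.6)
# -> ['print', 'point', 'paint']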
Example #14
 def MatchedWithBenchmarkInputNameScore(benchmark_class):
   return difflib.SequenceMatcher(
       isjunk=None,
       a=benchmark_class.Name(), b=input_benchmark_name).ratio()
Example #15
def StringSimilarity(a, b):
  return difflib.SequenceMatcher(a=a.lower(), b=b.lower()).ratio()
 def compare(self, word1, word2):
     return difflib.SequenceMatcher(None, word1, word2).ratio()
    def forward(
        self,  # type: ignore
        utterance: Dict[str, torch.LongTensor],
        world: List[AtisWorld],
        actions: List[List[ProductionRule]],
        linking_scores: torch.Tensor,
        target_action_sequence: torch.LongTensor = None,
        sql_queries: List[List[str]] = None,
    ) -> Dict[str, torch.Tensor]:
        """
        We set up the initial state for the decoder, and pass that state off to either a DecoderTrainer,
        if we're training, or a BeamSearch for inference, if we're not.

        Parameters
        ----------
        utterance : Dict[str, torch.LongTensor]
            The output of ``TextField.as_array()`` applied on the utterance ``TextField``. This will
            be passed through a ``TextFieldEmbedder`` and then through an encoder.
        world : ``List[AtisWorld]``
            We use a ``MetadataField`` to get the ``World`` for each input instance.  Because of
            how ``MetadataField`` works, this gets passed to us as a ``List[AtisWorld]``.
        actions : ``List[List[ProductionRule]]``
            A list of all possible actions for each ``World`` in the batch, indexed into a
            ``ProductionRule`` using a ``ProductionRuleField``.  We will embed all of these
            and use the embeddings to determine which action to take at each timestep in the
            decoder.
        linking_scores: ``torch.Tensor``
            A matrix linking the utterance tokens and the entities. This is a deterministically
            generated binary matrix in which each entry indicates whether a token generated an entity.
            This tensor has shape ``(batch_size, num_entities, num_utterance_tokens)``.
        target_action_sequence : torch.Tensor, optional (default=None)
            The gold action sequence, where each action is an index into the list
            of possible actions.  This tensor has shape ``(batch_size, sequence_length, 1)``. We remove the
            trailing dimension.
        sql_queries : List[List[str]], optional (default=None)
            A list of the SQL queries that are given during training or validation.
        """
        initial_state = self._get_initial_state(utterance, world, actions,
                                                linking_scores)
        batch_size = linking_scores.shape[0]
        if target_action_sequence is not None:
            # Remove the trailing dimension (from ListField[ListField[IndexField]]).
            target_action_sequence = target_action_sequence.squeeze(-1)
            target_mask = target_action_sequence != self._action_padding_index
        else:
            target_mask = None

        if self.training:
            # target_action_sequence is of shape (batch_size, 1, sequence_length) here after we unsqueeze it for
            # the MML trainer.
            return self._decoder_trainer.decode(
                initial_state,
                self._transition_function,
                (target_action_sequence.unsqueeze(1),
                 target_mask.unsqueeze(1)),
            )
        else:
            # TODO(kevin) Move some of this functionality to a separate method for computing validation outputs.
            action_mapping = {}
            for batch_index, batch_actions in enumerate(actions):
                for action_index, action in enumerate(batch_actions):
                    action_mapping[(batch_index, action_index)] = action[0]
            outputs: Dict[str, Any] = {"action_mapping": action_mapping}
            outputs["linking_scores"] = linking_scores
            if target_action_sequence is not None:
                outputs["loss"] = self._decoder_trainer.decode(
                    initial_state,
                    self._transition_function,
                    (target_action_sequence.unsqueeze(1),
                     target_mask.unsqueeze(1)),
                )["loss"]
            num_steps = self._max_decoding_steps
            # This tells the state to start keeping track of debug info, which we'll pass along in
            # our output dictionary.
            initial_state.debug_info = [[] for _ in range(batch_size)]
            best_final_states = self._beam_search.search(
                num_steps,
                initial_state,
                self._transition_function,
                keep_final_unfinished_states=False,
            )
            outputs["best_action_sequence"] = []
            outputs["debug_info"] = []
            outputs["entities"] = []
            outputs["predicted_sql_query"] = []
            outputs["sql_queries"] = []
            outputs["utterance"] = []
            outputs["tokenized_utterance"] = []

            for i in range(batch_size):
                # Decoding may not have terminated with any completed valid SQL queries, if `num_steps`
                # isn't long enough (or if the model is not trained enough and gets into an
                # infinite action loop).
                if i not in best_final_states:
                    self._exact_match(0)
                    self._denotation_accuracy(0)
                    self._valid_sql_query(0)
                    self._action_similarity(0)
                    outputs["predicted_sql_query"].append("")
                    continue

                best_action_indices = best_final_states[i][0].action_history[0]

                action_strings = [
                    action_mapping[(i, action_index)]
                    for action_index in best_action_indices
                ]
                predicted_sql_query = action_sequence_to_sql(action_strings)

                if target_action_sequence is not None:
                    # Use a Tensor, not a Variable, to avoid a memory leak.
                    targets = target_action_sequence[i].data
                    sequence_in_targets = self._action_history_match(
                        best_action_indices, targets)
                    self._exact_match(sequence_in_targets)

                    similarity = difflib.SequenceMatcher(
                        None, best_action_indices, targets)
                    self._action_similarity(similarity.ratio())

                if sql_queries and sql_queries[i]:
                    denotation_correct = self._executor.evaluate_sql_query(
                        predicted_sql_query, sql_queries[i])
                    self._denotation_accuracy(denotation_correct)
                    outputs["sql_queries"].append(sql_queries[i])

                outputs["utterance"].append(world[i].utterances[-1])
                outputs["tokenized_utterance"].append([
                    token.text for token in world[i].tokenized_utterances[-1]
                ])
                outputs["entities"].append(world[i].entities)
                outputs["best_action_sequence"].append(action_strings)
                outputs["predicted_sql_query"].append(
                    sqlparse.format(predicted_sql_query, reindent=True))
                outputs["debug_info"].append(
                    best_final_states[i][0].debug_info[0])  # type: ignore
            return outputs
Example #18
def similarity(L1, L2):
    matcher = difflib.SequenceMatcher(None, L1, L2)
    return matcher.ratio()
Example #19
# print(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)) #[0:1] get the first row of the sparse matrix

container = cosine_similarity(
    tfidf_matrix[0:1],
    tfidf_matrix)  # compare the first document with every document
print(len(container[0]))

similarityScore = []
for each in container[0]:
    similarityScore.append(each)

# print(similarityScore.sort())
# print(similarityScore)
print('program ended successfully')

weighted_results = []
for result in filteredPosts:
    ratio = difflib.SequenceMatcher(None, result, textToCheck[0]).ratio()
    weighted_results.append((result, ratio))

# print(weighted_results)
# print(sorted(weighted_results, key=lambda x: x[1]))
# print(sorted(weighted_results))
# print('last data is ',weighted_results[0])

# filter data whose similarity score is above 40%

for each in weighted_results:
    if ((each[1] * 100) > 40):
        print(each)
import difflib
import sys

# according to the link below, 'difflib' and 'Levenshtein' give roughly the same results
# https://stackoverflow.com/questions/6690739

word1 = sys.argv[1]
word2 = sys.argv[2]

score = difflib.SequenceMatcher(None, word1, word2).ratio()

print(f"{word1}	{word2}	{score}")

Example #21
def diff(fseq1, fseq2):
    """Use difflib to compute differences of two
    sequences of strings."""
    differ = difflib.SequenceMatcher(a=fseq1, b=fseq2, autojunk=False)
    return differ.get_opcodes()
Example #22
def getSimilarity(w1, w2):
    seq = difflib.SequenceMatcher(None, w1, w2)
    d = seq.ratio() * 100
    return d
Example #23
def rmsdRef():
    lines = []
    with open('v92.finalResult', 'r') as f:
        lines = f.readlines()[1:]
    seeds = []
    energies = []
    nativeRMSDs = []
    for line in lines:
        terms = line.split()
        seeds.append(int(terms[0]))
        energies.append(float(terms[8]))
        nativeRMSDs.append(terms[9])
    energies, seeds, nativeRMSDs = (list(t) for t in
                                    zip(*sorted(zip(energies, seeds, nativeRMSDs))))

    cwd = os.getcwd()

    pattern = None
    with open('v92.con', 'r') as f:
        conLines = f.readlines()
    for line in conLines:
        if 'subjob_control' in line:
            terms = line.split()
            pattern = terms[2]

    structs = []
    for i in range(len(seeds)):
        for dir in os.listdir(os.path.join(cwd, 'subJobs')):
            if dir.split('_')[0] == str(seeds[i]):
                os.chdir(os.path.join(cwd, 'subJobs', dir))
                if 'plop.stdout' in os.listdir('.'):
                    stName = '4KUZ-p' + str(
                        pattern) + '-' + nativeRMSDs[i] + '_template.maegz'
                    structs.append(next(structure.StructureReader(stName)))
                os.chdir(cwd)
    minStruct = copy.deepcopy(structs[0])

    ALLINDICES = analyze.evaluate_asl(minStruct, ALLINDICES_asl)
    LOOPENVINDICES = analyze.evaluate_asl(minStruct, LOOPENVINDICES_asl)
    NONLOOPINDICES = analyze.evaluate_asl(minStruct, NONLOOPINDICES_asl)

    rmsds = []
    for i in range(0, len(structs)):
        curStruct = structs[i]
        rmsd.superimpose(minStruct, NONLOOPINDICES, curStruct, NONLOOPINDICES)
        RMSD = rmsd.calculate_in_place_rmsd(minStruct, LOOPENVINDICES,
                                            curStruct, LOOPENVINDICES)
        rmsds.append(RMSD)

    # What about Hbond patterns?
    hbonds = []
    for i in range(0, len(structs)):
        curStruct = structs[i]
        hbonds.append(hbond.get_hydrogen_bonds(curStruct, LOOPENVINDICES))

    hbondIndices = []
    for i in range(0, len(hbonds)):
        structIndices = []
        hbondIndices.append(structIndices)
        for j in range(0, len(hbonds[i])):
            pairIndices = []
            hbondIndices[i].append(pairIndices)
            for k in range(0, 2):
                hbondIndices[i][j].append(hbonds[i][j][k].index)

    min_hb_indices = copy.deepcopy(hbondIndices[0])
    hbond_overlaps = []
    for i in range(0, len(hbondIndices)):
        li1 = [tuple(lst) for lst in min_hb_indices]
        li2 = [tuple(lst) for lst in hbondIndices[i]]

        overlap = []
        for pair in li1:
            if pair in li2:
                overlap.append(pair)
        sm = difflib.SequenceMatcher(None, li1, li2)
        hbond_overlaps.append(round(sm.ratio(), 5))

    # What about salt bridge interactions?
    bridges = []
    for i in range(0, len(structs)):
        curStruct = structs[i]
        bridges.append(salt_bridge.get_salt_bridges(curStruct, LOOPENVINDICES))

    bridgeIndices = []
    for i in range(0, len(bridges)):
        structIndices = []
        bridgeIndices.append(structIndices)
        for j in range(0, len(bridges[i])):
            pairIndices = []
            bridgeIndices[i].append(pairIndices)
            for k in range(0, 2):
                bridgeIndices[i][j].append(bridges[i][j][k].index)

    min_bridge_indices = copy.deepcopy(bridgeIndices[0])
    salt_bridge_overlaps = []
    for i in range(0, len(bridgeIndices)):
        li1 = [tuple(lst) for lst in min_bridge_indices]
        li2 = [tuple(lst) for lst in bridgeIndices[i]]

        overlap = []
        for pair in li1:
            if pair in li2:
                overlap.append(pair)
        sm = difflib.SequenceMatcher(None, li1, li2)
        salt_bridge_overlaps.append(round(sm.ratio(), 5))

    # Hydrophobic interactions

    print('SEED\t\tRMSD\t\tHBOND_OVERLAP\tSALTBR_OVERLAP\tENERGY')
    for i in range(0, len(rmsds)):
        print(
            str(seeds[i]) + '\t\t' + str(round(rmsds[i], 3)) + '\t\t' +
            str(hbond_overlaps[i] * 100) + '\t\t' +
            str(salt_bridge_overlaps[i] * 100) + '\t\t' + str(energies[i]))
    tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}

    # list of tuples with word and part of speech in each tuple, len(sent1) tuples
    s1 = nltk.pos_tag(nltk.word_tokenize(sent1))
    s1_pos = []
    for tup in s1:
        s1_pos.append(tup[1])

    s2 = nltk.pos_tag(nltk.word_tokenize(sent2))
    s2_pos = []
    for tup in s2:
        s2_pos.append(tup[1])

    # the parts of speech of one translation matched with another with smart sequence matcher
    sm = difflib.SequenceMatcher(None, s1_pos, s2_pos)
    res = sm.ratio()

    machine_or_no_int.append(int(machine_or_no[i]))

    # Simmilarity between the sentences using synsets...
    s1 = dict(
        filter(
            lambda x: len(x[1]) > 0,
            map(
                lambda row: (row[0], wn.synsets(row[0], tag_dict[row[1][0]]))
                if row[1][0] in tag_dict.keys() else (row[0], []), s1)))

    s2 = nltk.pos_tag(nltk.word_tokenize(sent2))

    s2 = dict(
def longest(a, b):
    match = difflib.SequenceMatcher(None, a, b)
    m = match.find_longest_match(0, len(a), 0, len(b))
    return a[m.a:m.a + m.size]
Example #26
def overallcpp(program_name,testclass,refcode,program=None,orig_program=None,lintoptions=STDLINT,compile=True):
    if not orig_program:
        orig_program = program_name
    s = 'Checking {} for EC602 submission.\n'.format(orig_program)
    if not program:
        program=program_name[:-4]
    
    try:
        f=open(program_name)
        the_program = f.read()
        f.close()
    except:
        s += 'The program {} does not exist here.\n'.format(orig_program)
        return 'No file',s

    authors = get_authors(the_program,progtype(program_name))

    includes = get_includes(the_program)
    s += '\n---- analysis of your code structure ----\n\n'

    s += 'authors       : {}\n'.format(" ".join(authors) if authors else AUTHWARN)

    s += 'included libs : {}\n'.format(" ".join(includes))

    if compile:
        C = subprocess.run(['g++','-std=c++14',program_name, '-o', program], stderr=subprocess.PIPE)
        print(C)
        s += 'compile       : {}\n'.format("error" if C.returncode else "ok")

    comments = 0
    for line in the_program.splitlines():
        if '//' in line:
            comments += 1

    P_astyle = subprocess.run(['astyle',
           *ASTYLE_OPTIONS,program_name],
           stdout=subprocess.PIPE,stderr=subprocess.PIPE)

    if P_astyle.returncode:
        s += 'astyle     : error {}'.format(P_astyle.stderr.decode())


    unchanged = 1
    if P_astyle.stdout.decode().startswith('Formatted'):
        Original = open(program_name+".orig").readlines()
        Newprog = open(program_name).readlines()
        m = difflib.SequenceMatcher()
        m.set_seqs(Original,Newprog)
        unchanged = m.ratio()

    s += "astyle        : {:.1%} code unchanged.\n".format(unchanged)

    cpplint_call_list = ['cpplint','--filter='+','.join(lintoptions),program_name]

    P_lint = subprocess.run(cpplint_call_list, stderr=subprocess.PIPE)

    prob=False
    if P_lint.returncode:
        prob = P_lint.stderr.decode().rsplit(" ",1)[-1].strip()
    
    s += "cpplint       : {}\n".format("{} problems".format(prob) if prob else "ok")
    cpplint_call_list = ['cpplint','--filter='+','.join(lintoptions),orig_program]

    s += ' [{}]\n'.format(' '.join(cpplint_call_list))


    CA = code_analysis_cpp(program_name)
    s += "lines of code : {}, {:4.0%} of reference\n".format(CA['lines'],CA['lines']/refcode['lines'])
    s += "tokens in code: {}, {:4.0%} of reference\n".format(CA['words'],CA['words']/refcode['words'])
    s += "comments      : {}\n".format(comments)


    s += '\n---- check of requirements ----\n'
    try:
        errors,passed,gradesummary = check_program(testclass)
    except unittest.SkipTest as e:
        s+= str(e)
        return "Errors",s,{'pass':[],'fail':[]}

    for p in passed:
        s += p

    if errors:
        s += '-----------------errors found--------------\n'
        for e in errors:
            s += e + "\n-------\n"


    if errors:
        return 'Errors',s,gradesummary
    else:
        return 'Pass',s,gradesummary
    def _execute_cb(self, goal):
        rospy.loginfo('Received a new request to start behavior: %s' %
                      goal.behavior_name)
        be_id, behavior = self._behavior_lib.find_behavior(goal.behavior_name)
        if be_id is None:
            Logger.logerr("Did not find behavior with requested name: %s" %
                          goal.behavior_name)
            self._as.set_preempted()
            return

        be_selection = BehaviorSelection()
        be_selection.behavior_id = be_id
        be_selection.autonomy_level = 255
        try:
            for k, v in zip(goal.arg_keys, goal.arg_values):
                if v.startswith('file://'):
                    v = v.replace('file://', '', 1)
                    path = v.split(':')[0]
                    if len(v.split(':')) > 1:
                        ns = v.split(':')[1]
                    else:
                        ns = ''
                    if path.startswith('~') or path.startswith('/'):
                        filepath = os.path.expanduser(path)
                    else:
                        filepath = os.path.join(
                            self._rp.get_path(path.split('/')[0]),
                            '/'.join(path.split('/')[1:]))
                    with open(filepath, 'r') as f:
                        content = f.read()
                    if ns != '':
                        content = yaml.load(content)
                        if ns in content:
                            content = content[ns]
                        content = yaml.dump(content)
                    be_selection.arg_keys.append(k)
                    be_selection.arg_values.append(content)
                else:
                    be_selection.arg_keys.append(k)
                    be_selection.arg_values.append(v)
        except Exception as e:
            rospy.logwarn(
                'Failed to parse and substitute behavior arguments, will use direct input.\n%s'
                % str(e))
            be_selection.arg_keys = goal.arg_keys
            be_selection.arg_values = goal.arg_values
        be_selection.input_keys = goal.input_keys
        be_selection.input_values = goal.input_values

        # check for local modifications of the behavior to send them to the onboard behavior
        be_filepath_new = self._behavior_lib.get_sourcecode_filepath(be_id)
        with open(be_filepath_new, "r") as f:
            be_content_new = f.read()

        be_filepath_old = self._behavior_lib.get_sourcecode_filepath(
            be_id, add_tmp=True)
        if not os.path.isfile(be_filepath_old):
            be_selection.behavior_checksum = zlib.adler32(be_content_new)
        else:
            with open(be_filepath_old, "r") as f:
                be_content_old = f.read()

            sqm = difflib.SequenceMatcher(a=be_content_old, b=be_content_new)
            diffs = [x[1] for x in sqm.get_grouped_opcodes(0)]
            for opcode, a0, a1, b0, b1 in diffs:
                content = be_content_new[b0:b1]
                be_selection.modifications.append(
                    BehaviorModification(a0, a1, content))

            be_selection.behavior_checksum = zlib.adler32(be_content_new)

        # reset state before starting new behavior
        self._engine_status = None
        self._current_state = None
        self._behavior_started = False

        # start new behavior
        self._pub.publish(be_selection)

        try:
            rate = rospy.Rate(10)
            while not rospy.is_shutdown():
                if self._current_state is not None:
                    self._as.publish_feedback(
                        BehaviorExecutionFeedback(self._current_state))
                    self._current_state = None

                # check if goal has been preempted first
                if self._as.is_preempt_requested():
                    rospy.loginfo('Behavior execution preempt requested!')
                    self._preempt_pub.publish()
                    rate.sleep()
                    self._as.set_preempted('')
                    break

                if self._engine_status is None:
                    rospy.logdebug_throttle(
                        1,
                        'No behavior engine status received yet. Waiting for it...'
                    )
                    rate.sleep()
                    continue

                if self._engine_status.code == BEStatus.ERROR:
                    rospy.logerr(
                        'Failed to run behavior! Check onboard terminal for further infos.'
                    )
                    rate.sleep()
                    self._as.set_aborted('')
                    break

                if not self._behavior_started:
                    rospy.logdebug_throttle(
                        1,
                        'Behavior execution has not yet started. Waiting for it...'
                    )
                    rate.sleep()
                    continue

                if self._engine_status.code == BEStatus.FINISHED:
                    result = self._engine_status.args[0] \
                     if len(self._engine_status.args) >= 1 else ''
                    rospy.loginfo(
                        'Finished behavior execution with result "%s"!' %
                        result)
                    self._as.set_succeeded(
                        BehaviorExecutionResult(outcome=result))
                    break

                if self._engine_status.code == BEStatus.FAILED:
                    rospy.logerr('Behavior execution failed in state %s!' %
                                 str(self._current_state))
                    rate.sleep()
                    self._as.set_aborted('')
                    break

                rate.sleep()

            rospy.loginfo('Ready for next behavior start request.')

        except rospy.ROSInterruptException:
            pass  # allow clean exit on ROS shutdown
Example #28
def quickRatio(u1, u2):
    rawRequest = getContent(u1)
    checkWafRequest = getContent(u2)
    retVal = difflib.SequenceMatcher(None, rawRequest, checkWafRequest).quick_ratio()
    return retVal
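Note that quick_ratio() is an upper bound on ratio() computed from character counts alone, so it is cheaper but can overestimate similarity; a small illustration:

import difflib

sm = difflib.SequenceMatcher(None, 'abcd', 'dcba')
print(sm.quick_ratio())  # 1.0  - same multiset of characters
print(sm.ratio())        # 0.25 - only one character matches in order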
Example #29
def diff_text(a, b):
    """
    Performs a diffing algorithm on two pieces of text. Returns
    a string of HTML containing the content of both texts with
    <span> tags inserted indicating where the differences are.
    """
    def tokenise(text):
        """
        Tokenises a string by splitting it into individual characters
        and grouping the alphanumeric ones together.

        This means that punctuation, whitespace, CJK characters, etc
        become separate tokens and words/numbers are merged together
        to form bigger tokens.

        This makes the output of the diff easier to read as words are
        not broken up.
        """
        tokens = []
        current_token = ""

        for c in text or "":
            if c.isalnum():
                current_token += c
            else:
                if current_token:
                    tokens.append(current_token)
                    current_token = ""

                tokens.append(c)

        if current_token:
            tokens.append(current_token)

        return tokens

    a_tok = tokenise(a)
    b_tok = tokenise(b)
    sm = difflib.SequenceMatcher(lambda t: len(t) <= 4, a_tok, b_tok)

    changes = []

    for op, i1, i2, j1, j2 in sm.get_opcodes():
        if op == "replace":
            for token in a_tok[i1:i2]:
                changes.append(("deletion", token))
            for token in b_tok[j1:j2]:
                changes.append(("addition", token))
        elif op == "delete":
            for token in a_tok[i1:i2]:
                changes.append(("deletion", token))
        elif op == "insert":
            for token in b_tok[j1:j2]:
                changes.append(("addition", token))
        elif op == "equal":
            for token in a_tok[i1:i2]:
                changes.append(("equal", token))

    # Merge adjacent changes which have the same type. This just cleans up the HTML a bit
    merged_changes = []
    current_value = []
    current_change_type = None
    for change_type, value in changes:
        if change_type != current_change_type:
            if current_change_type is not None:
                merged_changes.append(
                    (current_change_type, "".join(current_value)))
                current_value = []

            current_change_type = change_type

        current_value.append(value)

    if current_value:
        merged_changes.append((current_change_type, "".join(current_value)))

    return TextDiff(merged_changes)
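A rough usage sketch with hypothetical inputs (how TextDiff renders the merged pairs as HTML is left to the original module):

old = "The cat sat on the mat."
new = "The dog sat on the mat."
diff = diff_text(old, new)
# diff wraps merged (change_type, text) pairs such as ("equal", ...),
# ("deletion", "cat"), ("addition", "dog"), ready to be rendered as <span> tags.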
Example #30
File: quiz.py Project: jkulba/quiz

try:
    questions = get_questions()
except IOError as e:
    print 'Error reading questions file %s' % e
    sys.exit()
except IndexError:
    print 'Error: all questions in the questions file must have answers.'
    sys.exit()

score = 0
total = len(questions)
for question, answer in questions:
    guesses = 1
    correct = 'no'

    while guesses < 4 and correct == 'no':
        guess = raw_input(question.strip() + ' (Guess %s)\n' % guesses)
        q = difflib.SequenceMatcher(None, guess, answer)
        # print round(q.ratio()*100, 1)
        if round(q.ratio() * 100, 1) == 100:
            print '---CORRECT---'
            score += 1
            correct = 'yes'
        else:
            print '---WRONG---'
        guesses += 1

print 'You got %s out of %s questions right' % (score, total)