def get_difference_between_two_pages(page1, page2):
    s = difflib.SequenceMatcher(None, page1, page2)
    return s.ratio()
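# Minimal usage sketch (hypothetical page strings, not from the original source):
# SequenceMatcher.ratio() returns a float in [0.0, 1.0], where 1.0 means the two
# sequences compare equal, so mostly identical markup scores close to 1.0.
import difflib

page_a = "<html><body>hello world</body></html>"
page_b = "<html><body>hello there</body></html>"
print(difflib.SequenceMatcher(None, page_a, page_b).ratio())  # close to 1.0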
def cse(exprs, symbols=None, optimizations=None, postprocess=None):
    """ Perform common subexpression elimination on an expression.

    Parameters
    ==========

    exprs : list of sympy expressions, or a single sympy expression
        The expressions to reduce.
    symbols : infinite iterator yielding unique Symbols
        The symbols used to label the common subexpressions which are pulled
        out. The ``numbered_symbols`` generator is useful. The default is a
        stream of symbols of the form "x0", "x1", etc. This must be an
        infinite iterator.
    optimizations : list of (callable, callable) pairs, optional
        The (preprocessor, postprocessor) pairs. If not provided,
        ``sympy.simplify.cse.cse_optimizations`` is used.
    postprocess : a function which accepts the two return values of cse and
        returns the desired form of output from cse, e.g. if you want the
        replacements reversed the function might be
        lambda r, e: return reversed(r), e

    Returns
    =======

    replacements : list of (Symbol, expression) pairs
        All of the common subexpressions that were replaced. Subexpressions
        earlier in this list might show up in subexpressions later in this
        list.
    reduced_exprs : list of sympy expressions
        The reduced expressions with all of the replacements above.
    """
    from sympy.matrices import Matrix
    from sympy.simplify.simplify import fraction

    if symbols is None:
        symbols = numbered_symbols()
    else:
        # In case we get passed an iterable with an __iter__ method instead of
        # an actual iterator.
        symbols = iter(symbols)
    seen_subexp = set()
    muls = set()
    adds = set()
    to_eliminate = []
    to_eliminate_ops_count = []

    if optimizations is None:
        # Pull out the default here just in case there are some weird
        # manipulations of the module-level list in some other thread.
        optimizations = list(cse_optimizations)

    # Handle the case if just one expression was passed.
    if isinstance(exprs, Basic):
        exprs = [exprs]

    # Preprocess the expressions to give us better optimization opportunities.
    reduced_exprs = [preprocess_for_cse(e, optimizations) for e in exprs]

    # Find all of the repeated subexpressions.
    def insert(subtree):
        '''This helper will insert the subtree into to_eliminate while
        maintaining the ordering by op count and will skip the insertion
        if subtree is already present.'''
        ops_count = (subtree.count_ops(), subtree.is_Mul)  # prefer non-Mul to Mul
        index_to_insert = bisect.bisect(to_eliminate_ops_count, ops_count)
        # all i up to this index have op count <= the current op count
        # so check that subtree is not yet present from this index down
        # (if necessary) to zero.
        for i in xrange(index_to_insert - 1, -1, -1):
            if to_eliminate_ops_count[i] == ops_count and \
                    subtree == to_eliminate[i]:
                return  # already have it
        to_eliminate_ops_count.insert(index_to_insert, ops_count)
        to_eliminate.insert(index_to_insert, subtree)

    for expr in reduced_exprs:
        if not isinstance(expr, Basic):
            continue
        pt = preorder_traversal(expr)
        for subtree in pt:
            inv = 1 / subtree if subtree.is_Pow else None
            if subtree.is_Atom or iterable(subtree) or inv and inv.is_Atom:
                # Exclude atoms, since there is no point in renaming them.
                continue
            if subtree in seen_subexp:
                if inv and _coeff_isneg(subtree.exp):
                    # save the form with positive exponent
                    subtree = inv
                insert(subtree)
                pt.skip()
                continue
            if inv and inv in seen_subexp:
                if _coeff_isneg(subtree.exp):
                    # save the form with positive exponent
                    subtree = inv
                insert(subtree)
                pt.skip()
                continue
            elif subtree.is_Mul:
                muls.add(subtree)
            elif subtree.is_Add:
                adds.add(subtree)
            seen_subexp.add(subtree)

    # process adds - any adds that weren't repeated might contain
    # subpatterns that are repeated, e.g. x+y+z and x+y have x+y in common
    adds = [set(a.args) for a in adds]
    for i in xrange(len(adds)):
        for j in xrange(i + 1, len(adds)):
            com = adds[i].intersection(adds[j])
            if len(com) > 1:
                insert(Add(*com))

                # remove this set of symbols so it doesn't appear again
                adds[i] = adds[i].difference(com)
                adds[j] = adds[j].difference(com)
                for k in xrange(j + 1, len(adds)):
                    if not com.difference(adds[k]):
                        adds[k] = adds[k].difference(com)

    # process muls - any muls that weren't repeated might contain
    # subpatterns that are repeated, e.g. x*y*z and x*y have x*y in common

    # use SequenceMatcher on the nc part to find the longest common expression
    # in common between the two nc parts
    sm = difflib.SequenceMatcher()

    muls = [a.args_cnc(cset=True) for a in muls]
    for i in xrange(len(muls)):
        if muls[i][1]:
            sm.set_seq1(muls[i][1])
        for j in xrange(i + 1, len(muls)):
            # the commutative part in common
            ccom = muls[i][0].intersection(muls[j][0])

            # the non-commutative part in common
            if muls[i][1] and muls[j][1]:
                # see if there is any chance of an nc match
                ncom = set(muls[i][1]).intersection(set(muls[j][1]))
                if len(ccom) + len(ncom) < 2:
                    continue

                # now work harder to find the match
                sm.set_seq2(muls[j][1])
                i1, _, n = sm.find_longest_match(0, len(muls[i][1]),
                                                 0, len(muls[j][1]))
                ncom = muls[i][1][i1:i1 + n]
            else:
                ncom = []

            com = list(ccom) + ncom
            if len(com) < 2:
                continue

            insert(Mul(*com))

            # remove ccom from all if there was no ncom; to update the nc part
            # would require finding the subexpr and then replacing it with a
            # dummy to keep bounding nc symbols from being identified as a
            # subexpr, e.g. removing B*C from A*B*C*D might allow A*D to be
            # identified as a subexpr which would not be right.
            if not ncom:
                muls[i][0] = muls[i][0].difference(ccom)
                for k in xrange(j, len(muls)):
                    if not ccom.difference(muls[k][0]):
                        muls[k][0] = muls[k][0].difference(ccom)

    # Substitute symbols for all of the repeated subexpressions.
    replacements = []
    reduced_exprs = list(reduced_exprs)
    hit = True
    for i, subtree in enumerate(to_eliminate):
        if hit:
            sym = symbols.next()
        hit = False
        if subtree.is_Pow and subtree.exp.is_Rational:
            update = lambda x: x.xreplace({subtree: sym, 1 / subtree: 1 / sym})
        else:
            update = lambda x: x.subs(subtree, sym)
        # Make the substitution in all of the target expressions.
        for j, expr in enumerate(reduced_exprs):
            old = reduced_exprs[j]
            reduced_exprs[j] = update(expr)
            hit = hit or (old != reduced_exprs[j])
        # Make the substitution in all of the subsequent substitutions.
        for j in range(i + 1, len(to_eliminate)):
            old = to_eliminate[j]
            to_eliminate[j] = update(to_eliminate[j])
            hit = hit or (old != to_eliminate[j])
        if hit:
            replacements.append((sym, subtree))

    # Postprocess the expressions to return the expressions to canonical form.
    for i, (sym, subtree) in enumerate(replacements):
        subtree = postprocess_for_cse(subtree, optimizations)
        replacements[i] = (sym, subtree)
    reduced_exprs = [postprocess_for_cse(e, optimizations)
                     for e in reduced_exprs]

    if isinstance(exprs, Matrix):
        reduced_exprs = [Matrix(exprs.rows, exprs.cols, reduced_exprs)]
    if postprocess is None:
        return replacements, reduced_exprs
    return postprocess(replacements, reduced_exprs)
    try:
        wiki = parse_wiki(text)
    except Exception as e:
        print()
        print('PARSE ERROR')
        print(text)
        print()
        wiki = ''
    red = reduce_wiki(wiki)
    return red.split()


# set up files
fin = open(args.source, encoding='utf-8')
fout = open(args.output, 'w', encoding='utf-8')

# create differ
sm = difflib.SequenceMatcher()

# this parser is bad and wrong
in_art = None
n_art = 0
text = None
for (i, line) in enumerate(fin):
    if i % 1000000 == 0:
        print(i)
    ret = re.match('( *)<([^>]*?)>', line)
    if ret:
        (ind, tag) = ret.groups()
        ind = len(ind)
        body = line[ret.end():]
        ret = re.match('([^<]*?)</[^>]*?>', body)
def similar(seq1, seq2):
    return difflib.SequenceMatcher(a=seq1.lower(), b=seq2.lower()).ratio() > 0.9
def compare_samples(base_dir, trg_dir, trg_to_base_name=lambda x: x, opts=None):
    """Report on differences between samples in base and target directories.

    The trg_to_base_name fn takes a target file name and returns the source
    file name to use in the comparisons."""
    if not os.path.isdir(base_dir):
        print('Original sample dir \'%s\' does not exist' % base_dir)
        return
    if not os.path.isdir(trg_dir):
        print('New sample dir \'%s\' does not exist' % trg_dir)
        return

    print('Base (current) dir: %s' % base_dir)
    print('Target (new) dir: %s' % trg_dir)
    print('[a/b] means "a" in base is replaced with "b" in target')

    show_missing = opts and 'missing' in opts
    show_diffs = opts and 'diffs' in opts

    for trg_name in os.listdir(trg_dir):
        if trg_name == 'attributions.txt':
            continue
        trg_path = os.path.join(trg_dir, trg_name)
        if not (os.path.isfile(trg_path) and trg_name.endswith('.txt')):
            continue
        base_name = trg_to_base_name(trg_name)
        base_path = os.path.join(base_dir, base_name)
        if not os.path.exists(base_path):
            if show_missing:
                print('base does not exist: %s' % base_name)
            continue

        base_text = None
        trg_text = None
        with codecs.open(base_path, 'r', 'utf8') as f:
            base_text = f.read()
        with codecs.open(trg_path, 'r', 'utf8') as f:
            trg_text = f.read()
        if not base_text:
            print('base text (%s) is empty' % base_name)
            continue
        if not trg_text:
            print('target text is empty: %s' % trg_path)
            continue
        if base_text.find(trg_text) == -1:
            print('target (%s) text not in base (%s)' % (base_name, trg_name))
            if show_diffs:
                # In scripts that use space for word break it might be better
                # to compare word by word, but this suffices.
                sm = difflib.SequenceMatcher(None, base_text, trg_text,
                                             autojunk=False)
                lines = []
                for tag, i1, i2, j1, j2 in sm.get_opcodes():
                    if tag == 'delete':
                        lines.append('[%s/]' % base_text[i1:i2])
                    elif tag == 'equal':
                        lines.append(base_text[i1:i2])
                    elif tag == 'insert':
                        lines.append('[/%s]' % trg_text[j1:j2])
                    else:
                        lines.append('[%s/%s]' % (base_text[i1:i2],
                                                  trg_text[j1:j2]))
                print(''.join(lines))
def create(cls, full_byline, story, initials=''):
    """ Creates new user or tries to find existing name in db

    args:
        full_byline: string of byline and creditline
        article: Article object (must be saved)
        initials: string
    returns:
        Byline object
    """
    byline_pattern = re.compile(
        # single word credit with colon. Person's name, Person's job title
        # or similar description.
        # Example:
        # text: Jane Doe, Just a regular person
        r'^(?P<credit>[^:]+): (?P<full_name>[^,]+)\s*(, (?P<title>.+))?$',
        flags=re.UNICODE,
    )
    match = byline_pattern.match(full_byline)
    full_name = None
    try:
        d = match.groupdict()
        full_name = d['full_name'].title()
        title = d['title'] or ''
        credit = d['credit'].lower()
        initials = ''.join(
            letters[0] for letters in full_name.replace('-', ' ').split()
        )
        assert initials == initials.upper(), 'All names should be capitalised'
        assert len(initials) <= 5, 'Five names probably means something is wrong.'
        if len(initials) == 1:
            initials = full_name.upper()
    except (AssertionError, AttributeError) as e:
        # Malformed byline
        p_org = w_org = ' -- '
        if story.legacy_prodsys_source:
            dump = story.legacy_prodsys_source
            tekst = json.loads(dump)[0]['fields']['tekst']
            p_org = needle_in_haystack(full_byline, tekst)
        if story.legacy_html_source:
            dump = story.legacy_html_source
            w_org = json.loads(dump)[0]['fields']['byline']
        warning = ((
            'Malformed byline: "{byline}" error: {error} id: {id}'
            ' p_id: {p_id}\n{p_org} | {w_org} '
        ).format(
            id=story.id,
            p_id=story.prodsak_id,
            # story=story,
            byline=full_byline,
            error=e,
            p_org=p_org,
            w_org=w_org,
        ))
        logger.warn(warning)
        story.comment += warning
        story.publication_status = story.STATUS_ERROR
        full_name = 'Nomen Nescio'
        title = full_byline
        initials = 'XX'
        credit = '???'

    for choice in cls.CREDIT_CHOICES:
        # Find correct credit.
        ratio = difflib.SequenceMatcher(
            None,
            choice[0],
            credit[:],
        ).ratio()
        if .4 < ratio < .8:
            logger.debug(choice[0], credit, ratio)
        if ratio > .8:
            credit = choice[0]
            break
    else:
        credit = cls.DEFAULT_CREDIT

    try:
        contributor, __ = Contributor.get_or_create(full_name, initials)
    except ValueError:
        # multiple contributors found
        # TODO: reimplement this shit
        return False

    new_byline = cls(
        story=story,
        credit=credit,
        title=title[:200],
        contributor=contributor,
    )
    new_byline.save()
def WordDiff(line1, line2, diff_params):
    """Returns blocks with positions indicating word level diffs.

    Args:
      line1: string representing the left part of the diff
      line2: string representing the right part of the diff
      diff_params: return value of GetDiffParams

    Returns:
      A tuple (blocks, ratio) where:
        blocks: [(offset1, offset2, size), ...] such that
                line1[offset1:offset1+size] == line2[offset2:offset2+size]
                and the last block is always (len(line1), len(line2), 0)
        ratio: a float giving the diff ratio computed by SequenceMatcher.
    """
    match_expr, min_match_ratio, min_match_size, _ = diff_params
    exp = EXPRS[match_expr]
    # Strings may have been left undecoded up to now. Assume UTF-8.
    line1 = TryDecode(line1)
    line2 = TryDecode(line2)
    a = re.findall(exp, line1, re.U)
    b = re.findall(exp, line2, re.U)
    s = difflib.SequenceMatcher(None, a, b)
    matching_blocks = s.get_matching_blocks()
    ratio = s.ratio()
    # Don't show intra region diffs if both lines are too different and there is
    # more than one block of difference. If there is only one change then we
    # still show the intra region diff regardless of how different the blocks
    # are.
    # Note: We compare len(matching_blocks) with 3 because one block of change
    # results in 2 matching blocks. We add the one special block and we get 3
    # matching blocks per one block of change.
    if ratio < min_match_ratio and len(matching_blocks) > 3:
        return ([(0, 0, 0)], ratio)

    # For now convert to character level blocks because we already have
    # the code to deal with folding across lines for character blocks.

    # Create arrays lena and lenb which have cumulative word lengths
    # corresponding to word positions in a and b
    lena = []
    last = 0
    for w in a:
        lena.append(last)
        last += len(w)
    lenb = []
    last = 0
    for w in b:
        lenb.append(last)
        last += len(w)
    lena.append(len(line1))
    lenb.append(len(line2))

    # Convert to character blocks
    blocks = []
    for s1, s2, blen in matching_blocks[:-1]:
        apos = lena[s1]
        bpos = lenb[s2]
        block_len = lena[s1 + blen] - apos
        blocks.append((apos, bpos, block_len))
    # Recreate the special block.
    blocks.append((len(line1), len(line2), 0))

    # Filter any matching blocks which are smaller than the desired threshold.
    # We don't remove matching blocks with only a newline character as doing so
    # results in showing the matching newline character as non matching which
    # doesn't look good.
    blocks = FilterBlocks(
        blocks,
        lambda b: (b[2] >= min_match_size or line1[b[0]:b[0] + b[2]] == '\n'))
    return (blocks, ratio)
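# Minimal sketch (hypothetical inputs) of the word-level idea used above: split each
# line into word tokens, run SequenceMatcher over the token lists, and read back
# get_matching_blocks(), whose last entry is always the (len(a), len(b), 0) sentinel.
import difflib
import re

left = "the quick brown fox"
right = "the quick red fox"
a = re.findall(r'\w+|\W', left)
b = re.findall(r'\w+|\W', right)
s = difflib.SequenceMatcher(None, a, b)
for block in s.get_matching_blocks():
    print(block)  # Match(a=..., b=..., size=...) over token indices, not characters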
def get_equal_rate_1(str1, str2):
    return difflib.SequenceMatcher(None, str1, str2).quick_ratio()
def compare_str(str1, str2):
    """ Return similarity (0.0 to 1.0) between the two strings """
    return difflib.SequenceMatcher(None, str1, str2).ratio()
correct_count = 0.0
recall_counter = 0.0
precision_counter = 0.0
predictions_counter = 0.0
for line_a in fileinput.input():
    file_name = os.path.splitext(fileinput.filename())[0]
    original = open(file_name + " (2).a2", "r")
    for line_origin in original:
        if re.search(r'OntoBiotope', line_origin):
            continue
        stripped_line_origin = line_origin.replace(
            re.search(r'(T\d*\s*)', line_origin).group(1), '')
        recall_counter += 1
        prediction = open(file_name + ".a2", "r")
        for line_pred in prediction:
            stripped_line_pred = line_pred.replace(
                re.search(r'(T\d*\s*)', line_pred).group(1), '')
            predictions_counter += 1
            if difflib.SequenceMatcher(None, stripped_line_origin,
                                       stripped_line_pred).ratio() >= 0.6:
                correct_count += 1

precision_counter = predictions_counter / recall_counter
recall = correct_count / recall_counter
print recall
precision = correct_count / precision_counter
print precision
f_one = 2 * ((precision * recall) / (precision + recall))
keeping_score = open("Scoretable.txt", "a")
keeping_score.write(file_name + "\t\t" + str(correct_count) + "\t\t" +
                    str(round(recall, 3)) + "\t\t" + str(round(precision, 3)) +
                    "\t\t" + str(round(f_one, 3)) + "\n")
def messenger_export_processing(file_name, my_name, special_removal=['']):
    list_of_lists = input_into_list(file_name)
    list_of_lists = [x for x in list_of_lists if not x in special_removal]
    date_format = determine_right_dateformat(list_of_lists)
    # try to find the first line with the date on
    print("date_format for the file is ", date_format)
    date_flag = False
    for i in range(0, int(len(list_of_lists) / 2)):
        try:
            date_object = datetime.strptime(list_of_lists[i], date_format)
            date_flag = True
            break
        except:
            pass
    if date_flag == True and i < int(len(list_of_lists) / 2) - 1:
        list_of_lists = list_of_lists[i:len(list_of_lists)]
    else:
        print("something is wrong with the file")

    # first find the partner name
    ctr = collections.Counter(list_of_lists[0:min(2000, len(list_of_lists))])
    names = ctr.most_common(2)
    potential_names = [x[0] for x in names]
    scoring_similarity = [difflib.SequenceMatcher(None, my_name, x).ratio()
                          for x in potential_names]
    outgoing_name = potential_names[scoring_similarity.index(max(scoring_similarity))]
    incoming_name = [x for x in potential_names if not x == outgoing_name][0]

    cleaned_list_of_list = []
    for i in range(0, len(list_of_lists)):
        check_item = list_of_lists[i]
        if check_item == outgoing_name:
            cleaned_list_of_list.append("Outgoing")
        elif check_item == incoming_name:
            cleaned_list_of_list.append("Incoming")
        else:
            try:
                datetime_object = datetime.strptime(check_item, date_format)
                cleaned_list_of_list.append(datetime_object)
            except:
                cleaned_list_of_list.append(check_item)

    i = 0
    pd_conv = pd.DataFrame(columns=['Message Date', 'Type', 'Text'])
    print("cleaning messenger data")
    while i < len(cleaned_list_of_list):
        x = cleaned_list_of_list[i]
        if isinstance(x, datetime):
            inserted_date = x.strftime('%Y-%m-%d %H:%M:%S')
            i = i + 1
            if i < len(cleaned_list_of_list):
                message_type = cleaned_list_of_list[i]
                i = i + 1
                text = ''
                while i < len(cleaned_list_of_list) and \
                        isinstance(cleaned_list_of_list[i], datetime) == False:
                    text = text + cleaned_list_of_list[i]
                    i = i + 1
                pd_conv = pd_conv.append({'Message Date': inserted_date,
                                          'Type': message_type,
                                          'Text': text}, ignore_index=True)
        else:
            i = i + 1
            # print("first line: ", cleaned_list_of_list[i], " is not a date")

    pd_conv = pd_conv.sort_values(by=['Message Date'])
    pd_conv = pd_conv.reset_index(drop=True)
    file_name = file_name.replace(".txt", ".csv")
    pd_conv.to_csv(file_name)
    return pd_conv
def string_similar(s1, s2):
    return difflib.SequenceMatcher(None, s1, s2).quick_ratio()
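# Illustrative sketch (hypothetical strings): quick_ratio() is a cheap upper bound
# on ratio(); it only compares character multisets, so ordering is ignored.
import difflib

m = difflib.SequenceMatcher(None, "abcd", "dcba")
print(m.quick_ratio())  # 1.0 - same characters, so the upper bound is maximal
print(m.ratio())        # lower, because only short runs actually match in order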
def _getsimilar(symbols, value):
    sim = lambda x: difflib.SequenceMatcher(None, value, x).ratio()
    # The cutoff for similarity here is pretty arbitrary. It should
    # probably be investigated and tweaked.
    return [s for s in symbols if sim(s) > 0.6]
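# Related stdlib helper (illustrative sketch, hypothetical symbol list): for this
# "suggest close names" pattern, difflib.get_close_matches wraps the same
# ratio()-with-cutoff idea and additionally sorts the candidates by similarity.
import difflib

symbols = ["color", "colour", "collar", "cooler", "column"]
print(difflib.get_close_matches("colour", symbols, n=3, cutoff=0.6))
# e.g. ['colour', 'color', ...] - exact match first, then the closest candidates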
def MatchedWithBenchmarkInputNameScore(benchmark_class):
    return difflib.SequenceMatcher(
        isjunk=None, a=benchmark_class.Name(), b=input_benchmark_name).ratio()
def StringSimilarity(a, b):
    return difflib.SequenceMatcher(a=a.lower(), b=b.lower()).ratio()
def compare(self, word1, word2):
    return difflib.SequenceMatcher(None, word1, word2).ratio()
def forward(
    self,  # type: ignore
    utterance: Dict[str, torch.LongTensor],
    world: List[AtisWorld],
    actions: List[List[ProductionRule]],
    linking_scores: torch.Tensor,
    target_action_sequence: torch.LongTensor = None,
    sql_queries: List[List[str]] = None,
) -> Dict[str, torch.Tensor]:
    """
    We set up the initial state for the decoder, and pass that state off to either a
    DecoderTrainer, if we're training, or a BeamSearch for inference, if we're not.

    Parameters
    ----------
    utterance : Dict[str, torch.LongTensor]
        The output of ``TextField.as_array()`` applied on the utterance ``TextField``.
        This will be passed through a ``TextFieldEmbedder`` and then through an encoder.
    world : ``List[AtisWorld]``
        We use a ``MetadataField`` to get the ``World`` for each input instance. Because
        of how ``MetadataField`` works, this gets passed to us as a ``List[AtisWorld]``,
    actions : ``List[List[ProductionRule]]``
        A list of all possible actions for each ``World`` in the batch, indexed into a
        ``ProductionRule`` using a ``ProductionRuleField``. We will embed all of these
        and use the embeddings to determine which action to take at each timestep in
        the decoder.
    linking_scores: ``torch.Tensor``
        A matrix of the linking the utterance tokens and the entities. This is a binary
        matrix that is deterministically generated where each entry indicates whether a
        token generated an entity. This tensor has shape
        ``(batch_size, num_entities, num_utterance_tokens)``.
    target_action_sequence : torch.Tensor, optional (default=None)
        The action sequence for the correct action sequence, where each action is an
        index into the list of possible actions. This tensor has shape
        ``(batch_size, sequence_length, 1)``. We remove the trailing dimension.
    sql_queries : List[List[str]], optional (default=None)
        A list of the SQL queries that are given during training or validation.
    """
    initial_state = self._get_initial_state(utterance, world, actions, linking_scores)
    batch_size = linking_scores.shape[0]
    if target_action_sequence is not None:
        # Remove the trailing dimension (from ListField[ListField[IndexField]]).
        target_action_sequence = target_action_sequence.squeeze(-1)
        target_mask = target_action_sequence != self._action_padding_index
    else:
        target_mask = None

    if self.training:
        # target_action_sequence is of shape (batch_size, 1, sequence_length) here
        # after we unsqueeze it for the MML trainer.
        return self._decoder_trainer.decode(
            initial_state,
            self._transition_function,
            (target_action_sequence.unsqueeze(1), target_mask.unsqueeze(1)),
        )
    else:
        # TODO(kevin) Move some of this functionality to a separate method for
        # computing validation outputs.
        action_mapping = {}
        for batch_index, batch_actions in enumerate(actions):
            for action_index, action in enumerate(batch_actions):
                action_mapping[(batch_index, action_index)] = action[0]
        outputs: Dict[str, Any] = {"action_mapping": action_mapping}
        outputs["linking_scores"] = linking_scores
        if target_action_sequence is not None:
            outputs["loss"] = self._decoder_trainer.decode(
                initial_state,
                self._transition_function,
                (target_action_sequence.unsqueeze(1), target_mask.unsqueeze(1)),
            )["loss"]
        num_steps = self._max_decoding_steps
        # This tells the state to start keeping track of debug info, which we'll pass
        # along in our output dictionary.
        initial_state.debug_info = [[] for _ in range(batch_size)]
        best_final_states = self._beam_search.search(
            num_steps,
            initial_state,
            self._transition_function,
            keep_final_unfinished_states=False,
        )
        outputs["best_action_sequence"] = []
        outputs["debug_info"] = []
        outputs["entities"] = []
        outputs["predicted_sql_query"] = []
        outputs["sql_queries"] = []
        outputs["utterance"] = []
        outputs["tokenized_utterance"] = []
        for i in range(batch_size):
            # Decoding may not have terminated with any completed valid SQL queries, if
            # `num_steps` isn't long enough (or if the model is not trained enough and
            # gets into an infinite action loop).
            if i not in best_final_states:
                self._exact_match(0)
                self._denotation_accuracy(0)
                self._valid_sql_query(0)
                self._action_similarity(0)
                outputs["predicted_sql_query"].append("")
                continue
            best_action_indices = best_final_states[i][0].action_history[0]
            action_strings = [
                action_mapping[(i, action_index)]
                for action_index in best_action_indices
            ]
            predicted_sql_query = action_sequence_to_sql(action_strings)
            if target_action_sequence is not None:
                # Use a Tensor, not a Variable, to avoid a memory leak.
                targets = target_action_sequence[i].data
                sequence_in_targets = 0
                sequence_in_targets = self._action_history_match(
                    best_action_indices, targets)
                self._exact_match(sequence_in_targets)
                similarity = difflib.SequenceMatcher(
                    None, best_action_indices, targets)
                self._action_similarity(similarity.ratio())
            if sql_queries and sql_queries[i]:
                denotation_correct = self._executor.evaluate_sql_query(
                    predicted_sql_query, sql_queries[i])
                self._denotation_accuracy(denotation_correct)
                outputs["sql_queries"].append(sql_queries[i])
            outputs["utterance"].append(world[i].utterances[-1])
            outputs["tokenized_utterance"].append([
                token.text for token in world[i].tokenized_utterances[-1]
            ])
            outputs["entities"].append(world[i].entities)
            outputs["best_action_sequence"].append(action_strings)
            outputs["predicted_sql_query"].append(
                sqlparse.format(predicted_sql_query, reindent=True))
            outputs["debug_info"].append(
                best_final_states[i][0].debug_info[0])  # type: ignore
        return outputs
def similarity(L1, L2):
    matcher = difflib.SequenceMatcher(None, L1, L2)
    return matcher.ratio()
# print(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix))
# [0:1] gets the first row of the sparse matrix
container = cosine_similarity(
    tfidf_matrix[0:1], tfidf_matrix)  # compare first with everybody (first, everybody)
print(len(container[0]))
similarityScore = []
for each in container[0]:
    similarityScore.append(each)
# print(similarityScore.sort())
# print(similarityScore)
print('program ended success')

weighted_results = []
for result in filteredPosts:
    ratio = difflib.SequenceMatcher(None, result, textToCheck[0]).ratio()
    weighted_results.append((result, ratio))
# print(weighted_results)
# print(sorted(weighted_results, key=lambda x: x[1]))
# print(sorted(weighted_results))
# print('last data is ', weighted_results[0])

# filter data whose score is >= 50%
for each in weighted_results:
    if ((each[1] * 100) > 40):
        print(each)
import difflib
import sys

# according to the link below, 'difflib' and 'Levenshtein' give roughly the same results
# https://stackoverflow.com/questions/6690739

word1 = sys.argv[1]
word2 = sys.argv[2]

score = difflib.SequenceMatcher(None, word1, word2).ratio()

print(f"{word1} {word2} {score}")
def diff(fseq1, fseq2):
    """Use difflib to compute differences of two sequences of strings."""
    differ = difflib.SequenceMatcher(a=fseq1, b=fseq2, autojunk=False)
    return differ.get_opcodes()
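# Illustrative sketch (hypothetical line sequences): get_opcodes() yields
# (tag, i1, i2, j1, j2) tuples where tag is one of 'equal', 'replace', 'delete'
# or 'insert', and the index pairs slice into a and b respectively.
import difflib

old_lines = ["alpha", "beta", "gamma"]
new_lines = ["alpha", "delta", "gamma", "epsilon"]
for tag, i1, i2, j1, j2 in difflib.SequenceMatcher(
        a=old_lines, b=new_lines, autojunk=False).get_opcodes():
    print(tag, old_lines[i1:i2], new_lines[j1:j2])
# equal ['alpha'] ['alpha'] / replace ['beta'] ['delta'] / equal ... / insert [] ['epsilon']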
def getSimilarity(w1, w2):
    seq = difflib.SequenceMatcher(None, w1, w2)
    d = seq.ratio() * 100
    return d
def rmsdRef():
    lines = []
    with open('v92.finalResult', 'r') as f:
        lines = f.readlines()[1:]
    seeds = []
    energies = []
    nativeRMSDs = []
    for line in lines:
        terms = line.split()
        seeds.append(int(terms[0]))
        energies.append(float(terms[8]))
        nativeRMSDs.append(terms[9])
    energies, seeds, nativeRMSDs = (list(t) for t in
                                    zip(*sorted(zip(energies, seeds, nativeRMSDs))))
    cwd = os.getcwd()

    pattern = None
    with open('v92.con', 'r') as f:
        conLines = f.readlines()
    for line in conLines:
        if 'subjob_control' in line:
            terms = line.split()
            pattern = terms[2]

    structs = []
    for i in range(len(seeds)):
        for dir in os.listdir(os.path.join(cwd, 'subJobs')):
            if dir.split('_')[0] == str(seeds[i]):
                os.chdir(os.path.join(cwd, 'subJobs', dir))
                if 'plop.stdout' in os.listdir('.'):
                    stName = '4KUZ-p' + str(pattern) + '-' + nativeRMSDs[i] + \
                        '_template.maegz'
                    structs.append(next(structure.StructureReader(stName)))
                os.chdir(cwd)

    minStruct = copy.deepcopy(structs[0])
    ALLINDICES = analyze.evaluate_asl(minStruct, ALLINDICES_asl)
    LOOPENVINDICES = analyze.evaluate_asl(minStruct, LOOPENVINDICES_asl)
    NONLOOPINDICES = analyze.evaluate_asl(minStruct, NONLOOPINDICES_asl)

    rmsds = []
    for i in range(0, len(structs)):
        curStruct = structs[i]
        rmsd.superimpose(minStruct, NONLOOPINDICES, curStruct, NONLOOPINDICES)
        RMSD = rmsd.calculate_in_place_rmsd(minStruct, LOOPENVINDICES,
                                            curStruct, LOOPENVINDICES)
        rmsds.append(RMSD)

    # What about Hbond patterns?
    hbonds = []
    for i in range(0, len(structs)):
        curStruct = structs[i]
        hbonds.append(hbond.get_hydrogen_bonds(curStruct, LOOPENVINDICES))
    hbondIndices = []
    for i in range(0, len(hbonds)):
        structIndices = []
        hbondIndices.append(structIndices)
        for j in range(0, len(hbonds[i])):
            pairIndices = []
            hbondIndices[i].append(pairIndices)
            for k in range(0, 2):
                hbondIndices[i][j].append(hbonds[i][j][k].index)
    min_hb_indices = copy.deepcopy(hbondIndices[0])
    hbond_overlaps = []
    for i in range(0, len(hbondIndices)):
        li1 = [tuple(lst) for lst in min_hb_indices]
        li2 = [tuple(lst) for lst in hbondIndices[i]]
        overlap = []
        for pair in li1:
            if pair in li2:
                overlap.append(pair)
        sm = difflib.SequenceMatcher(None, li1, li2)
        hbond_overlaps.append(round(sm.ratio(), 5))

    # What about salt bridge interactions?
    bridges = []
    for i in range(0, len(structs)):
        curStruct = structs[i]
        bridges.append(salt_bridge.get_salt_bridges(curStruct, LOOPENVINDICES))
    bridgeIndices = []
    for i in range(0, len(bridges)):
        structIndices = []
        bridgeIndices.append(structIndices)
        for j in range(0, len(bridges[i])):
            pairIndices = []
            bridgeIndices[i].append(pairIndices)
            for k in range(0, 2):
                bridgeIndices[i][j].append(bridges[i][j][k].index)
    min_bridge_indices = copy.deepcopy(bridgeIndices[0])
    salt_bridge_overlaps = []
    for i in range(0, len(bridgeIndices)):
        li1 = [tuple(lst) for lst in min_bridge_indices]
        li2 = [tuple(lst) for lst in bridgeIndices[i]]
        overlap = []
        for pair in li1:
            if pair in li2:
                overlap.append(pair)
        sm = difflib.SequenceMatcher(None, li1, li2)
        salt_bridge_overlaps.append(round(sm.ratio(), 5))

    # Hydrophobic interactions

    print('SEED\t\tRMSD\t\tHBOND_OVERLAP\tSALTBR_OVERLAP\tENERGY')
    for i in range(0, len(rmsds)):
        print(str(seeds[i]) + '\t\t' + str(round(rmsds[i], 3)) + '\t\t' +
              str(hbond_overlaps[i] * 100) + '\t\t' +
              str(salt_bridge_overlaps[i] * 100) + '\t\t' + str(energies[i]))
tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}

# list of tuples with word and part of speech in each tuple, len(sent1) tuples
s1 = nltk.pos_tag(nltk.word_tokenize(sent1))
s1_pos = []
for tup in s1:
    s1_pos.append(tup[1])
s2 = nltk.pos_tag(nltk.word_tokenize(sent2))
s2_pos = []
for tup in s2:
    s2_pos.append(tup[1])

# the parts of speech of one translation matched with another with smart sequence matcher
sm = difflib.SequenceMatcher(None, s1_pos, s2_pos)
res = sm.ratio()
machine_or_no_int.append(int(machine_or_no[i]))

# Similarity between the sentences using synsets...
s1 = dict(
    filter(
        lambda x: len(x[1]) > 0,
        map(
            lambda row: (row[0], wn.synsets(row[0], tag_dict[row[1][0]]))
            if row[1][0] in tag_dict.keys() else (row[0], []),
            s1)))
s2 = nltk.pos_tag(nltk.word_tokenize(sent2))
s2 = dict(
def longest(a, b):
    match = difflib.SequenceMatcher(None, a, b)
    m = match.find_longest_match(0, len(a), 0, len(b))
    return a[m.a:m.a + m.size]
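# Illustrative sketch (hypothetical inputs): find_longest_match returns a named tuple
# Match(a, b, size) giving the start of the longest common block in each sequence.
import difflib

m = difflib.SequenceMatcher(None, "apple pie", "maple pie").find_longest_match(0, 9, 0, 9)
print(m)                              # Match(a=2, b=2, size=7)
print("apple pie"[m.a:m.a + m.size])  # "ple pie"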
def overallcpp(program_name, testclass, refcode, program=None, orig_program=None,
               lintoptions=STDLINT, compile=True):
    if not orig_program:
        orig_program = program_name
    s = 'Checking {} for EC602 submission.\n'.format(orig_program)
    if not program:
        program = program_name[:-4]
    try:
        f = open(program_name)
        the_program = f.read()
        f.close()
    except:
        s += 'The program {} does not exist here.\n'.format(orig_program)
        return 'No file', s
    authors = get_authors(the_program, progtype(program_name))
    includes = get_includes(the_program)
    s += '\n---- analysis of your code structure ----\n\n'
    s += 'authors : {}\n'.format(" ".join(authors) if authors else AUTHWARN)
    s += 'included libs : {}\n'.format(" ".join(includes))
    if compile:
        C = subprocess.run(['g++', '-std=c++14', program_name, '-o', program],
                           stderr=subprocess.PIPE)
        print(C)
        s += 'compile : {}\n'.format("error" if C.returncode else "ok")
    comments = 0
    for line in the_program.splitlines():
        if '//' in line:
            comments += 1
    P_astyle = subprocess.run(['astyle', *ASTYLE_OPTIONS, program_name],
                              stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if P_astyle.returncode:
        s += 'astyle : error {}'.format(P_astyle.stderr.decode())
    unchanged = 1
    if P_astyle.stdout.decode().startswith('Formatted'):
        Original = open(program_name + ".orig").readlines()
        Newprog = open(program_name).readlines()
        m = difflib.SequenceMatcher()
        m.set_seqs(Original, Newprog)
        unchanged = m.ratio()
    s += "astyle : {:.1%} code unchanged.\n".format(unchanged)
    cpplint_call_list = ['cpplint', '--filter=' + ','.join(lintoptions), program_name]
    P_lint = subprocess.run(cpplint_call_list, stderr=subprocess.PIPE)
    prob = False
    if P_lint.returncode:
        prob = P_lint.stderr.decode().rsplit(" ", 1)[-1].strip()
    s += "cpplint : {}\n".format("{} problems".format(prob) if prob else "ok")
    cpplint_call_list = ['cpplint', '--filter=' + ','.join(lintoptions), orig_program]
    s += ' [{}]\n'.format(' '.join(cpplint_call_list))
    CA = code_analysis_cpp(program_name)
    s += "lines of code : {}, {:4.0%} of reference\n".format(
        CA['lines'], CA['lines'] / refcode['lines'])
    s += "tokens in code: {}, {:4.0%} of reference\n".format(
        CA['words'], CA['words'] / refcode['words'])
    s += "comments : {}\n".format(comments)
    s += '\n---- check of requirements ----\n'
    try:
        errors, passed, gradesummary = check_program(testclass)
    except unittest.SkipTest as e:
        s += str(e)
        return "Errors", s, {'pass': [], 'fail': []}
    for p in passed:
        s += p
    if errors:
        s += '-----------------errors found--------------\n'
        for e in errors:
            s += e + "\n-------\n"
    if errors:
        return 'Errors', s, gradesummary
    else:
        return 'Pass', s, gradesummary
def _execute_cb(self, goal):
    rospy.loginfo('Received a new request to start behavior: %s' % goal.behavior_name)
    be_id, behavior = self._behavior_lib.find_behavior(goal.behavior_name)
    if be_id is None:
        Logger.logerr("Did not find behavior with requested name: %s" % goal.behavior_name)
        self._as.set_preempted()
        return

    be_selection = BehaviorSelection()
    be_selection.behavior_id = be_id
    be_selection.autonomy_level = 255
    try:
        for k, v in zip(goal.arg_keys, goal.arg_values):
            if v.startswith('file://'):
                v = v.replace('file://', '', 1)
                path = v.split(':')[0]
                if len(v.split(':')) > 1:
                    ns = v.split(':')[1]
                else:
                    ns = ''
                if path.startswith('~') or path.startswith('/'):
                    filepath = os.path.expanduser(path)
                else:
                    filepath = os.path.join(
                        self._rp.get_path(path.split('/')[0]),
                        '/'.join(path.split('/')[1:]))
                with open(filepath, 'r') as f:
                    content = f.read()
                if ns != '':
                    content = yaml.load(content)
                    if ns in content:
                        content = content[ns]
                    content = yaml.dump(content)
                be_selection.arg_keys.append(k)
                be_selection.arg_values.append(content)
            else:
                be_selection.arg_keys.append(k)
                be_selection.arg_values.append(v)
    except Exception as e:
        rospy.logwarn('Failed to parse and substitute behavior arguments, '
                      'will use direct input.\n%s' % str(e))
        be_selection.arg_keys = goal.arg_keys
        be_selection.arg_values = goal.arg_values
    be_selection.input_keys = goal.input_keys
    be_selection.input_values = goal.input_values

    # check for local modifications of the behavior to send them to the onboard behavior
    be_filepath_new = self._behavior_lib.get_sourcecode_filepath(be_id)
    with open(be_filepath_new, "r") as f:
        be_content_new = f.read()

    be_filepath_old = self._behavior_lib.get_sourcecode_filepath(be_id, add_tmp=True)
    if not os.path.isfile(be_filepath_old):
        be_selection.behavior_checksum = zlib.adler32(be_content_new)
    else:
        with open(be_filepath_old, "r") as f:
            be_content_old = f.read()
        sqm = difflib.SequenceMatcher(a=be_content_old, b=be_content_new)
        diffs = [x[1] for x in sqm.get_grouped_opcodes(0)]
        for opcode, a0, a1, b0, b1 in diffs:
            content = be_content_new[b0:b1]
            be_selection.modifications.append(BehaviorModification(a0, a1, content))
        be_selection.behavior_checksum = zlib.adler32(be_content_new)

    # reset state before starting new behavior
    self._engine_status = None
    self._current_state = None
    self._behavior_started = False

    # start new behavior
    self._pub.publish(be_selection)

    try:
        rate = rospy.Rate(10)
        while not rospy.is_shutdown():
            if self._current_state is not None:
                self._as.publish_feedback(BehaviorExecutionFeedback(self._current_state))
                self._current_state = None

            # check if goal has been preempted first
            if self._as.is_preempt_requested():
                rospy.loginfo('Behavior execution preempt requested!')
                self._preempt_pub.publish()
                rate.sleep()
                self._as.set_preempted('')
                break

            if self._engine_status is None:
                rospy.logdebug_throttle(
                    1, 'No behavior engine status received yet. Waiting for it...')
                rate.sleep()
                continue

            if self._engine_status.code == BEStatus.ERROR:
                rospy.logerr('Failed to run behavior! Check onboard terminal for further infos.')
                rate.sleep()
                self._as.set_aborted('')
                break

            if not self._behavior_started:
                rospy.logdebug_throttle(
                    1, 'Behavior execution has not yet started. Waiting for it...')
                rate.sleep()
                continue

            if self._engine_status.code == BEStatus.FINISHED:
                result = self._engine_status.args[0] \
                    if len(self._engine_status.args) >= 1 else ''
                rospy.loginfo('Finished behavior execution with result "%s"!' % result)
                self._as.set_succeeded(BehaviorExecutionResult(outcome=result))
                break

            if self._engine_status.code == BEStatus.FAILED:
                rospy.logerr('Behavior execution failed in state %s!'
                             % str(self._current_state))
                rate.sleep()
                self._as.set_aborted('')
                break

            rate.sleep()

        rospy.loginfo('Ready for next behavior start request.')
    except rospy.ROSInterruptException:
        pass  # allow clean exit on ROS shutdown
def quickRatio(u1, u2):
    rawRequest = getContent(u1)
    checkWafRequest = getContent(u2)
    retVal = difflib.SequenceMatcher(None, rawRequest, checkWafRequest).quick_ratio()
    return retVal
def diff_text(a, b):
    """
    Performs a diffing algorithm on two pieces of text. Returns
    a string of HTML containing the content of both texts with
    <span> tags inserted indicating where the differences are.
    """
    def tokenise(text):
        """
        Tokenises a string by splitting it into individual characters
        and grouping the alphanumeric ones together.

        This means that punctuation, whitespace, CJK characters, etc
        become separate tokens and words/numbers are merged together
        to form bigger tokens.

        This makes the output of the diff easier to read as words are
        not broken up.
        """
        tokens = []
        current_token = ""

        for c in text or "":
            if c.isalnum():
                current_token += c
            else:
                if current_token:
                    tokens.append(current_token)
                    current_token = ""
                tokens.append(c)

        if current_token:
            tokens.append(current_token)

        return tokens

    a_tok = tokenise(a)
    b_tok = tokenise(b)
    sm = difflib.SequenceMatcher(lambda t: len(t) <= 4, a_tok, b_tok)

    changes = []
    for op, i1, i2, j1, j2 in sm.get_opcodes():
        if op == "replace":
            for token in a_tok[i1:i2]:
                changes.append(("deletion", token))
            for token in b_tok[j1:j2]:
                changes.append(("addition", token))
        elif op == "delete":
            for token in a_tok[i1:i2]:
                changes.append(("deletion", token))
        elif op == "insert":
            for token in b_tok[j1:j2]:
                changes.append(("addition", token))
        elif op == "equal":
            for token in a_tok[i1:i2]:
                changes.append(("equal", token))

    # Merge adjacent changes which have the same type. This just cleans up the HTML a bit
    merged_changes = []
    current_value = []
    current_change_type = None
    for change_type, value in changes:
        if change_type != current_change_type:
            if current_change_type is not None:
                merged_changes.append((current_change_type, "".join(current_value)))
                current_value = []
            current_change_type = change_type
        current_value.append(value)

    if current_value:
        merged_changes.append((current_change_type, "".join(current_value)))

    return TextDiff(merged_changes)
try:
    questions = get_questions()
except IOError as e:
    print 'Error reading questions file %s' % e
    sys.exit()
except IndexError:
    print 'Error: all questions in the questions file must have answers.'
    sys.exit()

score = 0
total = len(questions)

for question, answer in questions:
    guesses = 1
    correct = 'no'
    while guesses < 4 and correct == 'no':
        guess = raw_input(question.strip() + ' (Guess %s)\n' % guesses)
        q = difflib.SequenceMatcher(None, guess, answer)
        # print round(q.ratio()*100, 1)
        if round(q.ratio() * 100, 1) == 100:
            print '---CORRECT---'
            score += 1
            correct = 'yes'
        else:
            print '---WRONG---'
            guesses += 1

print 'You got %s out of %s questions right' % (score, total)