def test_merge_is_order_independent(self):
    """Merging the same set of chains in two different insertion orders
    must yield the same set of chain values.

    NOTE: construction order is the thing under test here - do not
    reorder the _create_chain calls.
    """
    # First ordering: avatar / la leggenda / the last airbender first
    root_one = [
        self._create_chain(['avatar', 'the', 'legend', 'of', 'korra']),
        self._create_chain(['la', 'leggenda', 'di', 'korra']),
        self._create_chain(['the', 'last', 'airbender', 'the', 'legend', 'of', 'korra'])
    ]
    # Attach an extra chain under the last root, then add a standalone one
    self._create_chain(['legend', 'of', 'korra'], root_one[-1])
    root_one.append(self._create_chain(['legend', 'of', 'korra']))

    result_one = self.merge.merge(root_one)

    Logr.debug("-----------------------------------------------------------------")

    # Second ordering: same chains, inserted in a different sequence
    root_two = [
        self._create_chain(['the', 'legend', 'of', 'korra']),
    ]
    self._create_chain(['last', 'airbender', 'the', 'legend', 'of', 'korra'], root_two[-1])
    root_two += [
        self._create_chain(['legend', 'of', 'korra']),
        self._create_chain(['la', 'leggenda', 'di', 'korra']),
        self._create_chain(['avatar', 'the', 'legend', 'of', 'korra'])
    ]

    result_two = self.merge.merge(root_two)

    Logr.debug("=================================================================")

    # Results must match regardless of insertion order
    assert itemsMatch(
        self._get_chain_values(result_one),
        self._get_chain_values(result_two)
    )
def parse_closure(self, parent_head, subject):
    """Parse a CaperClosure subject against this group's steps.

    Returns the list of result nodes forming the new head(s).
    """
    parent_node = parent_head[0] if type(parent_head) is list else parent_head
    nodes, match = self.match(parent_head, parent_node, subject)

    # Capturing broke on a constraint - return what we have so far
    if not match:
        return nodes

    Logr.debug('created closure node with subject.value: "%s"' % subject.value)

    matched_node = CaperClosureNode(subject, parent_head, match)
    result = [matched_node]

    # Indefinite match (weight below 1.0) - keep an unmatched branch alive too
    if match.result and match.weight < 1.0:
        if match.num_fragments == 1:
            result.append(CaperClosureNode(subject, parent_head))
        else:
            nodes.append(CaperClosureNode(subject, parent_head))

    nodes.append(result if len(result) > 1 else result[0])
    return nodes
def _merge(self, nodes, depth = 0):
    """Merge a list of sibling nodes into the first node, then recurse
    into each survivor's right-hand children.

    :param nodes: sibling DNode list at this depth
    :param depth: recursion depth (used only for log indentation)
    :return: surviving (non-dead) nodes at this level
    """
    Logr.debug(str('\t' * depth) + str(nodes))

    if not len(nodes):
        return []

    top = nodes[0]

    # Merge every other sibling into `top`
    for x in range(len(nodes)):
        # Merge extra results into top
        if x > 0:
            # Value is no longer a single word once merged
            top.value = None
            top.weight += nodes[x].weight
            # NOTE(review): destroys top's current right chain on every
            # merge iteration before joining - confirm this is intended
            self.destroy_nodes_right(top.right)

            if len(nodes[x].right):
                top.join_right(nodes[x].right)
                Logr.debug("= %s joined %s", nodes[x], top)

            nodes[x].dead = True

    # Drop the nodes that were merged away
    nodes = [n for n in nodes if not n.dead]

    # Traverse further into the surviving subtrees
    for node in nodes:
        if len(node.right):
            node.right = self._merge(node.right, depth + 1)

    return nodes
def parse(self, titles):
    """Build a word-prefix tree from the given titles.

    :param titles: iterable of title strings
    :return: (root, tails) - the root node list and the tail node of
             each title's chain
    """
    root = []
    tails = []

    for title in titles:
        Logr.debug(title)

        words = title.split(' ')
        total = len(words)
        cur = None

        for wx, raw in enumerate(words):
            word = strip(raw)
            remaining = total - wx

            if cur is None:
                # First word - look up (or create) a root node
                cur = find_node(root, word)
                if cur is None:
                    cur = DNode(word, None, num_children=remaining, original_value=title)
                    root.append(cur)
            else:
                parent = cur
                parent.weight += 1

                # Continue down the parent's children
                cur = find_node(parent.right, word)
                if cur is None:
                    Logr.debug("%s %d", word, remaining)
                    cur = DNode(word, parent, num_children=remaining)
                    sorted_append(parent.right, cur, lambda a: a.num_children < cur.num_children)
                else:
                    cur.weight += 1

        tails.append(cur)

    return root, tails
def capture_closure(self, tag, regex=None, func=None, single=True, **kwargs):
    """Register a closure-capturing step on this group.

    Raises ValueError when fragment steps were already registered.
    Returns self to allow chaining.
    """
    Logr.debug('capture_closure("%s", "%s", %s, %s)', tag, regex, func, single)

    if self.step_source is None:
        self.step_source = 'closure'
    elif self.step_source != 'closure':
        raise ValueError(
            "Unable to mix fragment and closure capturing in a group")

    self.steps.append(
        CaptureStep(self, tag, 'closure', regex=regex, func=func, single=single, **kwargs))

    return self
def merge(self, root):
    """Merge the right-hand subtree of every root node in place.

    :param root: list of root DNodes
    :return: the same root list, with each node's subtree merged
    """
    for node in root:
        Logr.debug(node)
        node.right = self._merge(node.right)
        Logr.debug('=================================================================')

    return root
def calculate_sim_links(for_node, other_nodes):
    """Compute and store similarity links between for_node and each of
    other_nodes (both directions), skipping pairs already linked."""
    for other in other_nodes:
        if other in for_node.links:
            continue

        Logr.debug('calculating similarity between "%s" and "%s"', for_node.value, other.value)

        # Overall similarity ratio
        similarity = create_matcher(for_node.value, other.value).quick_ratio()

        # Opcodes in the for_node -> other direction
        forward_opcodes = create_matcher(for_node.value, other.value, swap_longest = False).get_opcodes()
        forward_stats = get_opcode_stats(for_node, other, forward_opcodes)

        Logr.debug('-' * 100)

        # Opcodes in the other -> for_node direction
        backward_opcodes = create_matcher(other.value, for_node.value, swap_longest = False).get_opcodes()
        backward_stats = get_opcode_stats(for_node, other, backward_opcodes)

        for_node.links[other] = SimLink(similarity, forward_opcodes, forward_stats)
        other.links[for_node] = SimLink(similarity, backward_opcodes, backward_stats)
def parse_closure(self, parent_head, subject):
    """Parse a CaperClosure subject and return the resulting head nodes."""
    if type(parent_head) is list:
        parent_node = parent_head[0]
    else:
        parent_node = parent_head

    nodes, match = self.match(parent_head, parent_node, subject)

    if not match:
        # A constraint broke capturing; nothing more to build
        return nodes

    Logr.debug('created closure node with subject.value: "%s"' % subject.value)

    branch = [CaperClosureNode(subject, parent_head, match)]

    if match.result and match.weight < 1.0:
        # Indefinite match - also keep a branch without the match
        unmatched = CaperClosureNode(subject, parent_head)
        if match.num_fragments == 1:
            branch.append(unmatched)
        else:
            nodes.append(unmatched)

    nodes.append(branch[0] if len(branch) == 1 else branch)
    return nodes
def print_tree(node, depth = 0):
    """Recursively log the node tree; at a leaf, log its full value."""
    Logr.debug(str('\t' * depth) + str(node))

    if not len(node.right):
        # Leaf - log the accumulated value
        Logr.debug(node.full_value()[1])
        return

    for child in node.right:
        print_tree(child, depth + 1)
def load(self):
    """Read data.cfg from base_dir and load config for each known module."""
    parser = ConfigParser.ConfigParser()
    parser.read(os.path.join(self.base_dir, 'data.cfg'))

    for module_name, spec in self.module_loader.modules.items():
        if not parser.has_section(module_name):
            Logr.debug("no section named '%s'" % module_name)
            continue
        self.load_module(parser, module_name, spec)
def parse_subject(self, parent_head, subject):
    """Dispatch a subject to the closure or fragment parser.

    :param parent_head: current head (node or list of nodes)
    :param subject: CaperClosure or CaperFragment to parse
    :raises ValueError: when the subject is neither type
    """
    Logr.debug("parse_subject (%s) subject: %s", self.step_source, repr(subject))

    if type(subject) is CaperClosure:
        return self.parse_closure(parent_head, subject)

    if type(subject) is CaperFragment:
        return self.parse_fragment(parent_head, subject)

    # Fix: interpolate the message - ValueError('...%s', subject) stored
    # the tuple unformatted and the placeholder never got filled in
    raise ValueError('Unknown subject (%s)' % subject)
def fragment_match(self, fragment, group_name=None):
    """Follow a fragment chain to try find a match

    :type fragment: caper.objects.CaperFragment
    :type group_name: str or None

    :return: (weight, result, num_matched) - weight of the match found
             between 0.0 and 1.0 (1.0 = perfect, 0.0 = none), the merged
             result dict (or None), and the number of subject fragments
             consumed
    :rtype: (float, dict, int)
    """
    group_name, weight_groups = self.find_group(group_name)

    for weight, patterns in weight_groups:
        for pattern in patterns:
            success = True
            result = {}
            num_matched = 0

            # Values (and separators, if the pattern asks for them) to
            # the right of `fragment`, tagged with their source
            fragment_iterator = fragment.take_right(
                return_type='value',
                include_separators=pattern.include_separators,
                include_source=True
            )

            # Walk values and sub-patterns in lockstep
            for subject, fragment_pattern in itertools.izip_longest(fragment_iterator, pattern):
                # No patterns left to match
                if not fragment_pattern:
                    break

                # No fragments left to match against pattern
                if not subject:
                    success = False
                    break

                value, source = subject
                matches = pattern.execute(fragment_pattern, value)

                if matches:
                    for match in pattern.process(matches):
                        update_dict(result, match)
                else:
                    success = False
                    break

                # Only real subject fragments count towards num_matched
                # (separators do not)
                if source == 'subject':
                    num_matched += 1

            if success:
                Logr.debug('Found match with weight %s using regex pattern "%s"' % (weight, [sre.pattern for sre in pattern.patterns]))
                return float(weight), result, num_matched

    return 0.0, None, 1
def print_link_tree(nodes):
    """Log every node with its merge count and similarity links."""
    for node in nodes:
        Logr.debug(node.value)
        Logr.debug('\tnum_merges: %s', node.num_merges)

        if not len(node.links):
            continue

        Logr.debug('\t========== LINKS ==========')
        for linked, link in node.links.items():
            Logr.debug('\t%0.2f -- %s', link.similarity, linked.value)
        Logr.debug('\t---------------------------')
def kill_nodes_above(nodes, above_sim):
    """For each pair of linked nodes with similarity >= above_sim, mark
    the longer-valued one dead, then prune all killed nodes."""
    killed = []

    for owner in nodes:
        if owner.dead:
            continue

        Logr.debug(owner.value)

        for linked, link in owner.links.items():
            if linked.dead:
                continue

            Logr.debug('\t%0.2f -- %s', link.similarity, linked.value)

            if link.similarity < above_sim:
                continue

            if len(linked.value) > len(owner.value):
                # Linked node is longer - kill it, credit the owner
                Logr.debug('\t\tvery similar, killed this node')
                linked.dead = True
                owner.num_merges += 1
                killed.append(linked)
            else:
                # Owner is longer (or equal) - kill it, credit the link
                Logr.debug('\t\tvery similar, killed owner')
                owner.dead = True
                linked.num_merges += 1
                killed.append(owner)

    kill_nodes(nodes, killed)
def parse(self, name, parser='scene'):
    """Parse a release name with the named parser.

    :param name: release name string
    :param parser: key into self.parsers (default 'scene')
    :return: parser result, or None when there is nothing to parse
    :raises ValueError: when the parser name is unknown
    """
    # Guard against empty input (the newer variant of this method in
    # this codebase already does this)
    if not name:
        return None

    closures = self._closure_split(name)
    closures = self._fragment_split(closures)

    # Print closures
    for closure in closures:
        Logr.debug("closure [%s]", closure.value)

        for fragment in closure.fragments:
            Logr.debug("\tfragment [%s]", fragment.value)

    if parser not in self.parsers:
        raise ValueError("Unknown parser")

    # Nothing survived splitting - nothing to run
    if not closures:
        return None

    # TODO autodetect the parser type
    return self.parsers[parser](self.debug).run(closures)
def check_constraints(self, constraints, parent_head, subject, **kwargs):
    """Run each applicable constraint against the subject.

    Returns (broke, definite): broke is True when a constraint fired;
    definite is True only when that constraint's weight was exactly 1.0.
    """
    parent_node = parent_head[0] if type(parent_head) is list else parent_head

    # Check constraints that target this subject type (or any type)
    for constraint in constraints:
        if constraint.target and constraint.target != subject.__key__:
            continue

        Logr.debug("Testing constraint %s against subject %s", repr(constraint), repr(subject))

        weight, success = constraint.execute(parent_node, subject, **kwargs)

        if success:
            Logr.debug('capturing broke on "%s" at %s', subject.value, constraint)
            parent_node.finished_groups.append(self)
            return True, weight == 1.0

    return False, None
def kill_trailing_nodes(nodes):
    """Kill linked nodes that look like the same title with trailing
    text appended (insert-only diffs covering at least half the value),
    then prune the killed nodes."""
    killed_nodes = []

    for node in nodes:
        if node.dead:
            continue

        Logr.debug(node.value)

        for link_node, link in node.links.items():
            if link_node.dead:
                continue

            is_valid = link.stats.get('valid', False)

            # Classify the diff opcodes between the two values
            has_deletions = False
            has_insertions = False
            has_replacements = False

            for opcode in link.opcodes:
                if opcode[0] == 'delete':
                    has_deletions = True
                if opcode[0] == 'insert':
                    has_insertions = True
                if opcode[0] == 'replace':
                    has_replacements = True

            # Fractions relative to this node's value length
            equal_perc = link.stats.get('equal', 0) / float(len(node.value))
            insert_perc = link.stats.get('insert', 0) / float(len(node.value))

            Logr.debug('\t({0:<24}) [{1:02d}:{2:02d} = {3:02d} {4:3.0f}% {5:3.0f}%] -- {6:<45}'.format(
                'd:%s, i:%s, r:%s' % (has_deletions, has_insertions, has_replacements),
                len(node.value), len(link_node.value),
                link.stats.get('equal', 0),
                equal_perc * 100, insert_perc * 100,
                '"{0}"'.format(link_node.value)
            ))

            Logr.debug('\t\t%s', link.stats)

            # Kill only insert-only diffs: at least half the characters
            # equal, insertions present but bounded (< 200% of length -
            # NOTE(review): the `< 2` bound looks generous, confirm),
            # and no deletions or replacements
            kill = all([
                is_valid,
                equal_perc >= 0.5,
                insert_perc < 2,
                has_insertions,
                not has_deletions,
                not has_replacements
            ])

            if kill:
                Logr.debug('\t\tkilled this node')

                link_node.dead = True
                node.num_merges += 1
                killed_nodes.append(link_node)

    kill_nodes(nodes, killed_nodes)
def post(self, widget_id, client_id=None, **kwargs):
    """Queue a widget-update message for delivery.

    :param widget_id: widget this message updates
    :param client_id: queue to a single client when given, else to all
    :param kwargs: extra message fields
    """
    # Create message
    message = kwargs
    message.update({
        'id': widget_id,
        # NOTE(review): timestamp is 5 minutes in the future - confirm intent
        'updatedAt': int(time.time() + 300)
    })

    # Queue for clients to send
    clients = self.clients.items()
    if client_id:
        clients = [(client_id, self.clients[client_id])]

    for cid, client in clients:
        client['messages'][client['next_mid']] = message
        client['next_mid'] += 1

    # Fix: report the number of clients actually queued to - the old
    # message always used len(self.clients), which was wrong when a
    # specific client_id was targeted
    Logr.debug("Message posted, queued to be sent to %s clients" % len(clients))
def capture_closure(self, tag, regex=None, func=None, single=True, **kwargs):
    """Append a closure-capturing step to this group (chainable).

    Raises ValueError when fragment steps already exist on the group.
    """
    Logr.debug('capture_closure("%s", "%s", %s, %s)', tag, regex, func, single)

    if self.step_source not in (None, 'closure'):
        raise ValueError("Unable to mix fragment and closure capturing in a group")
    self.step_source = 'closure'

    step = CaptureStep(
        self, tag,
        'closure',
        regex=regex,
        func=func,
        single=single,
        **kwargs
    )
    self.steps.append(step)

    return self
def fragment_match(self, fragment, group_name=None):
    """Follow a fragment chain to try find a match

    :type fragment: caper.objects.CaperFragment
    :type group_name: str or None

    :return: The weight of the match found between 0.0 and 1.0,
             where 1.0 means perfect match and 0.0 means no match
    :rtype: (float, dict, int)
    """
    group_name, weight_groups = self.find_group(group_name)

    for weight, patterns in weight_groups:
        for pattern in patterns:
            # Ignore empty patterns (skip the rest of this weight group)
            if len(pattern) < 1:
                break

            cur = fragment
            result = {}
            success = True

            for fragment_pattern in pattern:
                # Ran out of fragments before the pattern was exhausted
                if not cur:
                    success = False
                    break

                match = fragment_pattern.match(cur.value)
                if not match:
                    success = False
                    break

                update_dict(result, match.groupdict())
                cur = cur.right if cur else None

            if success:
                Logr.debug("Found match with weight %s" % weight)
                return float(weight), result, len(pattern)

    return 0.0, None, 1
def parse(self, name, parser='scene'):
    """Parse a release name with the named parser.

    Returns None for empty input or when splitting yields no closures.
    Raises ValueError for an unknown parser name.
    """
    if not name:
        return None

    closures = self._fragment_split(self._closure_split(name))

    # Debug dump of the closure/fragment tree
    for closure in closures:
        Logr.debug("closure [%s]", closure.value)
        for fragment in closure.fragments:
            Logr.debug("\tfragment (%s)[%s](%s)", fragment.left_sep, fragment.value, fragment.right_sep)

    if parser not in self.parsers:
        raise ValueError("Unknown parser")

    if not closures:
        return None

    # TODO autodetect the parser type
    return self.parsers[parser](self.debug).run(closures)
def check_constraints(self, constraints, parent_head, subject, **kwargs):
    """Execute constraints applicable to `subject`.

    :return: (broke, definite) - broke when any constraint succeeded,
             definite when its weight was exactly 1.0
    """
    if type(parent_head) is list:
        parent_node = parent_head[0]
    else:
        parent_node = parent_head

    # Only constraints targeting this subject type (or untargeted ones)
    applicable = [c for c in constraints if c.target == subject.__key__ or not c.target]

    for constraint in applicable:
        Logr.debug("Testing constraint %s against subject %s", repr(constraint), repr(subject))

        weight, success = constraint.execute(parent_node, subject, **kwargs)
        if not success:
            continue

        Logr.debug('capturing broke on "%s" at %s', subject.value, constraint)
        parent_node.finished_groups.append(self)
        return True, weight == 1.0

    return False, None
def read(self):
    """Generator: register a new client, then yield its queued messages
    as server-sent-event strings, polling every 5 seconds until the
    client is removed from self.clients."""
    # Register a new client with a fresh id and an empty message queue
    client_id = self.next_id
    self.next_id += 1

    self.clients[client_id] = {
        'next_mid': 1,
        'messages': {}
    }

    # Notify registered callbacks about the new client
    for func in self.functions:
        func(client_id=client_id)

    while client_id in self.clients:
        sent = []

        # Emit every queued message as an SSE data frame
        for mid, message in self.clients[client_id]['messages'].items():
            Logr.debug("Sent message: %s" % message)
            yield "data: %s\n\n" % json.dumps(message)
            sent.append(mid)

        # Remove sent messages
        for mid in sent:
            self.clients[client_id]['messages'].pop(mid)

        # Poll interval before checking the queue again
        time.sleep(5)
def match(self, parent_head, parent_node, subject): nodes = [] # Check pre constaints broke, definite = self.check_constraints(self.pre_constraints, parent_head, subject) if broke: nodes.append(parent_head) if definite: return nodes, None # Try match subject against the steps available match = None for step in self.steps: if step.source == 'closure' and type(subject) is not CaperClosure: pass elif step.source == 'fragment' and type(subject) is CaperClosure: Logr.debug( 'Closure encountered on fragment step, jumping into fragments' ) return [CaperClosureNode(subject, parent_head, None)], None match = step.execute(subject) if match.success: if type(match.result) is dict: match.result = clean_dict(match.result) Logr.debug( 'Found match with weight %s, match: %s, num_fragments: %s' % (match.weight, match.result, match.num_fragments)) step.matched = True break if all([step.single and step.matched for step in self.steps]): Logr.debug('All steps completed, group finished') parent_node.finished_groups.append(self) return nodes, match # Check post constraints broke, definite = self.check_constraints(self.post_constraints, parent_head, subject, match=match) if broke: return nodes, None return nodes, match
def build(self):
    """Combine chains from every head, normalize their match counts
    into weights, and sort chains by descending weight."""
    max_matched = 0

    for head in self.heads:
        for chain in self.combine_chain(head):
            max_matched = max(max_matched, chain.num_matched)
            self.chains.append(chain)

    # Normalize each chain's match count against the best seen
    for chain in self.chains:
        denominator = float(max_matched or chain.num_matched or 1)
        chain.weights.append(chain.num_matched / denominator)
        chain.finish()

    self.chains.sort(key=lambda c: c.weight, reverse=True)

    for chain in self.chains:
        Logr.debug("chain weight: %.02f", chain.weight)
        Logr.debug("\tInfo: %s", chain.info)
        Logr.debug("\tWeights: %s", chain.weights)
        Logr.debug("\tNumber of Fragments Matched: %s", chain.num_matched)
def execute(self, fragment):
    """Execute step on fragment

    :type fragment: CaperFragment
    :rtype : CaptureMatch
    """
    match = CaptureMatch(self.tag, self)

    if self.regex:
        weight, result, num_fragments = self.capture_group.parser.matcher.fragment_match(
            fragment, self.regex)
        Logr.debug('(execute) [regex] tag: "%s"', self.tag)

        if not result:
            return match

        # Populate CaptureMatch
        match.success = True
        match.weight = weight
        match.result = result
        match.num_fragments = num_fragments
    elif self.func:
        result = self.func(fragment)
        # Fix: log the function result - previously this logged the
        # still-empty `match` object, which is never useful here
        Logr.debug('(execute) [func] %s += "%s"', self.tag, result)

        if not result:
            return match

        # Populate CaptureMatch
        match.success = True
        match.weight = 1.0
        match.result = result
    else:
        Logr.debug('(execute) [raw] %s += "%s"', self.tag, fragment.value)

        include_separators = self.kwargs.get('include_separators', False)

        # Populate CaptureMatch
        match.success = True
        match.weight = 1.0

        if include_separators:
            match.result = (fragment.left_sep, fragment.value, fragment.right_sep)
        else:
            match.result = fragment.value

    return match
def execute(self, fragment):
    """Execute step on fragment

    :type fragment: CaperFragment
    :rtype : CaptureMatch
    """
    match = CaptureMatch(self.tag, self)

    if self.regex:
        weight, result, num_fragments = self.capture_group.parser.matcher.fragment_match(fragment, self.regex)
        Logr.debug('(execute) [regex] tag: "%s"', self.tag)

        if not result:
            return match

        # Populate CaptureMatch
        match.success = True
        match.weight = weight
        match.result = result
        match.num_fragments = num_fragments
    elif self.func:
        result = self.func(fragment)
        # Fix: log the function result - the old code logged the
        # still-empty `match` object instead of `result`
        Logr.debug('(execute) [func] %s += "%s"', self.tag, result)

        if not result:
            return match

        # Populate CaptureMatch
        match.success = True
        match.weight = 1.0
        match.result = result
    else:
        Logr.debug('(execute) [raw] %s += "%s"', self.tag, fragment.value)

        include_separators = self.kwargs.get('include_separators', False)

        # Populate CaptureMatch
        match.success = True
        match.weight = 1.0

        if include_separators:
            match.result = (fragment.left_sep, fragment.value, fragment.right_sep)
        else:
            match.result = fragment.value

    return match
def match(self, parent_head, parent_node, subject): nodes = [] # Check pre constaints broke, definite = self.check_constraints(self.pre_constraints, parent_head, subject) if broke: nodes.append(parent_head) if definite: return nodes, None # Try match subject against the steps available match = None for step in self.steps: if step.source == 'closure' and type(subject) is not CaperClosure: pass elif step.source == 'fragment' and type(subject) is CaperClosure: Logr.debug('Closure encountered on fragment step, jumping into fragments') return [CaperClosureNode(subject, parent_head, None)], None match = step.execute(subject) if match.success: if type(match.result) is dict: match.result = clean_dict(match.result) Logr.debug('Found match with weight %s, match: %s, num_fragments: %s' % ( match.weight, match.result, match.num_fragments )) step.matched = True break if all([step.single and step.matched for step in self.steps]): Logr.debug('All steps completed, group finished') parent_node.finished_groups.append(self) return nodes, match # Check post constraints broke, definite = self.check_constraints(self.post_constraints, parent_head, subject, match=match) if broke: return nodes, None return nodes, match
def get_opcode_stats(for_node, node, opcodes):
    """Accumulate per-tag character counts from SequenceMatcher opcodes
    and record whether insert/delete edits fall on word boundaries
    ('valid').

    :return: dict mapping opcode tag -> character count, plus a 'valid'
             boolean for boundary-aligned edits
    """
    stats = {}

    for tag, i1, i2, j1, j2 in opcodes:
        Logr.debug(stats_print_format.format(
            tag, i1, i2, j1, j2
        ))

        if tag in ['insert', 'delete']:
            ax = None, None
            bx = None, None

            # Indices around the edit point in each value
            if tag == 'insert':
                ax = get_indices(for_node.value, i1 - 1, i1)
                bx = get_indices(node.value, j1, j2 - 1)

            if tag == 'delete':
                # NOTE(review): this branch indexes for_node.value with
                # j-indices and node.value with i-indices (the reverse
                # of the insert branch) - confirm this is intentional
                ax = get_indices(for_node.value, j1 - 1, j1)
                bx = get_indices(node.value, i1, i2 - 1)

            av = get_index_values(for_node.value, *ax)
            bv = get_index_values(node.value, *bx)

            Logr.debug(
                '\t\t%s %s [%s><%s] <---> %s %s [%s><%s]',
                ax, av, av[0], av[1],
                bx, bv, bv[0], bv[1]
            )

            # An edit is 'valid' when both its head and tail touch a
            # word boundary (space or string edge) on either side
            head_valid = av[0] in [None, ' '] or bv[0] in [None, ' ']
            tail_valid = av[1] in [None, ' '] or bv[1] in [None, ' ']
            valid = head_valid and tail_valid

            # Once any edit is invalid, 'valid' stays False
            if 'valid' not in stats or (stats['valid'] and not valid):
                stats['valid'] = valid

            Logr.debug('\t\t' + ('VALID' if valid else 'INVALID'))

        if tag not in stats:
            stats[tag] = 0

        stats[tag] += (i2 - i1) or (j2 - j1)

    return stats
def print_tree(self, heads):
    """Debug-log the head node tree, walking up through parents.

    No-op unless self.debug is set.
    """
    if not self.debug:
        return

    for head in heads:
        head = head if type(head) is list else [head]

        if type(head[0]) is CaperFragmentNode:
            for fragment in head[0].fragments:
                Logr.debug(fragment.value)
        else:
            Logr.debug(head[0].closure.value)

        for node in head:
            # Fix: the old conditional expression wrapped the ENTIRE
            # concatenation (ternary binds loosest), so nodes without a
            # match logged an empty string instead of the node itself.
            # Only the match details should be conditional.
            if node.match:
                match_info = str(node.match.weight) + '\t' + str(node.match.result)
            else:
                match_info = ''
            Logr.debug('\t' + str(node).ljust(55) + '\t' + match_info)

        if len(head) > 0 and head[0].parent:
            self.print_tree([head[0].parent])
def load_module(self, parser, module_name, spec):
    """Copy every declared option from the module's config section into
    the module instance's config dict."""
    for key, option in spec['options'].items():
        if not parser.has_option(module_name, key):
            Logr.debug("no option named '%s' in section '%s'" % (key, module_name))
            continue
        spec['instance'].config[key] = self.get_option(parser, module_name, key, option)
def execute(self):
    """Drive this capture group over every result head until each head
    is either finished for the group or has no subjects left."""
    heads_finished = None

    # Loop until every head reported finished in a full pass
    while heads_finished is None or not (len(heads_finished) == len(
            self.result.heads) and all(heads_finished)):
        heads_finished = []

        # Take the current heads; rebuild the list during this pass
        heads = self.result.heads
        self.result.heads = []

        for head in heads:
            node = head[0] if type(head) is list else head

            if self in node.finished_groups:
                Logr.debug("head finished for group")
                self.result.heads.append(head)
                heads_finished.append(True)
                continue

            Logr.debug('')
            Logr.debug(node)

            next_subject = node.next()

            Logr.debug(
                '----------[%s] (%s)----------' %
                (next_subject,
                 repr(next_subject.value) if next_subject else None))

            if next_subject:
                # Parse the subject, fanning out into new heads
                for node_result in self.parse_subject(head, next_subject):
                    self.result.heads.append(node_result)

                Logr.debug('Heads: %s', self.result.heads)

            heads_finished.append(self in node.finished_groups
                                  or next_subject is None)

        # No new heads were produced - keep the previous ones
        if len(self.result.heads) == 0:
            self.result.heads = heads

        Logr.debug("heads_finished: %s, self.result.heads: %s",
                   heads_finished, self.result.heads)

    Logr.debug("group finished")
def run(self, titles):
    """Merge a list of title strings into a ranked list of distinct
    merged values.

    :param titles: iterable of raw title strings
    :return: merged values, best-scoring first
    """
    titles = distinct([simplify(t) for t in titles])

    Logr.info(str(titles))
    Logr.debug("------------------------------------------------------------")

    root, tails = self.parse(titles)

    Logr.debug("--------------------------PARSE-----------------------------")
    for node in root:
        print_tree(node)

    Logr.debug("--------------------------MERGE-----------------------------")
    self.merge(root)

    Logr.debug("--------------------------FINAL-----------------------------")
    for node in root:
        print_tree(node)

    Logr.debug("--------------------------RESULT-----------------------------")

    # Accumulate a score per distinct value, keeping first-seen order info
    scores = {}
    results = []

    for tail in tails:
        score, value, original_value = tail.full_value()

        if value in scores:
            scores[value] += score
        else:
            results.append((value, original_value))
            scores[value] = score

        Logr.debug("%s %s %s", score, value, original_value)

    ordered = sorted(results, key=lambda item: (scores[item[0]], item[1]), reverse=True)

    return [value for value, _ in ordered]
def discover(self, base_dir):
    """Discover source modules under base_dir (made absolute first)."""
    base_dir = os.path.abspath(base_dir)
    Logr.debug('Discovering modules in "%s"...' % base_dir)
    self.discover_directory(base_dir, 'sources')
def run(self, titles):
    """Reduce a list of titles to distinct values by similarity merging.

    :param titles: iterable of title strings
    :return: surviving values, most-merged first
    """
    # Create a node for each title
    nodes = [SimNode(title) for title in titles]

    # Calculate similarities between every pair of nodes
    for node in nodes:
        calculate_sim_links(node, [n for n in nodes if n != node])

    kill_nodes_above(nodes, 0.90)

    Logr.debug('---------------------------------------------------------------------')

    print_link_tree(nodes)
    Logr.debug('%s %s', len(nodes), [n.value for n in nodes])

    Logr.debug('---------------------------------------------------------------------')

    kill_trailing_nodes(nodes)

    Logr.debug('---------------------------------------------------------------------')

    # Sort remaining nodes by 'num_merges'
    nodes.sort(key=lambda n: n.num_merges, reverse=True)

    print_link_tree(nodes)

    Logr.debug('---------------------------------------------------------------------')

    Logr.debug('%s %s', len(nodes), [n.value for n in nodes])

    return [n.value for n in nodes]
def execute(self):
    """Run this capture group against all result heads, looping until
    every head is finished for the group or out of subjects."""
    heads_finished = None

    # Repeat full passes until every head reported finished
    while heads_finished is None or not (len(heads_finished) == len(self.result.heads) and all(heads_finished)):
        heads_finished = []

        # Take current heads; the list is rebuilt during this pass
        heads = self.result.heads
        self.result.heads = []

        for head in heads:
            node = head[0] if type(head) is list else head

            if self in node.finished_groups:
                Logr.debug("head finished for group")
                self.result.heads.append(head)
                heads_finished.append(True)
                continue

            Logr.debug('')
            Logr.debug(node)

            next_subject = node.next()

            Logr.debug('----------[%s] (%s)----------' % (next_subject, repr(next_subject.value) if next_subject else None))

            if next_subject:
                # Fan the parsed subject out into new heads
                for node_result in self.parse_subject(head, next_subject):
                    self.result.heads.append(node_result)

                Logr.debug('Heads: %s', self.result.heads)

            heads_finished.append(self in node.finished_groups or next_subject is None)

        # No new heads produced this pass - restore the previous set
        if len(self.result.heads) == 0:
            self.result.heads = heads

        Logr.debug("heads_finished: %s, self.result.heads: %s", heads_finished, self.result.heads)

    Logr.debug("group finished")