def _find_irregular(self, common_parent, potential_parents, cluster_units):
    """Look for irregular instances of a cluster's unit under candidate parents.

    The first unit of ``cluster_units`` serves as the template.  For each
    parent in ``potential_parents`` the direct ``Tag`` children that are not
    already covered by a unit of the cluster are scanned in windows of the
    template's tag count.  When the collected tags differ from the template
    by at most one tag in length and by at most 1 according to
    ``self._levenshtein`` on the tag names, a new ``Unit`` is built from
    them, given the template's label, flagged ``irregular`` and appended to
    ``self.units``.

    Args:
        common_parent: Not used by this method; kept so the signature stays
            compatible with existing callers.
        potential_parents: Iterable of nodes exposing ``contents``.
        cluster_units: Sequence of units belonging to one cluster.  Each
            unit must expose ``parent`` and ``get_tags_interval()``; the
            first one additionally supplies ``tags``, ``typ``, ``level``,
            ``pattern``, ``parent_tag_indices`` and ``label``.

    Returns:
        None.  Results are appended to ``self.units``.
    """
    # NOTE(review): the nesting below was reconstructed from a source whose
    # line breaks and indentation had been lost -- confirm the inner loop
    # (child-tag extension and index bookkeeping) against the upstream file.

    # Nothing to use as a template, hence nothing to search for.
    if not cluster_units:
        return

    unit = cluster_units[0]
    # Real lists (not lazy map/filter objects): these values are measured
    # with len(), indexed and iterated more than once further down.
    unit_tags_names = [tag.name for tag in unit.tags]
    typ = unit.typ
    # units in the same cluster are expected to have the same number of tags
    unit_length = len(unit.tags)

    for p in potential_parents:
        units_in_p = [u for u in cluster_units if u.parent == p]
        tags = [child for child in p.contents if isinstance(child, Tag)]
        tag_count = len(tags)

        # Indices of p's tags that already belong to a known unit.
        occupied_indices = set()
        for un in units_in_p:
            start, stop = un.get_tags_interval()
            occupied_indices.update(range(start, stop + 1))
            if typ == 1:
                count = unit.get_level_one_tags_count()
                occupied_indices.update(range(start + 1, start + count))

        # Indices already scanned as part of an earlier candidate window.
        already_checked_instances = set()
        for i in range(tag_count):
            if i in occupied_indices or i in already_checked_instances:
                continue

            new_unit_tags = []
            for j in range(i, i + unit_length):
                # Stop at the first occupied slot or at the end of the tags.
                if j in occupied_indices or j >= tag_count:
                    break
                if tags[j].name in unit_tags_names:
                    new_unit_tags.append(tags[j])
                    if typ == 1:
                        # For typ 1 the tag's own Tag children are part of
                        # the candidate as well.
                        new_unit_tags.extend(
                            child for child in tags[j].contents
                            if isinstance(child, Tag)
                        )
                already_checked_instances.add(j)

            new_unit_tags_names = [tag.name for tag in new_unit_tags]
            lev = self._levenshtein(unit_tags_names, new_unit_tags_names)
            if abs(len(unit_tags_names) - len(new_unit_tags)) <= 1 and lev <= 1:
                new_unit = Unit(self, p, new_unit_tags, unit.level, typ,
                                unit.pattern, unit.parent_tag_indices)
                new_unit.label = unit.label
                new_unit.irregular = True
                self._logger.info("irregular unit found ---------------------------------")
                self._logger.info("new irregular unit: %s" % new_unit)
                self.units.append(new_unit)