def gather_and_scatter2(
        buckets_dict,
        similarity_metric: Callable[[Any, Any], Optional[float]]) -> Dict:
    debug("Computing bucket centroids")
    for bucket in buckets_dict.values():
        bucket.compute_hashes_centroid_and_rmsd()
    debug("Computed bucket centroids")
    similarities: Dict[Hashable, Dict[Hashable, Any]] = compute_bucket_similarities_graph(
        list(buckets_dict.values()), similarity_metric)
    buckets_dict, crude_buckets = gather(buckets_dict, similarities)
    for crude_bucket in crude_buckets:
        crude_bucket.compute_hashes_centroid_and_rmsd()
        if crude_bucket.hashes_rmsd > 15:
            # Merged bucket is too diffuse: re-bucketize its strings.
            scatter_into(buckets_dict, buckets_dict,
                         crude_bucket.tokenized_strings)
        else:
            # Merged bucket is tight enough: keep it, keyed by its first string.
            new_pattern = crude_bucket.tokenized_strings[0]
            crude_bucket.pattern = new_pattern
            buckets_dict[tuple(new_pattern)] = crude_bucket
    return buckets_dict
def read_fd_or_default(fd, default):
    try:
        with os.fdopen(fd, 'r') as f:
            debug(f'Reading from FD {fd}')
            return json.load(f)
    except Exception:
        return default
def do_OPTIONS(self):
    debug(f'OPTIONS {self.path}')
    resource = self.path[len("/insight/data/"):].split('?')[0]
    resource_parts = resource.split('/')
    tier_schema = tdf.schema_tiers[len(resource_parts) - 1]
    self.respond(200, 'application/javascript',
                 json.dumps(tier_schema).encode('utf-8'))
def scatter_into(buckets_dict_to, buckets_dict_from, tokenized_strings):
    debug(f"Computing buckets for {len(tokenized_strings)} strings")
    stats = list(compute_stats_for_tokenized(tokenized_strings))
    token_to_quality = {stat.token: stat.quality for stat in stats}
    selected: Set[str] = compute_selected(stats)
    for tokens in tokenized_strings:
        sim_hash = seq_sim_hash(tokens, token_to_quality.get)
        raw_pattern, milestone_offsets = raw_pattern_and_milestone_offsets(
            tokens, selected)
        pattern, pattern_milestone_offsets = collapse_successive_wildcards(
            raw_pattern)
        pattern_tuple = tuple(pattern)
        bucket_from = buckets_dict_from.get(pattern_tuple)
        bucket_to = buckets_dict_to.get(pattern_tuple)
        if bucket_from is None and bucket_to is None:
            buckets_dict_to[pattern_tuple] = bucket_to = Bucket(
                pattern, len(pattern_milestone_offsets))
        elif bucket_from is not None and bucket_to is None:
            # Move the existing bucket over instead of creating a new one.
            del buckets_dict_from[pattern_tuple]
            buckets_dict_to[pattern_tuple] = bucket_to = bucket_from
        # Remaining cases already have bucket_to set; when both dicts are the
        # same object, bucket_from and bucket_to coincide.
        bucket_to.append(tokens, milestone_offsets, sim_hash)
    debug(f"Computed buckets for {len(tokenized_strings)} strings")
    return buckets_dict_to
def compute_group_runs_and_median_by(run_columns):
    debug(f'Computing group runs by {run_columns}')
    result = []
    run_dict = None
    run_values = None
    run_lengths = []
    all_j = load_data()
    for j in all_j:
        row_run_dict = {}
        row_values = {}
        for column, value in j.items():
            if column in run_columns:
                row_run_dict[column] = value
            else:
                row_values[column] = value
        if row_run_dict != run_dict:
            # debug(f'Change: {row_run_dict} {run_dict}')
            if run_dict is not None:
                run_dict['_'] = run_values
                run_lengths.append(len(run_values))
                result.append(run_dict)
            run_dict = row_run_dict
            run_values = []
        run_values.append(row_values)
    # Flush the last run; guard against empty input, where run_values is None.
    if run_values:
        run_dict['_'] = run_values
        run_lengths.append(len(run_values))
        result.append(run_dict)
    median_run_length = median(run_lengths) if run_lengths else 0
    debug(f'Median run length: {median_run_length}')
    return result, median_run_length
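# Hedged usage sketch for compute_group_runs_and_median_by (hypothetical rows;
# the real ones come from load_data()). Adjacent rows with equal values in the
# run columns collapse into one record whose '_' key holds the remaining fields:
#
#   rows by ['host']: [{'host': 'a', 'msg': 'x'}, {'host': 'a', 'msg': 'y'},
#                      {'host': 'b', 'msg': 'z'}]
#   -> ([{'host': 'a', '_': [{'msg': 'x'}, {'msg': 'y'}]},
#        {'host': 'b', '_': [{'msg': 'z'}]}],
#       1.5)  # median of run lengths [2, 1]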
def compute_stats_for_tokenized(
        tokenized_strings: Sequence[Sequence[str]]) -> Iterator[Stat]:
    size = len(tokenized_strings)
    debug(f"Computing stats for {size} lines")
    token2lines = defaultdict(list)  # or better just set of line indices!
    for tokenized_string in tokenized_strings:
        for token in set(tokenized_string):
            token2lines[token].append(tokenized_string)
    f = compute_token_counts(token for tokenized_string in tokenized_strings
                             for token in tokenized_string)
    token2quality = {}
    total_quality = 0
    for token, count in f.items():
        # quality = support^2 / count: favors tokens spread over many lines
        # over tokens that merely repeat within a line.
        quality = len(token2lines[token]) * len(token2lines[token]) / count
        token2quality[token] = quality
        total_quality += quality
    limit = 0.5 * total_quality
    total = 0
    prev_support = 0
    prev_quality = 0
    prev_selected = True
    for token, quality in sorted(token2quality.items(),
                                 key=lambda item: -item[1]):
        support = len(token2lines[token])
        # Greedy prefix of the quality-sorted tokens: once a token is
        # rejected, everything after it is rejected too.
        selected = prev_selected and support > 1 and (
            # total < limit or quality == prev_quality or support >= prev_support)
            total < limit or support == size)
        total += quality
        prev_quality = quality
        prev_support = support
        prev_selected = selected
        yield Stat(token=token,
                   quality=quality,
                   count=f[token],
                   support=support,
                   selected=selected)
def compute_mutual_weights_iter(
        elements: List[Any],
        weight_f: Callable[[Any, Any], Optional[float]],
        node_f: Callable[[Any], Hashable]):
    debug(f"Computing weights, number of elements: {len(elements)}")
    for i in range(len(elements)):
        for j in range(i + 1, len(elements)):
            weight = weight_f(elements[i], elements[j])
            if weight is None:
                continue
            yield node_f(elements[i]), node_f(elements[j]), weight
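# Hedged usage sketch for compute_mutual_weights_iter (toy elements, identity
# node_f; not part of the pipeline). One edge is yielded per unordered pair
# whose weight is not None:
def _mutual_weights_sketch():
    edges = list(compute_mutual_weights_iter(
        ["ab", "abc", "xyz"],
        lambda a, b: len(set(a) & set(b)) or None,  # None skips disjoint pairs
        lambda e: e))
    print(edges)  # expected: [('ab', 'abc', 2)]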
def main_for_json(base_folder: str, out_resource_name: str):
    debug('Loading data...')
    data = json.load(sys.stdin)
    tdf, analyse_column_present = tdf_from_aggregated_json(
        data, base_folder, out_resource_name)
    annotate(tdf)
    tweak_schema(tdf)
    debug('Saving annotated data...')
    OUTPUT_FORMAT = os.environ.get("OF", "tsv")
    tdf.save_as(out_resource_name, fmt=OUTPUT_FORMAT)
def handle_GET(self):
    debug(f'GET {self.path}')
    if self.path.startswith("/insight/view") or self.path == "/favicon.ico":
        return self.handle_static()
    elif self.path.startswith("/insight/data"):
        resource = self.path[len("/insight/data/"):].split('?')[0]
        resource_parts = resource.split('/')
        df = tdf.resolve_df(resource_parts[1:])
        j = df.to_json(orient='records')
        return 200, 'application/json', j.encode('utf-8')
    else:
        return 404, 'text/plain', bytes(self.path, 'utf-8')
def bucketize(tokenized_strings) -> Dict[Tuple[str, ...], List[str]]:
    debug(f"Computing buckets for {len(tokenized_strings)} strings")
    selected: Set[str] = compute_selected(
        compute_stats_for_tokenized(tokenized_strings))
    pattern_to_tokenized_strings: Dict[Tuple[str, ...], List[str]] = defaultdict(list)
    for tokenized_string in tokenized_strings:
        raw_pattern, milestone_offsets = raw_pattern_and_milestone_offsets(
            tokenized_string, selected)
        pattern, pattern_milestone_offsets = collapse_successive_wildcards(
            raw_pattern)
        pattern_to_tokenized_strings[tuple(pattern)].append(tokenized_string)
    debug(f"Computed buckets for {len(tokenized_strings)} strings")
    return pattern_to_tokenized_strings
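# Hedged usage sketch for bucketize (hypothetical tokenized log lines). Lines
# whose selected "milestone" tokens collapse to the same wildcard pattern share
# a bucket; the exact grouping depends on the computed stats, so the comment
# below is illustrative only:
#
#   buckets = bucketize([['GET', '/a', '200'], ['GET', '/b', '200']])
#   for pattern, lines in buckets.items():
#       debug(pattern, len(lines))  # e.g. ('GET', None, '200') -> 2 lines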
def compute_stats(strings: Sequence[str]) -> Iterator[Stat]:
    debug(f"Computing stats for {len(strings)} lines")
    token2lines = defaultdict(list)  # or better just set of line indices!
    for s in strings:
        for token in set(tokenize(s)):
            token2lines[token].append(s)
    token_counts: Dict[Hashable, int] = compute_token_counts(tokenize_lines(strings))
    token2quality = {}
    total_quality = 0
    for token, count in token_counts.items():
        quality = len(token2lines[token]) * len(token2lines[token]) / count
        token2quality[token] = quality
        total_quality += quality
    limit = 0.5 * total_quality
    total = 0
    prev_support = 0
    prev_quality = 0
    prev_selected = True
    for token, quality in sorted(token2quality.items(),
                                 key=lambda item: -item[1]):
        support = len(token2lines[token])
        selected = prev_selected and support > 1 and (
            total < limit or quality == prev_quality or support >= prev_support)
        total += quality
        prev_quality = quality
        prev_support = support
        prev_selected = selected
        yield Stat(token=token,
                   quality=quality,
                   count=token_counts[token],
                   support=support,
                   selected=selected)
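# A minimal standalone sketch of the quality score used above (toy data, plain
# Counter instead of compute_token_counts; not part of the pipeline). For a
# token seen on `support` distinct lines with `count` total occurrences,
# quality = support**2 / count, so tokens spread across many lines outrank
# tokens that merely repeat within one line:
def _quality_sketch():
    from collections import Counter
    lines = [["GET", "/a", "200"], ["GET", "/b", "200"], ["GET", "/b", "/b"]]
    counts = Counter(t for line in lines for t in line)
    supports = Counter(t for line in lines for t in set(line))
    for token in counts:
        print(token, supports[token] ** 2 / counts[token])
    # GET -> 3.0, 200 -> 2.0, /b -> 1.33..., /a -> 1.0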
def annotate_lines(records: Sequence[Any], classify_field: str,
                   result_field: str):
    debug("Annotating")
    classified_fields = [j[classify_field] for j in records]
    tokenized_strings = [list(tokenize(s)) for s in classified_fields]
    buckets = make_buckets(tokenized_strings)
    group_to_lookup = invert(buckets)
    for record in records:
        message = record[classify_field]
        p = group_to_lookup[tuple(tokenize(message))]
        # 8-hex-digit code of the bucket pattern, matching hash_code_hex8.
        category = f'{hash(p) & 0xFFFFFFFF:08x}'
        record[result_field] = category
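# Hedged usage sketch for annotate_lines (hypothetical records and field names):
#
#   records = [{'msg': 'GET /a 200'}, {'msg': 'GET /b 200'}]
#   annotate_lines(records, classify_field='msg', result_field='category')
#   # each record gains a 'category' key: a hex code of its bucket pattern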
def auto_aggregate_by_groups(agg_groups):
    """ Quick-and-dirty, inefficient multi-group aggregation """
    debug(f'Automatically computing group runs by {agg_groups}')
    data = load_data()
    if agg_groups is None or len(agg_groups) == 0:
        return data
    leading_columns = [c for g in agg_groups for c in g]
    leading_columns_group_run_lengths = {
        c: compute_group_runs_and_median_by([c])[1]
        for c in leading_columns
    }
    # Longest median runs first.
    leading_columns.sort(key=leading_columns_group_run_lengths.get)
    leading_columns.reverse()
    debug(f'Column names, sorted by run lengths: {leading_columns}')
    return auto_aggregate_by_groups0(leading_columns)
def prune(singleton_transitions_graph: Dict[str, Dict[str, int]],
          result: Dict[str, Dict[str, int]]) -> None:
    items = singleton_transitions_graph.items()
    debug(f'Pruning graph, {len(items)} nodes')
    for source, edges in items:
        new_edges: Dict[str, int] = {}
        bidi_edges: int = 0
        uni_edges: int = 0
        for to in edges:
            edges_of_to: Dict[str, int] = singleton_transitions_graph.get(to)
            if edges_of_to is not None and source in edges_of_to:
                # Two-way edge: drop it.
                bidi_edges += 1
            else:
                new_edges[to] = edges.get(to)
                uni_edges += 1
        if uni_edges >= bidi_edges and source is not None:
            result[source] = new_edges
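# Standalone sketch of the pruning rule (hypothetical graph; calls prune above).
# An edge u -> v survives only if there is no back edge v -> u, and a node keeps
# its pruned edges only when one-way edges are at least as numerous as two-way
# ones:
def _prune_sketch():
    g = {'a': {'b': 1, 'c': 2}, 'b': {'a': 1}}
    result = {}
    prune(g, result)
    print(result)  # expected: {'a': {'c': 2}} -- 'b' had only a two-way edge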
def make_buckets(tokenized_strings) -> Dict[Tuple[str, ...], List[str]]:
    refined_buckets = initial_refined_buckets(tokenized_strings)
    # Perform analysis and synthesis: find similar buckets, merge them,
    # and re-do bucketing.
    debug("Making super-buckets")
    super_buckets_data = make_super_buckets(refined_buckets)
    debug("Refining super-buckets (1)")
    refined1 = refine_buckets(super_buckets_data)
    debug("Refining super-buckets (2)")
    refined2 = refine_buckets(refined1.values())
    debug("done")
    return refined2
def gather_and_scatter(
        buckets_dict,
        similarity_metric: Callable[[Any, Any], Optional[float]]) -> Dict:
    debug("Computing bucket centroids")
    for bucket in buckets_dict.values():
        bucket.compute_hashes_centroid_and_rmsd()
    debug("Computed bucket centroids")
    similarities: Dict[Hashable, Dict[Hashable, Any]] = compute_bucket_similarities_graph(
        list(buckets_dict.values()), similarity_metric)
    buckets_dict, crude_buckets = gather(buckets_dict, similarities)
    for crude_bucket in crude_buckets:
        # crude_bucket.compute_centroid_hashes()  # for future
        scatter_into(buckets_dict, buckets_dict,
                     crude_bucket.tokenized_strings)
    return buckets_dict
def gather(buckets, similarities: Dict[Hashable, Dict[Hashable, Any]]):
    debug("Computing connected components of buckets similarity graph")
    similar_buckets_pattern_list: List[List[Hashable]] = connected_components(
        similarities)
    debug(f"Computed {len(similar_buckets_pattern_list)} connected components "
          f"of buckets similarity graph")
    crude_buckets = []
    for similar_bucket_patterns in similar_buckets_pattern_list:
        # Merge every bucket of the connected component into one crude bucket.
        crude_bucket = Bucket()
        for bucket_pattern in similar_bucket_patterns:
            bucket = buckets.pop(bucket_pattern)
            crude_bucket.tokenized_strings.extend(bucket.tokenized_strings)
            crude_bucket.hashes[0].extend(bucket.hashes[0])
            crude_bucket.hashes[1].extend(bucket.hashes[1])
        crude_buckets.append(crude_bucket)
    return buckets, crude_buckets
def main_for_tsv(base_folder: str, in_resource_name: str,
                 out_resource_name: str):
    debug('Loading data...')
    INPUT_FORMAT = os.environ.get("IF", "tsv")
    tdf = TieredDataFrame(base_folder, in_resource_name, fmt=INPUT_FORMAT)
    debug('done')
    annotate(tdf)
    tweak_schema(tdf)
    debug('Saving annotated data...')
    OUTPUT_FORMAT = os.environ.get("OF", "tsv")
    tdf.save_as(out_resource_name, fmt=OUTPUT_FORMAT)
    debug('done')
def contiguous_strings(
        transitions: Dict[str, Tuple[Set[str], Set[str]]]
) -> typing.Set[typing.Tuple[str, ...]]:
    items = transitions.items()
    debug(f'Computing contiguous strings, items db: {len(items)} entries')
    result = set()
    for item, in_and_out_links in items:
        if not item:
            continue
        in_links: Set[str] = in_and_out_links[0]
        out_links: Set[str] = in_and_out_links[1]
        # A string can only start at an item with a single, non-boundary successor.
        if len(out_links) > 1 or None in out_links:
            continue
        if len(in_links) == 1:
            i = next(iter(in_links))
            if i and len(transitions[i][1]) <= 1:
                # The unique predecessor already extends into this item,
                # so this item is mid-string, not a start.
                continue
        string = [item]
        while True:
            next_items: Set[str] = transitions[item][1]
            if len(next_items) != 1:
                break
            item = next(iter(next_items))
            if not item:
                break
            in_links_of_next: Set[str] = transitions[item][0]
            if len(in_links_of_next) != 1:
                break
            string.append(item)
        if len(string) <= 1:
            continue
        result.add(tuple(string))
    return result
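# Hedged usage sketch for contiguous_strings (toy transitions; None marks a
# sequence boundary). Items linked by unique in/out edges are joined into one
# string, and mid-string items do not start their own:
def _contiguous_strings_sketch():
    transitions = {'a': ({None}, {'b'}),
                   'b': ({'a'}, {'c'}),
                   'c': ({'b'}, {None})}
    print(contiguous_strings(transitions))  # expected: {('a', 'b', 'c')}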
def __init__(self, parent, j, descriptor: Optional[Dict[str, Any]],
             pruned=None):
    self.parent = parent
    self.descriptor = descriptor
    self.paths_of_leaves = compute_paths_of_leaves(descriptor)
    self.pruned = pruned
    debug('compute_column_attrs')
    self.column_id_to_attrs: Dict[Hashable, ColumnAttrs] = {}
    for column_id in self.paths_of_leaves:
        self.column_id_to_attrs[column_id] = compute_column_attrs(
            j, column_id, child_by_path)
    debug('done')
    debug('compute_cross_column_attrs')
    compute_cross_column_attrs(j, self.column_id_to_attrs, child_by_path)
    debug('done')
def compute_non_milestone_transitions(
        tg_tdf: Iterable[Tuple[pd.Series, TieredDataFrame]],
        milestones: Container[str]) -> Dict[str, Tuple[Set[str], Set[str]]]:
    """
    :param tg_tdf: transaction-group rows paired with their TieredDataFrames
    :param milestones: items excluded from the traversed sequences
    :return: for every non-milestone, a set of preceding items within
        non-milestone sequences, and a set of items that follow within
        non-milestone sequences; None marks a sequence boundary.
    """
    debug('Computing non-milestone transitions')
    result = defaultdict(lambda: (set(), set()))
    for tx_keys, tx_tdf in tg_tdf:
        for sub_sequence in traverse_tx_non_milestone_strings(
                tx_tdf, milestones):
            previous = None
            for item in sub_sequence:
                result[previous][1].add(item)
                result[item][0].add(previous)
                previous = item
            if previous:
                result[previous][1].add(None)
                result[None][0].add(previous)
    return result
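# Hedged sketch of the result shape (toy run ['a', 'b'] inside one
# non-milestone sub-sequence; None marks the run boundary):
#
#   result['a'] == ({None}, {'b'})  # preceded by boundary, followed by 'b'
#   result['b'] == ({'a'}, {None})  # preceded by 'a', followed by boundary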
def auto_aggregation_groups() -> Optional[List]:
    all_column_names: Iterable[str] = compute_all_column_names()
    column_families: List = compute_column_families(all_column_names)
    debug(f'Column families: {column_families}')
    if column_families is None or len(column_families) <= 1:
        debug('No auto-aggregation groups')
        return None
    agg_groups = list(reversed(column_families[1:]))
    debug(f'Auto-aggregation groups: {agg_groups}')
    return agg_groups
def initial_refined_buckets(tokenized_strings):
    buckets = bucketize(tokenized_strings)
    debug("Initial refinement of buckets")
    refined_buckets: Dict[Tuple[str, ...], List[str]] = refine_buckets(
        buckets.values())
    debug("Completed initial refinement of buckets!")
    if len(refined_buckets) == 1:
        debug("Got only 1 initial refined bucket")
    return refined_buckets
def make_super_buckets(refined_buckets: Dict[Tuple[str, ...], List[str]]):
    debug("Computing buckets similarity graph")
    nodes = list(refined_buckets.keys())

    def new_metric(n1: Sequence, n2: Sequence) -> float:
        # Distance 0.0 when the tokens common to both patterns appear in the
        # same relative order, 1.0 otherwise.
        common = set(n1) & set(n2)
        return 0.0 if [i for i in n1 if i in common] == [
            i for i in n2 if i in common] else 1.0

    def normalized_levenstein_distance_metric(n1: Sequence, n2: Sequence) -> float:
        return 2.0 * levenshtein_distance(n1, n2) / (len(n1) + len(n2))

    def small_normalized_levenstein_distance_metric(d: float) -> bool:
        return d <= 0.5

    buckets_similarity_graph: Dict[Hashable, List[Hashable]] = discretize_graph(
        compute_weights_graph(nodes, new_metric, lambda n: n),
        small_normalized_levenstein_distance_metric)
    debug("Computed buckets similarity graph")
    debug("Computing connected components of buckets similarity graph")
    super_buckets = ConnectedComponents(buckets_similarity_graph).compute()
    debug("Computed connected components of buckets similarity graph")
    super_buckets_data = []
    for super_bucket in super_buckets:
        super_bucket_lines = []
        for p in super_bucket:
            super_bucket_lines.extend(refined_buckets[p])
        super_buckets_data.append(super_bucket_lines)
    return super_buckets_data
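# Hedged sketch of new_metric's behavior (toy patterns; new_metric is local to
# make_super_buckets). Two patterns are similar (distance 0.0) exactly when
# their shared tokens appear in the same relative order:
#
#   new_metric(('a', 'x', 'b'), ('a', 'y', 'b'))  # -> 0.0: common ('a', 'b') agree
#   new_metric(('a', 'b'), ('b', 'a'))            # -> 1.0: common order differs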
def trim(self):
    # Walk the pattern left to right; i/j track milestone positions, with
    # i_offset/j_offset the corresponding alignment-offset columns.
    i = -1
    i_offset = -1
    j = -1
    j_offset = -1
    debug('pattern length', len(self.pattern))
    while True:
        # Advance j to the next milestone (non-None pattern entry).
        while True:
            j += 1
            if j >= len(self.pattern):
                break
            debug('j', j, self.pattern[j])
            if self.pattern[j] is not None:
                j_offset += 1
                break
        if j >= len(self.pattern):
            break
        # Found a milestone.
        if i >= 0 and i + 1 < j:
            i += 1  # at the start of the wildcard area
            # Pull constant tokens out of the wildcard gap, column by column.
            while True:
                debug('scan columns', i_offset, j_offset)
                token = self.scan_column(self.alignment_offsets[i_offset], 1,
                                         self.alignment_offsets[j_offset])
                if token is None:
                    break
                self.pattern.insert(i, token)
                i += 1
                self.alignment_offsets.insert(
                    i_offset + 1,
                    Bucket.fill_column(self.alignment_offsets[i_offset], 1))
                # i_offset += 1
                # j_offset += 1
            if i == j:
                # The gap was fully resolved; drop the now-empty wildcard.
                del self.pattern[j]
        i = j
        i_offset = j_offset
def compute_singletons_allow_runs(tg_tdf):
    debug(f'Computing singletons; data length={len(tg_tdf)}')
    singletons: Set[str] = set()
    failed_singletons: Set[str] = set()
    for tx_keys, tx_tdf in tg_tdf:
        debug('-------------------------------------------------------------')
        debug('tx_keys')
        debug(tx_keys)
        debug('tx_tdf')
        debug(tx_tdf)
        debug('-------------------------------------------------------------')
        seen_items: Set[str] = set()
        for item in collapse_repeats(traverse_tx_items(tx_tdf)):
            if item in seen_items:
                failed_singletons.add(item)
            else:
                singletons.add(item)
                seen_items.add(item)
    singletons -= failed_singletons
    return singletons
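# Hedged sketch of the "allow runs" rule (hypothetical item stream). Immediate
# repeats are collapsed before the recurrence check, so a run does not
# disqualify an item, but a later comeback does:
#
#   'a a b a'  -> collapsed to 'a b a' -> 'a' recurs, only 'b' is a singleton
#   'c c c'    -> collapsed to 'c'     -> 'c' stays a singleton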
def annotate_group(tg_row, tg_tdf):
    # singletons = compute_singletons(tg_tdf)
    singletons = compute_singletons_allow_runs(tg_tdf)

    debug('Analyzing transitions')
    transitions = Transitions(singletons)
    for tx_keys, tx_tdf in tg_tdf:
        with transitions as tx:
            for index, row in tx_tdf.node_df.iterrows():
                tx(row[MSG_KIND_COLUMN])

    debug('Analyzing transitions (2)')
    transitions_2 = insight.logic.transitions2.Transitions()
    for tx_keys, tx_tdf in tg_tdf:
        with transitions_2 as tx:
            for index, row in tx_tdf.node_df.iterrows():
                tx(row[MSG_KIND_COLUMN])

    milestones: Dict[str, Dict[str, int]] = pruned(
        transitions.singleton_transitions)
    item_occurs_in_transactions, milestones_in_transactions = occurrences(
        tg_tdf, milestones)
    transaction_codes: List[str] = [
        hash_code_hex8(hash(tuple(e))) for e in milestones_in_transactions
    ]
    debug('Attaching column "custom"')
    tg_tdf.node_df['custom'] = transaction_codes

    # Earlier alternatives:
    # non_milestone_transitions = compute_non_milestone_transitions(tg_tdf, milestones)
    # non_milestone_strings = contiguous_strings(non_milestone_transitions)
    debug('Computing chains')
    transition_cliques: typing.Dict[str, typing.Set[str]] = \
        insight.logic.transitions2.infer_transition_cliques(transitions_2.summary)

    debug('Computing non-milestone codes and feature codes')
    non_milestone_codes = {}
    transition_cliques_values = {
        frozenset(e) for e in transition_cliques.values()
    }
    for clique in transition_cliques_values:
        non_milestone_string_code = hash_code_hex8(hash(frozenset(clique)))
        debug(non_milestone_string_code, clique)
        for item in clique:
            non_milestone_codes[item] = non_milestone_string_code

    debug('Computing co-occurrence codes')
    co_occurrence_codes: Dict[str, str] = {
        k: hash_code_hex8(hash(tuple(v)))
        for k, v in item_occurs_in_transactions.items()
    }
    debug('Attaching column "milestones"')
    tg_tdf.node_df['milestones'] = [
        ','.join(code for code in (co_occurrence_codes.get(i) for i in e))
        for e in milestones_in_transactions
    ]

    def item_code(r) -> str:
        i = r[MSG_KIND_COLUMN]
        return co_occurrence_codes.get(i) or non_milestone_codes.get(
            i) or "FFFFFFFF"

    active_chain_counter2: int
    active_chain_ids2: typing.Dict[str, str]
    active_chain_states: typing.Dict[str, int]
    active_chain_nestedness: typing.Dict[str, int]
    active_nestedness: int

    def active_chain_traversal_reset():
        nonlocal active_nestedness, active_chain_counter2, active_chain_ids2, \
            active_chain_states, active_chain_nestedness
        active_nestedness = -1
        # In every transaction, ids will start from 1 (not globally unique) -
        # ok for now.
        active_chain_counter2 = 0
        active_chain_ids2 = dict()
        active_chain_states = dict()

    def code2(row) -> typing.Tuple[typing.Union[str, None],
                                   typing.Union[str, None], bool, bool]:
        """
        :return: Tuple[chain type: str, chain id: str,
            chain started: bool, chain finished: bool]
        """
        nonlocal active_chain_counter2
        item = row[MSG_KIND_COLUMN]
        chain_type_id: str = non_milestone_codes.get(item)
        if not chain_type_id:
            return None, None, False, False
        current_chain_id: str = active_chain_ids2.get(chain_type_id)
        if not current_chain_id:
            active_chain_counter2 += 1
            current_chain_id = str(active_chain_counter2)
            active_chain_ids2[chain_type_id] = current_chain_id
            active_chain_states[chain_type_id] = 1  # saw first item of the chain
            return chain_type_id, current_chain_id, True, False
        else:
            active_chain_state = active_chain_states[chain_type_id] + 1
            length = len(transition_cliques[item])
            if active_chain_state == length:
                # Chain complete: forget it so a fresh one can start.
                del active_chain_ids2[chain_type_id]
                del active_chain_states[chain_type_id]
                return chain_type_id, current_chain_id, False, True
            else:
                active_chain_states[chain_type_id] = active_chain_state
                return chain_type_id, current_chain_id, False, False

    def chain_code2(row) -> str:
        return code2(row)[1] or ''

    def feature_code2(row) -> str:
        features = code2(row)
        non_milestone_code: str = features[0]
        if not non_milestone_code:
            return '*'
        return ('+' if features[2] else
                ('-' if features[3] else '=')) + non_milestone_code

    def compute_active_chain_nestedness(tx_tdf):
        nonlocal active_chain_nestedness
        # Running gauge; incremented when a chain starts, decremented when
        # finished.
        nestedness: int = -1
        for index, row in tx_tdf.node_df.iterrows():
            features = code2(row)
            current_chain_id = features[1]
            if not current_chain_id:
                continue
            if features[2]:
                nestedness += 1
            chain_nestedness = active_chain_nestedness.get(current_chain_id)
            if chain_nestedness:
                new_chain_nestedness = min(chain_nestedness, nestedness)
                if new_chain_nestedness != chain_nestedness:
                    active_chain_nestedness[current_chain_id] = new_chain_nestedness
            else:
                active_chain_nestedness[current_chain_id] = nestedness
            if features[3]:
                nestedness -= 1

    def chain_nestedness2_debug(row) -> int:
        nonlocal active_chain_nestedness, active_nestedness
        features = code2(row)
        current_chain_id: str = features[1]
        if not current_chain_id:
            return active_nestedness
        result = active_chain_nestedness.get(current_chain_id) or active_nestedness
        active_nestedness = result
        if features[3]:
            active_nestedness -= 1
        return result

    debug('Attaching annotation columns')
    for tx_keys, tx_tdf in tg_tdf:
        tx_tdf.node_df['s'] = tx_tdf.node_df.apply(
            lambda r: '1' if r[MSG_KIND_COLUMN] in singletons else '0', axis=1)
        tx_tdf.node_df['xhash'] = tx_tdf.node_df.apply(item_code, axis=1)
        active_chain_traversal_reset()
        active_chain_nestedness = dict()
        debug('Compute active chain nestedness')
        compute_active_chain_nestedness(tx_tdf)
        debug('Attaching "feature"')
        active_chain_traversal_reset()
        tx_tdf.node_df['feature'] = tx_tdf.node_df.apply(feature_code2, axis=1)
        debug('Attaching "break"')
        tx_tdf.node_df['break'] = compute_break_flags(tx_tdf, milestones)
        debug('Attaching "chain"')
        active_chain_traversal_reset()
        tx_tdf.node_df['chain'] = tx_tdf.node_df.apply(chain_code2, axis=1)
        debug('Attaching "nestedness"')
        active_chain_traversal_reset()
        tx_tdf.node_df['nestedness'] = tx_tdf.node_df.apply(
            chain_nestedness2_debug, axis=1)
def tweak_schema(tdf, columns_settings=None):
    debug('-------------------------------------------------------------')
    debug('Tweaking schema')
    debug('')
    debug('Schema tier 0')
    debug(tdf.schema_tiers[0])
    debug('Schema tier 1')
    debug(tdf.schema_tiers[1])
    debug('Schema tier 2')
    debug(tdf.schema_tiers[2])
    debug('')
    debug('Tweaks:')
    debug(columns_settings)
    debug('-------------------------------------------------------------')
    if columns_settings:
        tdf.schema_tiers[2] = tweak(tdf.schema_tiers[2], columns_settings)
    # hack: modify 'rid' column
    # tdf.schema_tiers[1][2]['name'] = 'message'
    # tdf.schema_tiers[1][2]['renderer'] = 'message'
    tdf.append_column(1, {
        'name': 'xhash',
        'renderer': 'generic',
        'hidden': 'true'
    })  # column that holds color
    tdf.append_column(1, {'name': 'milestones', 'renderer': 'striped'})
    tdf.append_column(2, {
        'name': 's',
        'renderer': 'hashHighlightedLiteralValue'
    })
    tdf.append_column(2, {
        'name': 'xhash',
        'renderer': 'generic',
        'hidden': 'true'
    })
    tdf.append_column(2, {
        'name': 'feature',
        'renderer': 'feature',
        'hidden': 'true'
    })
    tdf.append_column(2, {
        'name': 'break',
        'renderer': 'hashHighlightedLiteralValue',
        'hidden': 'true'
    })
    tdf.append_column(2, {'name': 'chain', 'renderer': 'generic'})
    tdf.append_column(2, {'name': 'nestedness', 'renderer': 'generic'})
    debug('-------------------------------------------------------------')
    debug('Tweaked schema')
    debug('')
    debug('Schema tier 0')
    debug(tdf.schema_tiers[0])
    debug('Schema tier 1')
    debug(tdf.schema_tiers[1])
    debug('Schema tier 2')
    debug(tdf.schema_tiers[2])
    debug('-------------------------------------------------------------')
def annotate(tdf):
    debug('Annotating')
    for tg_row, tg_tdf in tdf:
        annotate_group(tg_row, tg_tdf)
    debug('done')
def load_lines():
    debug("Loading data")
    lines = [line.rstrip('\n') for line in sys.stdin]
    debug("done")
    return lines