Example No. 1
def gather_and_scatter2(
        buckets_dict, similarity_metric: Callable[[Any, Any],
                                                  Optional[float]]) -> Dict:

    debug("Computing bucket centroids")
    for bucket in buckets_dict.values():
        bucket.compute_hashes_centroid_and_rmsd()
    debug("Computed bucket centroids")

    similarities: Dict[Hashable,
                       Dict[Hashable,
                            Any]] = compute_bucket_similarities_graph(
                                list(buckets_dict.values()), similarity_metric)
    buckets_dict, crude_buckets = gather(buckets_dict, similarities)
    for crude_bucket in crude_buckets:
        crude_bucket.compute_hashes_centroid_and_rmsd()
        if crude_bucket.hashes_rmsd > 15:
            scatter_into(buckets_dict, buckets_dict,
                         crude_bucket.tokenized_strings)
        else:
            new_pattern = crude_bucket.tokenized_strings[0]
            crude_bucket.pattern = new_pattern
            buckets_dict[tuple(new_pattern)] = crude_bucket

    return buckets_dict
Example No. 2
def read_fd_or_default(fd, default):
    try:
        with os.fdopen(fd, 'r') as f:
            debug(f'Reading from FD {fd}')
            return json.load(f)
    except Exception:
        return default
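A minimal usage sketch (not part of the original code; assumes read_fd_or_default and its debug helper are importable from this module). A parent process writes JSON into a pipe and hands over the read end's descriptor; anything that goes wrong falls back to the default.

import json
import os

r_fd, w_fd = os.pipe()
with os.fdopen(w_fd, 'w') as w:
    json.dump({'threshold': 15}, w)  # hypothetical payload

# An invalid FD or malformed JSON simply yields the default.
config = read_fd_or_default(r_fd, default={'threshold': 10})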
Example No. 3
    def do_OPTIONS(self):
        debug(f'OPTIONS {self.path}')
        resource = self.path[len("/insight/data/"):].split('?')[0]
        resource_parts = resource.split('/')
        tier_schema = tdf.schema_tiers[len(resource_parts) - 1]
        self.respond(200, 'application/javascript',
                     json.dumps(tier_schema).encode('utf-8'))
Example No. 4
def scatter_into(buckets_dict_to, buckets_dict_from, tokenized_strings):
    debug(f"Computing buckets for {len(tokenized_strings)} strings")
    stats = list(compute_stats_for_tokenized(tokenized_strings))
    token_to_quality = {stat.token: stat.quality for stat in stats}
    selected: Set[str] = compute_selected(stats)
    for tokens in tokenized_strings:
        sim_hash = seq_sim_hash(tokens, token_to_quality.get)
        raw_pattern, milestone_offsets = raw_pattern_and_milestone_offsets(
            tokens, selected)
        pattern, pattern_milestone_offsets = collapse_successive_wildcards(
            raw_pattern)
        pattern_tuple = tuple(pattern)

        bucket_from = buckets_dict_from.get(pattern_tuple)
        bucket_to = buckets_dict_to.get(pattern_tuple)
        if bucket_from is None and bucket_to is None:
            buckets_dict_to[pattern_tuple] = bucket_to = Bucket(
                pattern, len(pattern_milestone_offsets))
        elif bucket_from is not None and bucket_to is None:
            del buckets_dict_from[pattern_tuple]
            buckets_dict_to[pattern_tuple] = bucket_to = bucket_from
        # else: bucket_to already exists in buckets_dict_to; append to it below

        bucket_to.append(tokens, milestone_offsets, sim_hash)
    debug(f"Computed buckets for {len(tokenized_strings)} strings")

    return buckets_dict_to
Example No. 5
def compute_group_runs_and_median_by(run_columns):
    debug(f'Computing group runs by {run_columns}')
    result = []
    run_dict = None
    run_values = None
    run_lengths = []
    all_j = load_data()
    for j in all_j:
        row_run_dict = {}
        row_values = {}

        for column, value in j.items():
            if column in run_columns:
                row_run_dict[column] = value
            else:
                row_values[column] = value

        if row_run_dict != run_dict:
            # debug(f'Change: {row_run_dict} {run_dict}')
            if run_dict is not None:
                run_dict['_'] = run_values
                run_lengths.append(len(run_values))
                result.append(run_dict)
            run_dict = row_run_dict
            run_values = []

        run_values.append(row_values)
    if run_values:  # run_values stays None when load_data() yields no rows
        run_dict['_'] = run_values
        run_lengths.append(len(run_values))
        result.append(run_dict)
    median_run_length = median(run_lengths) if run_lengths else 0
    debug(f'Median run length: {median_run_length}')
    return result, median_run_length
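A worked illustration of the run grouping above, on hypothetical rows (assuming load_data() yields dicts in input order):

# With run_columns=['host'] and load_data() yielding
#   {'host': 'a', 'msg': 'x'}, {'host': 'a', 'msg': 'y'}, {'host': 'b', 'msg': 'z'}
# the result would be
#   [{'host': 'a', '_': [{'msg': 'x'}, {'msg': 'y'}]},
#    {'host': 'b', '_': [{'msg': 'z'}]}]
# with run lengths [2, 1] and a median run length of 1.5.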
Example No. 6
def compute_stats_for_tokenized(
        tokenized_strings: Sequence[Sequence[str]]) -> Iterator[Stat]:
    size = len(tokenized_strings)
    debug(f"Computing stats for {size} lines")
    token2lines = defaultdict(list)  # or better just set of line indices!
    for tokenized_string in tokenized_strings:
        token_set = set(tokenized_string)
        for token in token_set:
            token2lines[token].append(tokenized_string)

    f = compute_token_counts((token for tokenized_string in tokenized_strings
                              for token in tokenized_string))

    token2quality = {}
    total_quality = 0
    # total_support = 0
    for token, count in f.items():
        quality = len(token2lines[token]) * len(token2lines[token]) / count
        token2quality[token] = quality
        total_quality += quality
        # total_support += len(token2lines[token])

    # total_count = 0
    # for count in f.values():
    #     total_count += count

    limit = 0.5 * total_quality
    total = 0
    prev_support = 0
    prev_quality = 0
    prev_selected = True
    i = 0
    for token, quality in sorted(token2quality.items(),
                                 key=lambda item: -item[1]):
        # count = f[token]
        support = len(token2lines[token])
        # and i < len(s)
        selected = prev_selected and support > 1 and (
            # total < limit or quality == prev_quality or support >= prev_support)
            total < limit or support == size)

        total += quality
        prev_quality = quality
        prev_support = support
        prev_selected = selected
        i += 1

        yield Stat(token=token,
                   quality=quality,
                   count=f[token],
                   support=support,
                   selected=selected)
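A short worked example of the quality score above (my reading of the formula, assuming compute_token_counts returns total occurrence counts): quality is support squared divided by count, so broad coverage across lines beats raw repetition within a line.

# Hypothetical numbers over 4 tokenized lines:
#   a token occurring once in each of the 4 lines: support=4, count=4,
#     quality = 4 * 4 / 4 = 4.0
#   a token occurring 8 times inside a single line: support=1, count=8,
#     quality = 1 * 1 / 8 = 0.125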
Example No. 7
def compute_mutual_weights_iter(
        elements: List[Any],
        weight_f: Callable[[Any, Any], Optional[float]],
        node_f: Callable[[Any], Hashable]):

    debug(f"Computing weights, number of elements: {len(elements)}")

    for i in range(len(elements)):
        for j in range(i + 1, len(elements)):
            weight = weight_f(elements[i], elements[j])
            if weight is None:
                continue
            yield node_f(elements[i]), node_f(elements[j]), weight
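A minimal consumption sketch (hypothetical metric; assumes the module's debug helper is in scope): the generator yields one edge per unordered pair and skips pairs for which the metric returns None.

def length_gap(a: str, b: str) -> Optional[float]:
    gap = abs(len(a) - len(b))
    return float(gap) if gap <= 2 else None  # None suppresses the edge

edges = list(
    compute_mutual_weights_iter(['foo', 'bar', 'bazzz'], length_gap,
                                lambda e: e))
# -> [('foo', 'bar', 0.0), ('foo', 'bazzz', 2.0), ('bar', 'bazzz', 2.0)]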
Example No. 8
def main_for_json(base_folder: str, out_resource_name: str):
    debug('Loading data...')
    data = json.load(sys.stdin)

    tdf, analyse_column_present = tdf_from_aggregated_json(
        data, base_folder, out_resource_name)

    annotate(tdf)
    tweak_schema(tdf)

    debug('Saving annotated data...')
    OUTPUT_FORMAT = os.environ.get("OF", "tsv")
    tdf.save_as(out_resource_name, fmt=OUTPUT_FORMAT)
Example No. 9
    def handle_GET(self):
        debug(f'GET {self.path}')
        if self.path.startswith(
                "/insight/view") or self.path == "/favicon.ico":
            return self.handle_static()
        elif self.path.startswith("/insight/data"):
            resource = self.path[len("/insight/data/"):].split('?')[0]
            resource_parts = resource.split('/')
            df = tdf.resolve_df(resource_parts[1:])
            j = df.to_json(orient='records')
            return 200, 'application/json', j.encode('utf-8')
        else:
            return 404, 'text/plain', bytes(self.path, 'utf-8')
Example No. 10
def bucketize(tokenized_strings) -> Dict[Tuple[str, ...], List[str]]:
    debug(f"Computing buckets for {len(tokenized_strings)} strings")
    selected: Set[str] = compute_selected(
        compute_stats_for_tokenized(tokenized_strings))
    pattern_to_tokenized_strings: Dict[Tuple[str, ...],
                                       List[str]] = defaultdict(list)
    for tokenized_string in tokenized_strings:
        raw_pattern, milestone_offsets = raw_pattern_and_milestone_offsets(
            tokenized_string, selected)
        pattern, pattern_milestone_offsets = collapse_successive_wildcards(
            raw_pattern)
        pattern_to_tokenized_strings[tuple(pattern)].append(tokenized_string)
    debug(f"Computed buckets for {len(tokenized_strings)} strings")
    return pattern_to_tokenized_strings
Example No. 11
def compute_stats(strings: Sequence[str]) -> Iterator[Stat]:
    debug(f"Computing stats for {len(strings)} lines")
    token2lines = defaultdict(list)  # or better just set of line indices!
    for s in strings:
        token_set = {token for token in tokenize(s)}
        for token in token_set:
            token2lines[token].append(s)

    token_counts: Dict[Hashable,
                       int] = compute_token_counts(tokenize_lines(strings))

    token2quality = {}
    total_quality = 0
    total_support = 0
    for token, count in token_counts.items():
        quality = len(token2lines[token]) * len(token2lines[token]) / count
        token2quality[token] = quality
        total_quality += quality
        total_support += len(token2lines[token])

    total_count = 0
    for count in token_counts.values():
        total_count += count

    limit = 0.5 * total_quality
    total = 0
    prev_support = 0
    prev_quality = 0
    prev_selected = True
    i = 0
    for token, quality in sorted(token2quality.items(),
                                 key=lambda item: -item[1]):
        # count = f[token]
        support = len(token2lines[token])
        # and i < len(s)
        selected = prev_selected and support > 1 and (
            total < limit or quality == prev_quality
            or support >= prev_support)

        total += quality
        prev_quality = quality
        prev_support = support
        prev_selected = selected
        i += 1

        yield Stat(token=token,
                   quality=quality,
                   count=token_counts[token],
                   support=support,
                   selected=selected)
Example No. 12
def annotate_lines(records: Sequence[Any], classify_field: str,
                   result_field: str):
    debug("Annotating")
    classified_fields = [j[classify_field] for j in records]
    tokenized_strings = [list(tokenize(s)) for s in classified_fields]

    buckets = make_buckets(tokenized_strings)
    group_to_lookup = invert(buckets)

    for record in records:
        message = record[classify_field]
        p = group_to_lookup[tuple(tokenize(message))]
        category = f'{hash(p) & 0xFFFFFFFF:02x}'
        record[result_field] = category
Example No. 13
def auto_aggregate_by_groups(agg_groups):
    """ Quick-and-dirty, inefficient multi-group aggregation """
    debug(f'Automatically computing group runs by {agg_groups}')
    data = load_data()
    if agg_groups is None or len(agg_groups) == 0:
        return data
    leading_columns = [c for g in agg_groups for c in g]
    leading_columns_group_run_lengths = {
        c: compute_group_runs_and_median_by([c])[1]
        for c in leading_columns
    }
    leading_columns.sort(key=leading_columns_group_run_lengths.get)
    leading_columns.reverse()
    debug(f'Column names, sorted by run lengths: {leading_columns}')

    return auto_aggregate_by_groups0(leading_columns)
Example No. 14
def prune(singleton_transitions_graph: Dict[str, Dict[str, int]],
          result: Dict[str, Dict[str, int]]) -> None:
    items = singleton_transitions_graph.items()
    debug(f'Pruning graph, {len(items)} nodes')
    for source, edges in items:
        new_edges: Dict[str, int] = {}
        bidi_edges: int = 0
        uni_edges: int = 0
        for to in edges:
            edges_of_to = singleton_transitions_graph.get(to)  # None if absent
            if edges_of_to is not None and source in edges_of_to:
                bidi_edges += 1
            else:
                new_edges[to] = edges[to]
                uni_edges += 1
        if uni_edges >= bidi_edges and source is not None:
            result[source] = new_edges
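A small worked example of the pruning rule (hypothetical graph): an edge survives only if its target does not link back, and a source node is kept only when its one-way edges are at least as numerous as its two-way ones.

# graph = {'a': {'b': 3, 'c': 1}, 'b': {'a': 2}}
# Source 'a': the edge to 'b' is bidirectional, the edge to 'c' is not,
#   so uni_edges == bidi_edges == 1 and result['a'] == {'c': 1}.
# Source 'b': its only edge is bidirectional (0 uni < 1 bidi), so 'b' is
#   left out of the result.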
Example No. 15
def make_buckets(tokenized_strings) -> Dict[Tuple[str, ...], List[str]]:
    refined_buckets = initial_refined_buckets(tokenized_strings)

    # Perform analysis and synthesis: find similar buckets, merge them, and re-do bucketing.
    debug("Making super-buckets")
    super_buckets_data = make_super_buckets(refined_buckets)
    debug("Refining super-buckets (1)")
    refined1 = refine_buckets(super_buckets_data)
    debug("Refining super-buckets (2)")
    refined2 = refine_buckets(refined1.values())
    debug("done")
    return refined2
Example No. 16
def gather_and_scatter(
        buckets_dict, similarity_metric: Callable[[Any, Any],
                                                  Optional[float]]) -> Dict:

    debug("Computing bucket centroids")
    for bucket in buckets_dict.values():
        bucket.compute_hashes_centroid_and_rmsd()
    debug("Computed bucket centroids")

    similarities: Dict[Hashable,
                       Dict[Hashable,
                            Any]] = compute_bucket_similarities_graph(
                                list(buckets_dict.values()), similarity_metric)
    buckets_dict, crude_buckets = gather(buckets_dict, similarities)
    for crude_bucket in crude_buckets:
        # crude_bucket.compute_centroid_hashes()  # for future
        scatter_into(buckets_dict, buckets_dict,
                     crude_bucket.tokenized_strings)

    return buckets_dict
Example No. 17
def gather(buckets, similarities: Dict[Hashable, Dict[Hashable, Any]]):
    debug("Computing connected components of buckets similarity graph")
    similar_buckets_pattern_list: List[List[Hashable]] = connected_components(
        similarities)
    debug(
        f"Computed {len(similar_buckets_pattern_list)} connected components of buckets similarity graph"
    )

    crude_buckets = []
    for similar_bucket_patterns in similar_buckets_pattern_list:
        crude_bucket = Bucket()
        for bucket_pattern in similar_bucket_patterns:
            bucket = buckets.pop(bucket_pattern)

            crude_bucket.tokenized_strings.extend(bucket.tokenized_strings)
            crude_bucket.hashes[0].extend(bucket.hashes[0])
            crude_bucket.hashes[1].extend(bucket.hashes[1])
        crude_buckets.append(crude_bucket)
    return buckets, crude_buckets
Example No. 18
def main_for_tsv(base_folder: str, in_resource_name: str,
                 out_resource_name: str):
    debug('Loading data...')
    INPUT_FORMAT = os.environ.get("IF", "tsv")
    tdf = TieredDataFrame(base_folder, in_resource_name, fmt=INPUT_FORMAT)
    debug('done')

    annotate(tdf)
    tweak_schema(tdf)

    debug('Saving annotated data...')
    OUTPUT_FORMAT = os.environ.get("OF", "tsv")
    tdf.save_as(out_resource_name, fmt=OUTPUT_FORMAT)
    debug('done')
Example No. 19
def contiguous_strings(
    transitions: Dict[str, Tuple[Set[str], Set[str]]]
) -> typing.Set[typing.Tuple[str, ...]]:
    items = transitions.items()
    debug(f'Computing contiguous strings, items db: {len(items)} entries')
    result = set()
    for item, in_and_out_links in items:
        if not item:
            continue

        in_links: Set[str] = in_and_out_links[0]
        out_links: Set[str] = in_and_out_links[1]

        if len(out_links) > 1 or None in out_links:
            continue

        if len(in_links) == 1:
            i = next(iter(in_links))
            if i and len(transitions[i][1]) <= 1:
                continue

        string = [item]
        while True:
            next_items: Set[str] = transitions[item][1]
            if len(next_items) != 1:
                break
            item = next(iter(next_items))
            if not item:
                break
            in_links_of_next: Set[str] = transitions[item][0]
            if len(in_links_of_next) != 1:
                break
            string.append(item)

        if len(string) <= 1:
            continue

        result.add(tuple(string))

    return result
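A worked example on hypothetical transitions (each item maps to its sets of incoming and outgoing items, with None marking a sequence boundary):

# transitions = {None: (set(), {'a'}),
#                'a': ({None}, {'b'}),
#                'b': ({'a'}, {'c'}),
#                'c': ({'b'}, {None})}
# Only 'a' qualifies as a chain start ('b' is skipped because its sole
# predecessor has a single successor, 'c' because it leads to a boundary),
# and the walk a -> b -> c stops at None, so the result is {('a', 'b', 'c')}.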
Example No. 20
    def __init__(self, parent, j, descriptor: Optional[Dict[str, Any]], pruned=None):
        self.parent = parent
        self.descriptor = descriptor
        self.paths_of_leaves = compute_paths_of_leaves(descriptor)
        self.pruned = pruned
        debug('compute_column_attrs')
        self.column_id_to_attrs: Dict[Hashable, ColumnAttrs] = {}

        for column_id in self.paths_of_leaves:
            self.column_id_to_attrs[column_id] = compute_column_attrs(j, column_id, child_by_path)
        debug('done')

        debug('compute_cross_column_attrs')
        compute_cross_column_attrs(j, self.column_id_to_attrs, child_by_path)
        debug('done')
Example No. 21
def compute_non_milestone_transitions(
        tg_tdf: Iterable[Tuple[pd.Series, TieredDataFrame]],
        milestones: Container[str]) -> Dict[str, Tuple[Set[str], Set[str]]]:
    """
    :param tg_tdf: iterable of (group keys, TieredDataFrame) pairs
    :param milestones: milestone items
    :return: for every non-milestone item, a set of preceding items within
        non-milestone sequences, and a set of items that follow within
        non-milestone sequences.
    """
    debug('Computing non-milestone transitions')
    result = defaultdict(lambda: (set(), set()))
    for tx_keys, tx_tdf in tg_tdf:
        for sub_sequence in traverse_tx_non_milestone_strings(
                tx_tdf, milestones):
            previous = None
            for item in sub_sequence:
                result[previous][1].add(item)
                result[item][0].add(previous)
                previous = item
            if previous:
                result[previous][1].add(None)
                result[None][0].add(previous)

    return result
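A small illustration of the returned structure for a single hypothetical non-milestone run ['x', 'y']:

# result[None] == ({'y'}, {'x'})   # 'x' opens the run, 'y' closes it
# result['x']  == ({None}, {'y'})
# result['y']  == ({'x'}, {None})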
Example No. 22
def auto_aggregation_groups() -> Optional[List]:
    all_column_names: Iterable[str] = compute_all_column_names()
    column_families: List = compute_column_families(all_column_names)
    debug(f'Column families: {column_families}')
    if column_families is None or len(column_families) <= 1:
        debug('No auto-aggregation groups')
        return None
    agg_groups = list(reversed(column_families[1:]))
    debug(f'Auto-aggregation groups: {agg_groups}')
    return agg_groups
Example No. 23
def initial_refined_buckets(tokenized_strings):
    buckets = bucketize(tokenized_strings)
    debug("Initial refinement of buckets")
    refined_buckets: Dict[Tuple[str, ...],
                          List[str]] = refine_buckets(buckets.values())
    debug("Completed initial refinement of buckets!")
    if len(refined_buckets) == 1:
        debug("Got only 1 initial refined bucket")
    return refined_buckets
Example No. 24
def make_super_buckets(refined_buckets: Dict[Tuple[str, ...], List[str]]):
    debug("Computing buckets similarity graph")
    nodes = list(refined_buckets.keys())

    # debug()
    # debug(nodes)
    # debug()

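    # The metric below treats two patterns as similar (distance 0.0) when the
    # tokens they share appear in the same relative order in both patterns,
    # and as dissimilar (1.0) otherwise; the 0.5 threshold passed to
    # discretize_graph then presumably keeps only the 0.0 edges.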
    def new_metric(n1: Sequence, n2: Sequence) -> float:
        n1_set = set(n1)
        n2_set = set(n2)
        common = n1_set & n2_set
        return 0.0 if list(i for i in n1
                           if i in common) == list(i for i in n2
                                                   if i in common) else 1.0

    def normalized_levenstein_distance_metric(n1: Sequence,
                                              n2: Sequence) -> float:
        return 2.0 * levenshtein_distance(n1, n2) / (len(n1) + len(n2))

    def small_normalized_levenstein_distance_metric(d: float) -> bool:
        return d <= 0.5

    buckets_similarity_graph: Dict[
        Hashable, List[Hashable]] = discretize_graph(
            compute_weights_graph(nodes, new_metric, lambda n: n),
            small_normalized_levenstein_distance_metric)
    debug("Computed buckets similarity graph")

    debug("Computing connected components of buckets similarity graph")
    super_buckets = ConnectedComponents(buckets_similarity_graph).compute()
    debug("Computed connected components of buckets similarity graph")

    super_buckets_data = []
    for super_bucket in super_buckets:
        super_bucket_lines = []
        for p in super_bucket:
            super_bucket_lines.extend(refined_buckets[p])
        super_buckets_data.append(super_bucket_lines)
    return super_buckets_data
Example No. 25
    def trim(self):
        i = -1
        i_offset = -1
        j = -1
        j_offset = -1
        debug('pattern length', len(self.pattern))
        while True:
            while True:
                j += 1
                if j >= len(self.pattern):
                    break
                debug('j', j, self.pattern[j])
                if self.pattern[j] is not None:
                    j_offset += 1
                    break
            if j >= len(self.pattern):
                break
            # found milestone

            if i >= 0 and i + 1 < j:
                i += 1  # at the start of wildcard area
                while True:
                    debug('scan columns', i_offset, j_offset)
                    token = self.scan_column(self.alignment_offsets[i_offset],
                                             1,
                                             self.alignment_offsets[j_offset])
                    if token is None:
                        break
                    self.pattern.insert(i, token)
                    i += 1
                    self.alignment_offsets.insert(
                        i_offset + 1,
                        Bucket.fill_column(self.alignment_offsets[i_offset],
                                           1))
                    # i_offset += 1
                    # j_offset += 1
                if i == j:
                    del self.pattern[j]

            i = j
            i_offset = j_offset
Example No. 26
def compute_singletons_allow_runs(tg_tdf):
    debug(f'Computing singletons; data length={len(tg_tdf)}')
    singletons: Set[str] = set()
    failed_singletons: Set[str] = set()
    for tx_keys, tx_tdf in tg_tdf:
        debug('-------------------------------------------------------------')
        debug('tx_keys')
        debug(tx_keys)
        debug('tx_tdf')
        debug(tx_tdf)
        debug('-------------------------------------------------------------')

        seen_items: Set[str] = set()
        for item in collapse_repeats(traverse_tx_items(tx_tdf)):
            if item in seen_items:
                failed_singletons.add(item)
            else:
                singletons.add(item)
            seen_items.add(item)
    singletons -= failed_singletons
    return singletons
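A quick illustration of the "allow runs" behavior (hypothetical items, assuming collapse_repeats drops consecutive duplicates):

# Transaction items A, A, A, B collapse to A, B, so both stay singletons;
# items A, A, B, A collapse to A, B, A, so A is seen twice and lands in
# failed_singletons, while B remains a singleton (unless it fails elsewhere).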
Example No. 27
def annotate_group(tg_row, tg_tdf):
    # singletons = compute_singletons(tg_tdf)
    singletons = compute_singletons_allow_runs(tg_tdf)
    debug('Analyzing transitions')
    transitions = Transitions(singletons)
    for tx_keys, tx_tdf in tg_tdf:
        with transitions as tx:
            for index, row in tx_tdf.node_df.iterrows():
                tx(row[MSG_KIND_COLUMN])
    debug('Analyzing transitions (2)')
    transitions_2 = insight.logic.transitions2.Transitions()
    for tx_keys, tx_tdf in tg_tdf:
        with transitions_2 as tx:
            for index, row in tx_tdf.node_df.iterrows():
                tx(row[MSG_KIND_COLUMN])
    milestones: Dict[str,
                     Dict[str,
                          int]] = pruned(transitions.singleton_transitions)
    item_occurs_in_transactions, milestones_in_transactions = occurrences(
        tg_tdf, milestones)
    transaction_codes: List[str] = [
        hash_code_hex8(hash(tuple(e))) for e in milestones_in_transactions
    ]
    debug('Attaching column "custom"')
    tg_tdf.node_df['custom'] = transaction_codes
    # non_milestone_transitions = compute_non_milestone_transitions(tg_tdf, milestones)
    # non_milestone_strings: typing.Set[typing.Tuple[str]] = contiguous_strings(non_milestone_transitions)
    # chains: typing.Set[typing.Tuple[str]] = insight.logic.transitions2.chains(transitions_2.summary)
    debug('Computing chains')
    # non_milestone_strings: typing.Set[typing.Tuple[str]] = insight.logic.transitions2.chains(transitions_2.summary)
    transition_cliques: typing.Dict[
        str,
        typing.Set[str]] = insight.logic.transitions2.infer_transition_cliques(
            transitions_2.summary)
    # non_milestone_strings: typing.Set[typing.Tuple[str]] = insight.logic.transitions2.infer_transition_cliques_tuples(transitions_2.summary)
    # debug(non_milestone_strings)
    debug('Computing non-milestone codes and feature codes')
    non_milestone_codes = {}
    transition_cliques_values = {
        frozenset(e)
        for e in transition_cliques.values()
    }
    for clique in transition_cliques_values:
        non_milestone_string_code = hash_code_hex8(hash(frozenset(clique)))
        debug(non_milestone_string_code, clique)
        for item in clique:
            non_milestone_codes[item] = non_milestone_string_code
    debug('Computing co-occurrence codes')
    co_occurrence_codes: Dict[str, str] = {
        k: hash_code_hex8(hash(tuple(v)))
        for k, v in item_occurs_in_transactions.items()
    }
    debug('Attaching column "milestones"')
    tg_tdf.node_df['milestones'] = [
        ','.join(co_occurrence_codes.get(i) for i in e)
        for e in milestones_in_transactions
    ]

    def item_code(r) -> str:
        i = r[MSG_KIND_COLUMN]
        return co_occurrence_codes.get(i) or non_milestone_codes.get(
            i) or "FFFFFFFF"

    active_chain_counter2: int
    active_chain_ids2: typing.Dict[str, str]
    active_chain_states: typing.Dict[str, int]
    active_chain_nestedness: typing.Dict[str, int]
    active_nestedness: int

    def active_chain_traversal_reset():
        nonlocal active_nestedness, active_chain_counter2, active_chain_ids2, active_chain_states, active_chain_nestedness
        active_nestedness = -1
        active_chain_counter2 = 0  # in every transaction, ids will start from 1 (not globally unique) - ok for now
        active_chain_ids2 = dict()
        active_chain_states = dict()

    def code2(
        row
    ) -> typing.Tuple[typing.Union[str, None], typing.Union[str, None], bool,
                      bool]:
        """
        :return: Tuple[chain type: str, chain id: str, chain started: bool, chain finished: bool]
        """
        nonlocal active_chain_counter2
        item = row[MSG_KIND_COLUMN]
        chain_type_id: str = non_milestone_codes.get(item)
        if not chain_type_id:
            return None, None, False, False

        current_chain_id: str = active_chain_ids2.get(chain_type_id)
        if not current_chain_id:
            active_chain_counter2 += 1
            current_chain_id = str(active_chain_counter2)
            active_chain_ids2[chain_type_id] = current_chain_id
            active_chain_states[
                chain_type_id] = 1  # saw first item of the chain
            return chain_type_id, current_chain_id, True, False
        else:
            active_chain_state = active_chain_states[chain_type_id] + 1
            length = len(transition_cliques[item])
            # debug(transition_cliques[item], length, active_chain_state)
            if active_chain_state == length:
                # debug('DEL')
                del active_chain_ids2[chain_type_id]
                del active_chain_states[chain_type_id]
                return chain_type_id, current_chain_id, False, True
            else:
                active_chain_states[chain_type_id] = active_chain_state
                return chain_type_id, current_chain_id, False, False

    def chain_code2(row) -> str:
        return code2(row)[1] or ''

    def feature_code2(row) -> str:
        features = code2(row)
        non_milestone_code: str = features[0]
        if not non_milestone_code:
            return '*'
        return ('+' if features[2] else
                ('-' if features[3] else '=')) + non_milestone_code

    def compute_active_chain_nestedness(tx_tdf):
        nonlocal active_chain_nestedness
        # debug('=========================')
        nestedness: int = -1  # Running gauge; incremented when a chain starts, decremented when finished.
        for index, row in tx_tdf.node_df.iterrows():
            features = code2(row)

            current_chain_id = features[1]
            if not current_chain_id:
                # debug(row['hash'], nestedness)
                continue

            if features[2]:
                # debug('++')
                nestedness += 1

            # debug(row['hash'], nestedness, current_chain_id)
            chain_nestedness = active_chain_nestedness.get(current_chain_id)
            # debug(chain_nestedness)
            if chain_nestedness is not None:  # 0 is a valid nestedness level
                new_chain_nestedness = min(chain_nestedness, nestedness)
                if new_chain_nestedness != chain_nestedness:
                    active_chain_nestedness[
                        current_chain_id] = new_chain_nestedness
                    # debug(current_chain_code, '->', new_chain_nestedness)
            else:
                active_chain_nestedness[current_chain_id] = nestedness
            if features[3]:
                # debug('--')
                nestedness -= 1

    def chain_nestedness2_debug(row) -> int:
        nonlocal active_chain_nestedness, active_nestedness
        features = code2(row)
        current_chain_id: str = features[1]
        if not current_chain_id:
            return active_nestedness

        result = active_chain_nestedness.get(current_chain_id)
        if result is None:  # 0 is a valid nestedness level, so avoid `or`
            result = active_nestedness
        active_nestedness = result
        if features[3]:
            active_nestedness -= 1
        return result

    debug('Attaching annotation columns')
    for tx_keys, tx_tdf in tg_tdf:
        tx_tdf.node_df['s'] = tx_tdf.node_df.apply(
            lambda r: '1' if r[MSG_KIND_COLUMN] in singletons else '0', axis=1)
        tx_tdf.node_df['xhash'] = tx_tdf.node_df.apply(item_code, axis=1)
        # tx_tdf.node_df['feature'] = tx_tdf.node_df.apply(lambda r: feature_codes.get(r['hash'], '*'), axis=1)

        active_chain_traversal_reset()
        active_chain_nestedness = dict()
        debug('Compute active chain nestedness')
        compute_active_chain_nestedness(tx_tdf)

        debug('Attaching "feature"')
        active_chain_traversal_reset()
        tx_tdf.node_df['feature'] = tx_tdf.node_df.apply(feature_code2, axis=1)

        debug('Attaching "break"')
        tx_tdf.node_df['break'] = compute_break_flags(tx_tdf, milestones)

        debug('Attaching "chain"')
        active_chain_traversal_reset()
        tx_tdf.node_df['chain'] = tx_tdf.node_df.apply(chain_code2, axis=1)

        debug('Attaching "nestedness"')
        active_chain_traversal_reset()
        # debug('=========================')
        # for k, v in active_chain_nestedness.items():
        #     debug(k, v)
        # debug('=========================')
        tx_tdf.node_df['nestedness'] = tx_tdf.node_df.apply(
            chain_nestedness2_debug, axis=1)
Example No. 28
def tweak_schema(tdf, columns_settings=None):
    debug('-------------------------------------------------------------')
    debug('Tweaking schema')
    debug('')
    debug('Schema tier 0')
    debug(tdf.schema_tiers[0])
    debug('Schema tier 1')
    debug(tdf.schema_tiers[1])
    debug('Schema tier 2')
    debug(tdf.schema_tiers[2])
    debug('')
    debug('Tweaks:')
    debug(columns_settings)
    debug('-------------------------------------------------------------')

    if columns_settings:
        tdf.schema_tiers[2] = tweak(tdf.schema_tiers[2], columns_settings)

    # hack: modify 'rid' column
    # tdf.schema_tiers[1][2]['name'] = 'message'
    # tdf.schema_tiers[1][2]['renderer'] = 'message'
    tdf.append_column(1, {
        'name': 'xhash',
        'renderer': 'generic',
        'hidden': 'true'
    })  # column that holds color
    tdf.append_column(1, {'name': 'milestones', 'renderer': 'striped'})

    tdf.append_column(2, {
        'name': 's',
        'renderer': 'hashHighlightedLiteralValue'
    })
    tdf.append_column(2, {
        'name': 'xhash',
        'renderer': 'generic',
        'hidden': 'true'
    })
    tdf.append_column(2, {
        'name': 'feature',
        'renderer': 'feature',
        'hidden': 'true'
    })
    tdf.append_column(
        2, {
            'name': 'break',
            'renderer': 'hashHighlightedLiteralValue',
            'hidden': 'true'
        })
    tdf.append_column(2, {'name': 'chain', 'renderer': 'generic'})
    tdf.append_column(2, {'name': 'nestedness', 'renderer': 'generic'})
    debug('-------------------------------------------------------------')
    debug('Tweaked schema')
    debug('')
    debug('Schema tier 0')
    debug(tdf.schema_tiers[0])
    debug('Schema tier 1')
    debug(tdf.schema_tiers[1])
    debug('Schema tier 2')
    debug(tdf.schema_tiers[2])
    debug('-------------------------------------------------------------')
Example No. 29
def annotate(tdf):
    debug('Annotating')
    for tg_row, tg_tdf in tdf:
        annotate_group(tg_row, tg_tdf)
    debug('done')
Example No. 30
def load_lines():
    debug("Loading data")
    lines = [line.rstrip('\n') for line in sys.stdin]
    debug("done")
    return lines