Example #1
def query(self, query, names=None, propagate=False, keyerror='ignore'):
    # UndefinedVariableError is raised by DataFrame.query when the expression
    # references a column missing from the frame
    # (pandas.errors.UndefinedVariableError in recent pandas versions).
    if names is None:
        names = list(self.dfs.keys())
    elif isinstance(names, str):
        names = [names]
    dfs = dict(self.dfs)
    seen = set()
    for name1 in names:
        if name1 in seen:
            continue
        try:
            dfs[name1] = dfs[name1].query(query)
            seen.add(name1)
        except UndefinedVariableError:
            if keyerror == "ignore":
                continue
            raise
        if propagate:
            # Restrict every not-yet-filtered frame to the rows whose shared
            # *_id columns still appear in the frame we just filtered.
            for name2, df2 in dfs.items():
                if name2 not in seen:
                    shared_ids = [c for c in dfs[name1].columns
                                  if c in df2.columns and c.endswith("_id")]
                    dfs[name2] = merge_with_spans(dfs[name1][shared_ids], df2,
                                                  how="inner")
                    seen.add(name2)
    return Dataset(**{key: df.copy() for key, df in dfs.items()})
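A minimal usage sketch, assuming a Dataset whose frames share *_id columns (the frame name "mentions" and the label column are hypothetical):

filtered = dataset.query("label == 'DRUG'", names="mentions", propagate=True)
# "mentions" is filtered directly; every other frame is then restricted,
# through its shared *_id columns, to the surviving mentions.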
Example #2
def apply_deltas(positions, deltas, on, position_columns=None):
    if not isinstance(on, (tuple, list)):
        on = [on]
    if position_columns is None:
        position_columns = {'begin': 'left', 'end': 'right'}

    positions = positions.copy()
    positions['_id_col'] = np.arange(len(positions))

    # Attach every delta to the positions that share its grouping keys
    mention_deltas = merge_with_spans(
        positions[[*position_columns, *on, '_id_col']],
        deltas,
        on=on,
        suffixes=('_pos', '_delta'),
        how='inner')
    # Drop categorical columns (they can only come from 'on') before the
    # remaining operations; this makes the evals and groupbys faster.
    mention_deltas = mention_deltas[[c for c in mention_deltas.columns
                                     if c not in on]]
    positions = positions.set_index('_id_col')
    mention_deltas = mention_deltas.set_index('_id_col')

    delta_col_map, positions_col_map = make_merged_names_map(
        deltas.columns, [*position_columns, *on, '_id_col'],
        left_on=on,
        right_on=on,
        suffixes=('_delta', '_pos'))
    for col, side in position_columns.items():
        # 'shift': total displacement contributed by deltas that end at or
        # before this position.
        mention_deltas.eval(
            f"shift = ({delta_col_map['end']} <= {positions_col_map[col]}) * {delta_col_map['delta']}",
            inplace=True)
        # 'between_magnet': flags positions that fall strictly inside an edited
        # span; they are snapped ("magnetized") to one of its edges below.
        mention_deltas.eval(
            f"between_magnet = {delta_col_map['begin']} < {positions_col_map[col]} and {positions_col_map[col]} < {delta_col_map['end']}",
            inplace=True)
        if side == "left":
            mention_deltas.eval(
                f"between_magnet = between_magnet * ({delta_col_map['begin']} - {positions_col_map[col]})",
                inplace=True)
        elif side == "right":
            mention_deltas.eval(
                f"between_magnet = between_magnet * ({delta_col_map['end']} + {delta_col_map['delta']} - {positions_col_map[col]})",
                inplace=True)
        order = "first" if side == "left" else "last"
        tmp = mention_deltas.sort_values([
            '_id_col', delta_col_map['begin' if side == 'left' else 'end']
        ]).groupby('_id_col').agg({
            "shift": "sum",
            **{
                n: order
                for n in mention_deltas.columns if n not in ("shift", "_id_col")
            }
        })
        positions[col] = positions[col].add(tmp['shift'] +
                                            tmp['between_magnet'],
                                            fill_value=0)
    positions = positions.reset_index(drop=True)
    return positions
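A quick sketch of the intended call on made-up data; following the code above, 'delta' is read as the signed length change of an edit spanning [begin, end):

import pandas as pd

# One document, two spans; a 5-character deletion at offsets [10, 15).
positions = pd.DataFrame({"doc_id": ["d1", "d1"],
                          "begin": [0, 20], "end": [4, 25]})
deltas = pd.DataFrame({"doc_id": ["d1"], "begin": [10],
                       "end": [15], "delta": [-5]})

shifted = apply_deltas(positions, deltas, on="doc_id")
# The span starting after the edit moves left by 5 (to [15, 20)); the span
# before it is unchanged.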
Example #3
def take(self, ids):
    main_df = self.dfs[self.main].iloc[ids].reset_index(drop=True)
    dfs = [main_df]
    for df in list(self.dfs.values())[1:]:
        if df is None:
            dfs.append(None)
            continue
        # Keep only the rows whose shared *_id columns survive in main_df.
        shared_ids = [c for c in main_df.columns
                      if c in df.columns and c.endswith('_id')]
        dfs.append(merge_with_spans(main_df[shared_ids], df, how="inner"))
    return Dataset(**dict(zip(self.dfs.keys(), dfs)))
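A short sketch of the intended call (the indices are arbitrary):

subset = dataset.take([0, 2, 5])
# The main frame keeps rows 0, 2 and 5; every other frame is reduced to the
# rows that still reference them through shared *_id columns.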
Example #4
def load_n2c2_2019_task3(validation_split=0.2, random_state=42, split="train"):
    path = env.resource("n2c2/")
    dataset = []
    for filename in sorted(os.listdir(path / '{}_norm'.format(split))):
        if filename.endswith('.norm'):
            with open(path / '{}_norm'.format(split) / filename) as f:
                for line in f:
                    (mention_id, label, *spans) = line.strip('\n').split('||')
                    begins = [int(b) for b in spans[::2]]
                    ends = [int(e) for e in spans[1::2]]
                    dataset.append({
                        "doc_id": filename.replace('.norm', ''),
                        "mention_id": mention_id,
                        "label": label,
                        "begin": begins,
                        "end": ends,
                    })
    texts = []
    for filename in sorted(os.listdir(path / '{}_note'.format(split))):
        if filename.endswith('.txt'):
            with open(path / '{}_note'.format(split) / filename) as f:
                texts.append({
                    "doc_id": filename.replace('.txt', ''),
                    "text": f.read().strip('\n')
                })
    with open(path / '{}_file_list.txt'.format(split)) as f:
        file_list = pd.Series([n.strip('\n') for n in f.readlines()],
                              name='doc_id')

    docs = merge_with_spans(file_list, pd.DataFrame(texts), on='doc_id')
    rng = check_random_state(random_state)
    if split == "train":
        docs['split'] = rng.choice(['train', 'val'],
                                   size=len(docs),
                                   p=[1 - validation_split, validation_split])
    else:
        docs['split'] = 'test'

    mentions = pd.DataFrame(dataset)
    fragments = (mentions[['doc_id', 'mention_id', 'begin', 'end']]
                 .nlstruct.flatten("fragment_id", tile_index=False)
                 .astype({"fragment_id": object}))
    return Dataset(docs=docs[["doc_id", "text", "split"]],
                   mentions=mentions[["doc_id", "mention_id", "label"]],
                   fragments=fragments[[
                       "doc_id", "mention_id", "fragment_id", "begin", "end"
                   ]])
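Assuming the n2c2 2019 task 3 files live under the environment's resource path, a typical call is:

dataset = load_n2c2_2019_task3(validation_split=0.2, random_state=42, split="train")
# docs gets a random train/val split column drawn with the requested
# proportions; mentions and fragments link back to docs through doc_id
# and mention_id.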
Example #5
def merge_pred_and_gold(pred,
                        gold,
                        on=('doc_id', ('begin', 'end'), 'label'),
                        span_policy='partial_strict',
                        atom_pred_level=None,
                        atom_gold_level=None,
                        suffixes=('_pred', '_gold')):
    """
    Performs an outer merge between pred and gold that can be in 3 configurations:
    - (pred == nan, gold != nan) => pred_count = 0, gold_count = 1, tp = 0
    - (pred != nan, gold == nan) => pred_count = 1, gold_count = 0, tp = 0
    - (pred != nan, gold != nan) => pred_count = 1, gold_count = 1, tp = 1
    How the merge is done is by trying to merge on the columns given in "on" and using the span policy "policy"
    to merge spans

    Parameters
    ----------
    pred: pd.DataFrame
    gold: pd.DataFrame
    on: typing.Sequence of (str or tuple)
    span_policy: str
    atom_pred_level: (typing.Sequence of str) or str
    atom_gold_level: (typing.Sequence of str) or str

    Returns
    -------
    pd.DataFrame
    """
    delete_atom_pred_level = delete_atom_gold_level = False
    if isinstance(atom_pred_level, (list, tuple)):
        pred = pred.assign(_pred_id=pred[atom_pred_level].nlstruct.factorize())
        atom_pred_level = '_pred_id'
        delete_atom_pred_level = True
    elif atom_pred_level is None:
        pred = pred.assign(_pred_id=np.arange(len(pred)))
        atom_pred_level = '_pred_id'
        delete_atom_pred_level = True
    if isinstance(atom_gold_level, (list, tuple)):
        gold = gold.assign(_gold_id=gold[atom_gold_level].nlstruct.factorize())
        atom_gold_level = '_gold_id'
        delete_atom_gold_level = True
    elif atom_gold_level is None:
        gold = gold.assign(_gold_id=np.arange(len(gold)))
        atom_gold_level = '_gold_id'
        delete_atom_gold_level = True

    merged = merge_with_spans(pred,
                              gold,
                              on=on,
                              how='inner',
                              span_policy=span_policy,
                              suffixes=suffixes)

    overlap_size_names = [
        c for c in merged.columns if c.startswith("overlap_size_")
    ]
    merged = merged.groupby([atom_pred_level, atom_gold_level],
                            as_index=False, observed=True).agg({
        **{n: 'sum' for n in overlap_size_names},
        **{n: 'first' for n in merged.columns
           if n not in (*overlap_size_names, atom_pred_level, atom_gold_level)}
    })
    if overlap_size_names:
        merged = merged.sort_values(overlap_size_names)
    res = None
    if not len(merged):
        res = merged.iloc[:0]
    while len(merged):
        # Greedy one-to-one matching: after the sort above, keep the best
        # candidate per gold atom, then per pred atom, and repeat on the
        # leftover rows until nothing can be matched anymore.
        tmp = merged
        tmp = tmp.groupby(atom_gold_level, as_index=False, observed=True).last()
        tmp = tmp.groupby(atom_pred_level, as_index=False, observed=True).last()
        res = pd.concat([res, tmp]) if res is not None else tmp
        merged = merged[np.logical_and(
            ~merged[atom_pred_level].isin(res[atom_pred_level]),
            ~merged[atom_gold_level].isin(res[atom_gold_level]))]

    pred = pred.groupby([atom_pred_level], as_index=False,
                        observed=True).last()
    gold = gold.groupby([atom_gold_level], as_index=False,
                        observed=True).last()
    res = pd.concat(
        (res, pred[~pred[atom_pred_level].isin(res[atom_pred_level])][list(
            set(res.columns) & set(pred.columns))],
         gold[~gold[atom_gold_level].isin(res[atom_gold_level])][list(
             set(res.columns) & set(gold.columns))]),
        sort=False)

    res['pred_count'] = (~res[atom_pred_level].isnull()).astype(int)
    res['gold_count'] = (~res[atom_gold_level].isnull()).astype(int)
    res['tp'] = res['pred_count'] * res['gold_count']
    res['root'] = 0

    res = res.drop(
        columns=(([atom_pred_level] if delete_atom_pred_level else []) +
                 ([atom_gold_level] if delete_atom_gold_level else [])))

    return res
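Since the function adds pred_count, gold_count and tp columns, micro precision, recall and F1 follow directly (pred and gold are hypothetical mention frames with doc_id, begin, end and label columns):

merged = merge_pred_and_gold(pred, gold, on=('doc_id', ('begin', 'end'), 'label'))
precision = merged['tp'].sum() / merged['pred_count'].sum()
recall = merged['tp'].sum() / merged['gold_count'].sum()
f1 = 2 * precision * recall / (precision + recall)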
Example #6
def partition_spans(smalls,
                    large,
                    overlap_policy="merge_large",
                    new_id_name="sample_id",
                    span_policy="partial_strict"):
    """

    Parameters
    ----------
    smalls: pd.DataFrame[begin, end, ...]
        Ex: tokens
    large: pd.DataFrame[begin, end, ...]
        Ex: sentences
    overlap_policy: str or bool
        One of
        - merge_large:
            Keeps small untouched but merges large spans that overlap the same small span
            ex: partition_spans(mentions, sentences) -> merges sentences
        - small_to_leftmost_large:
            Keeps small and large untouched, and assigns small to the leftmost large that overlaps it
            ex: partition_spans(tokens, mentions) -> assign token to the leftmost mention that touches it
        - small_to_rightmost_large:
            Keeps small and large untouched, and assigns small to the rightmost large that overlaps it
            ex: partition_spans(tokens, mentions) -> assign token to the rightmost mention that touches it
        - small_to_biggest_overlap_large:
            keeps small and large untouched, and assigns small to the large span that overlaps it the most
            ex: partition_spans(tokens, mentions) -> assign token to the mention that overlaps it the most
        - False
            do nothing and allow multiple matchings between small and large
    new_id_name: str
        If overlap_policy == "merge_large", this is the column that will host the newly created ids per merge
    span_policy:
        Which policy to use to detect span overlaps

    Returns
    -------

    """

    assert overlap_policy in (
        "merge_large", "split_small", "small_to_leftmost_large",
        "small_to_rightmost_large", "small_to_biggest_overlap_large",
        False), f"Unknown small overlap policy '{overlap_policy}'"

    if not isinstance(smalls, (list, tuple)):
        smalls = [smalls]

    merged_id_cols = doc_id_cols = None
    if overlap_policy == "merge_large":
        original_new_id_name = new_id_name
        while new_id_name in large.columns:
            new_id_name = "_" + new_id_name
        large = large.copy()
        old_to_new = None
        has_created_new_id_col = False
        for small in smalls:
            doc_id_cols, small_id_cols, large_id_cols, small_val_cols, large_val_cols = preprocess_ids(
                large, small)
            large_id_cols = list(large_id_cols)
            # Merge sentences and mentions
            merged = merge_with_spans(small,
                                      large,
                                      span_policy=span_policy,
                                      how='right',
                                      on=[*doc_id_cols, ("begin", "end")])
            # If a mention overlaps multiple large spans, merge those spans
            # under a single new id
            small_ids = merged[doc_id_cols + small_id_cols].nlstruct.factorize(
                group_nans=False)
            if has_created_new_id_col:
                large_ids = merged[doc_id_cols +
                                   [new_id_name]].nlstruct.factorize(
                                       group_nans=False)
            else:
                large_ids = merged[doc_id_cols +
                                   large_id_cols].nlstruct.factorize(
                                       group_nans=False)
            merged[new_id_name] = make_id_from_merged(
                large_ids, small_ids, apply_on=[(0, large_ids)])[0]
            merged["begin"] = merged[['begin_x', 'begin_y']].min(axis=1)
            merged["end"] = merged[['end_x', 'end_y']].max(axis=1)
            large = (merged
                     .groupby(new_id_name, as_index=False, observed=True)
                     .agg({**{n: 'first'
                              for n in [*doc_id_cols, *large_id_cols]
                              if n != new_id_name},
                           'begin': 'min', 'end': 'max'})
                     .astype({"begin": int, "end": int,
                              **large[doc_id_cols].dtypes}))
            large = large[doc_id_cols + [new_id_name] + ["begin", "end"]]
            # Re-number merged spans by their start position within each document
            large[new_id_name] = large['begin']
            large = large.nlstruct.groupby_assign(
                doc_id_cols,
                {new_id_name: lambda x: tuple(np.argsort(np.argsort(x)))})
            old_to_new = large[doc_id_cols +
                               [new_id_name]].drop_duplicates().reset_index(
                                   drop=True)
            merged_id_cols = [new_id_name]
    else:
        original_new_id_name = None
        doc_id_cols, small_id_cols, large_id_cols, small_val_cols, large_val_cols = preprocess_ids(
            large, smalls[0])
        merged_id_cols = large_id_cols
        new_id_name = None
        old_to_new = None

    # Merge sentences and mentions
    new_smalls = []
    for small in smalls:
        doc_id_cols, small_id_cols, large_id_cols, small_val_cols, large_val_cols = preprocess_ids(
            large, small)
        merged = merge_with_spans(small,
                                  large[doc_id_cols + large_id_cols +
                                        ['begin', 'end']],
                                  how='inner',
                                  span_policy=span_policy,
                                  on=[*doc_id_cols, ("begin", "end")])

        if overlap_policy == "small_to_biggest_overlap_large":
            merged = merged.sort_values([
                *doc_id_cols, *small_id_cols, 'overlap_size_0'
            ]).drop_duplicates([*doc_id_cols, *small_id_cols], keep="last")
        elif overlap_policy == "small_to_leftmost_large":
            merged = merged.sort_values(
                [*doc_id_cols, *small_id_cols,
                 'begin_y']).drop_duplicates([*doc_id_cols, *small_id_cols],
                                             keep="first")
        elif overlap_policy == "small_to_rightmost_large":
            merged = merged.sort_values(
                [*doc_id_cols, *small_id_cols,
                 'begin_y']).drop_duplicates([*doc_id_cols, *small_id_cols],
                                             keep="last")
        elif overlap_policy == "split_small":
            merged = merged.assign(begin_x=np.maximum(merged['begin_x'],
                                                      merged['begin_y']),
                                   end_x=np.minimum(merged['end_x'],
                                                    merged['end_y']))
        # Re-express small spans relative to the start of their large span
        new_small = (merged
                     .assign(begin=merged["begin_x"] - merged["begin_y"],
                             end=merged["end_x"] - merged["begin_y"])
                     .astype({"begin": int, "end": int})
                     [[*doc_id_cols, *(merged_id_cols or ()), *small_id_cols,
                       *small_val_cols, "begin", "end"]])
        if new_id_name:
            new_small[new_id_name] = new_small[new_id_name].astype(str)
            new_small[new_id_name] = new_small[new_id_name].str.zfill(
                new_small[new_id_name].str.len().max())
            new_small[original_new_id_name] = join_cols(
                new_small[doc_id_cols + (
                    [new_id_name] if new_id_name not in doc_id_cols else [])],
                "/")
            new_small = new_small.drop(columns={*doc_id_cols, new_id_name} -
                                       {original_new_id_name})

        new_smalls.append(new_small)

    if original_new_id_name:
        if new_id_name:
            large[new_id_name] = large[new_id_name].astype(str)
            large[new_id_name] = large[new_id_name].str.zfill(
                large[new_id_name].str.len().max())
            large[original_new_id_name] = join_cols(
                large[doc_id_cols + [new_id_name]], "/")
            large = large.drop(columns={*doc_id_cols, new_id_name} -
                               {original_new_id_name})
            new_doc_id_cols = [
                c if c != original_new_id_name else f'_{c}'
                for c in doc_id_cols
            ]

            old_to_new[new_id_name] = old_to_new[new_id_name].astype(str)
            old_to_new[new_id_name] = old_to_new[new_id_name].str.zfill(
                old_to_new[new_id_name].str.len().max())
            (
                old_to_new[original_new_id_name],
                old_to_new[new_doc_id_cols],
            ) = (
                join_cols(old_to_new[doc_id_cols + [new_id_name]], "/"),
                old_to_new[doc_id_cols])
            if new_id_name not in (*new_doc_id_cols, original_new_id_name):
                del old_to_new[new_id_name]
        new_smalls = [
            small.astype(
                {original_new_id_name: large[original_new_id_name].dtype})
            for small in new_smalls
        ]
    return new_smalls, large, old_to_new
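A sketch of the merge_large policy on made-up frames (doc_id, begin, end follow the expected schema; sentence_id and mention_id are hypothetical id columns):

import pandas as pd

sentences = pd.DataFrame({"doc_id": ["d1", "d1"], "sentence_id": [0, 1],
                          "begin": [0, 20], "end": [20, 40]})
mentions = pd.DataFrame({"doc_id": ["d1"], "mention_id": ["m0"],
                         "begin": [15], "end": [25]})  # straddles both sentences

(new_mentions,), new_large, old_to_new = partition_spans(
    [mentions], sentences, overlap_policy="merge_large", new_id_name="sample_id")
# Both sentences end up merged under one sample_id, and the mention's
# begin/end are re-expressed relative to that sample's start.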
Example #7
def encode_as_tag(small,
                  large,
                  label_cols=None,
                  tag_names=None,
                  tag_scheme="bio",
                  use_token_idx=False,
                  verbose=0,
                  groupby=None):
    """

    Parameters
    ----------
    tag_names: str or list of str
        tag name that will be created for each label
    tag_scheme: str
        BIO/BIOUL tagging scheme
    small: tokens
    large: mentions
    label_cols: "label"
    use_token_idx: Use token pos instead of char spans, defaults to False
    verbose: int
        If verbose > 0, make progress bar

    Returns
    -------
    pd.DataFrame
    """
    assert tag_scheme in ("bio", "bioul", "raw")

    doc_id_cols, small_id_cols, large_id_cols, small_val_cols, large_val_cols = preprocess_ids(
        large, small)
    assert len(large_val_cols) > 0, "Must have a column to encode as tags"
    if label_cols is None:
        label_cols = large_val_cols
    if isinstance(label_cols, str):
        label_cols = [label_cols]
    if tag_names is None:
        tag_names = label_cols
    if isinstance(tag_names, str):
        tag_names = [tag_names]

    label_categories = {}
    # Map mentions to small as a tag
    large = large.sort_values([*doc_id_cols, "begin", "end"])
    groups = (large.groupby(groupby, as_index=False, observed=True)
              if groupby is not None else [(None, large)])
    for label, mentions_of_group in groups:
        assert label not in large_val_cols, f"Cannot groupby {label} value because there is already a column with this name"
        group_tag_names = [
            "/".join(s for s in (label, tag_name) if s is not None)
            for tag_name in tag_names
        ]
        if use_token_idx:
            merged = merge_with_spans(
                mentions_of_group,
                small[[
                    *doc_id_cols, *small_id_cols,
                    *(c for c in small_val_cols if c != "token_idx"),
                    "token_idx"
                ]],
                on=doc_id_cols,
                suffixes=('_large',
                          '')).query("begin <= token_idx and token_idx < end")
        else:
            merged = merge_with_spans(mentions_of_group,
                                      small,
                                      span_policy='partial_strict',
                                      on=[*doc_id_cols, ("begin", "end")],
                                      suffixes=('_large', ''))

        # If a token overlaps multiple mentions, assign it to the last mention
        len_before = len(merged)
        merged = merged.drop_duplicates([*doc_id_cols, *small_id_cols],
                                        keep='last')
        if len_before - len(merged) > 0:
            warn(
                f"Dropped {len_before-len(merged)} duplicated tags caused by overlapping mentions"
            )
        merged_id_cols = doc_id_cols + large_id_cols + small_id_cols

        # Encode mention labels as a tag
        tags = (merged[merged_id_cols +
                       label_cols].sort_values(merged_id_cols))
        if tag_scheme != "raw":
            keep_cols = list(
                set(doc_id_cols + large_id_cols) - set(label_cols))
            tags = (
                # convert all categorical dtypes of the group cols to plain types
                # (str, int, object, ...) to accelerate concatenation inside the groupby
                tags.astype({
                    k: dtype if not hasattr(dtype, 'categories') else
                    dtype.categories.dtype
                    for k, dtype in tags.dtypes[keep_cols].items()
                }).rename(dict(zip(label_cols, group_tag_names)),
                          axis=1).nlstruct.groupby_assign(
                              doc_id_cols + large_id_cols, {
                                  tag_name: lambda labels: make_tag_scheme(
                                      len(labels), labels.iloc[0], tag_scheme)
                                  for tag_name, label_col in zip(
                                      group_tag_names, label_cols)
                              })
                # convert back each group column dtype to its original categorical dtype
                .astype(tags.dtypes[keep_cols])
                [doc_id_cols + small_id_cols + group_tag_names])

        small = small.merge(tags, on=doc_id_cols + small_id_cols, how="left")
        if tag_scheme != "raw":
            try:
                for tag_name, label_col in zip(group_tag_names, label_cols):
                    col = mentions_of_group[label_col]
                    unique_labels = (col.cat.categories if hasattr(col, 'cat') else
                                     sorted(set(label for label in col if label is not None)))
                    label_categories[tag_name] = unique_labels
                    small[tag_name] = small[tag_name].fillna("O").astype(
                        pd.CategoricalDtype([
                            "O", *(tag for label in unique_labels
                                   for tag in ("B-" + str(label),
                                               "I-" + str(label)))
                        ] if tag_scheme == "bio" else [
                            "O", *(tag for label in unique_labels
                                   for tag in ("B-" + str(label), "I-" +
                                               str(label), "L-" + str(label),
                                               "U-" + str(label)))
                        ]))
            except Exception as e:
                raise Exception(
                    f"Error occurred during the encoding of label column '{label_col}' into tag '{tag_name}'"
                ) from e
    return small, label_categories
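For instance, with token and mention frames sharing doc_id (the frames and their token_id / mention_id columns are hypothetical):

tagged_tokens, categories = encode_as_tag(tokens, mentions,
                                          label_cols="label", tag_scheme="bio")
# Each token gains a categorical tag column with values such as
# 'O', 'B-DRUG', 'I-DRUG', ...; categories maps each tag column to the
# label list it was built from.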