Example #1
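These helpers appear to come from a Watson NLU response reader built on the
text_extensions_for_pandas package. All four examples assume the imports below;
the module paths for util and TokenSpanArray are an assumption based on that
package's layout, not something the snippets themselves confirm:

import warnings
from typing import Dict, List, Tuple, Union

import numpy as np
import pandas as pd
import pyarrow as pa

# Assumed module paths; adjust to match the actual package layout
from text_extensions_for_pandas.array.token_span import TokenSpanArray
from text_extensions_for_pandas.io.watson import util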
def _make_syntax_dataframes(syntax_response, original_text):
    tokens = syntax_response.get("tokens", [])
    sentences = syntax_response.get("sentences", [])

    if len(tokens) > 0:
        token_table = util.make_table(tokens)
        location_col, location_name = util.find_column(token_table, "location")
        text_col, text_name = util.find_column(token_table, "text")
        char_span = util.make_char_span(location_col, text_col, original_text)

        # Drop the location and text columns, which are duplicated in char_span
        token_table = token_table.drop([location_name, text_name])

        # Add the span column to the tokens DataFrame
        token_df = token_table.to_pandas()
        token_df['span'] = char_span
    else:
        char_span = None
        token_df = pd.DataFrame()

    if len(sentences) > 0:
        sentence_table = util.make_table(sentences)
        sentence_df = sentence_table.to_pandas()
        if char_span is not None:
            location_col, _ = util.find_column(sentence_table, "location")
            text_col, _ = util.find_column(sentence_table, "text")
            sentence_char_span = util.make_char_span(location_col, text_col,
                                                     original_text)
            sentence_span = TokenSpanArray.align_to_tokens(
                char_span, sentence_char_span)
            sentence_df['span'] = sentence_char_span
            sentence_df['sentence_span'] = sentence_span
    else:
        sentence_df = pd.DataFrame()

    return token_df, sentence_df
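A minimal usage sketch for this helper. The payload shape follows the Watson
NLU "syntax" feature output; the concrete values are invented:

text = "Hello world."
syntax_response = {
    "tokens": [
        {"text": "Hello", "location": [0, 5]},
        {"text": "world", "location": [6, 11]},
        {"text": ".", "location": [11, 12]},
    ],
    "sentences": [
        {"text": "Hello world.", "location": [0, 12]},
    ],
}
token_df, sentence_df = _make_syntax_dataframes(syntax_response, text)
# token_df gains a "span" column; sentence_df gains "span" (character offsets)
# and "sentence_span" (the same spans aligned to token boundaries).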
Example #2
def _make_entity_dataframes(
        entities: List,
        original_text: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Create the entities and entity_mentions DataFrames.

    :param entities: The "entities" section of a parsed NLU response
    :param original_text: Text of the document. This argument must be provided
     if there are entity mention spans.
    :returns: A pair of DataFrames: entities and entity_mentions (the latter
     is empty when the response contains no mentions).
    """
    if len(entities) == 0:
        return pd.DataFrame(), pd.DataFrame()

    table = util.make_table(entities)

    # Check if response includes entity mentions
    mention_name_cols = [(name, table.column(name))
                         for name in table.column_names
                         if name.lower().startswith("mentions")]

    # Make entities and entity mentions (optional) DataFrames
    if len(mention_name_cols) > 0:
        mention_names, mention_cols = zip(*mention_name_cols)

        # Create the entities DataFrame with mention arrays dropped
        table = table.drop(mention_names)
        pdf = table.to_pandas()

        # Flatten the mention arrays to be put in separate table
        mention_arrays = [
            pa.concat_arrays(col.iterchunks()) for col in mention_cols
        ]
        flat_mention_arrays = [a.flatten() for a in mention_arrays]
        table_mentions = pa.Table.from_arrays(flat_mention_arrays,
                                              names=mention_names)

        # Convert location/text columns to span
        location_col, location_name = util.find_column(table_mentions,
                                                       "location")
        text_col, text_name = util.find_column(table_mentions, "text")
        if original_text is None:
            raise ValueError(
                "Unable to construct target text for converting entity mentions to spans"
            )

        char_span = util.make_char_span(location_col, text_col, original_text)
        table_mentions = table_mentions.drop([location_name, text_name])

        # Create the entity_mentions DataFrame
        pdf_mentions = table_mentions.to_pandas()
        pdf_mentions["span"] = char_span

        # Align index of parent entities DataFrame with flattened DataFrame and ffill
        # values
        mention_offsets = mention_arrays[0].offsets.to_numpy()
        pdf_parent = pdf.set_index(mention_offsets[:-1])
        pdf_parent = pdf_parent.reindex(pdf_mentions.index, method="ffill")

        # Add columns from entities parent DataFrame
        pdf_mentions["text"] = pdf_parent["text"]
        pdf_mentions["type"] = pdf_parent["type"]

        # Strip the "mentions." prefix from the flattened column names
        pdf_mentions.rename(
            columns={c: c.split("mentions.")[-1] for c in pdf_mentions.columns},
            inplace=True)
    else:
        pdf = table.to_pandas()
        pdf_mentions = pd.DataFrame()

    return pdf, pdf_mentions
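A usage sketch, assuming util.make_table flattens the nested mention objects
into the "mentions.*" list columns this function expects. Field names follow
the Watson NLU entities format; the values are invented:

text = "Ada founded Acme. Ada leads it."
entities = [
    {"type": "Person", "text": "Ada", "relevance": 0.99,
     "mentions": [{"text": "Ada", "location": [0, 3], "confidence": 0.9},
                  {"text": "Ada", "location": [18, 21], "confidence": 0.8}]},
    {"type": "Company", "text": "Acme", "relevance": 0.95,
     "mentions": [{"text": "Acme", "location": [12, 16], "confidence": 0.9}]},
]
entities_df, mentions_df = _make_entity_dataframes(entities, text)
# mentions_df holds one row per mention, with a "span" column plus the parent
# entity's "text" and "type" forward-filled onto each of its mentions.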
Example #3
def _make_relations_dataframe_zero_copy(relations):
    if len(relations) == 0:
        return pd.DataFrame()

    table = util.make_table(relations)

    # Separate each argument into a column
    flattened_arguments = []
    drop_cols = []
    for name in table.column_names:
        if name.lower().startswith("arguments"):
            col = pa.concat_arrays(table.column(name).iterchunks())
            assert pa.types.is_list(col.type)
            is_nested_list = pa.types.is_list(col.type.value_type)

            name_split = name.split('.', maxsplit=1)
            first_list = col[0]
            num_arguments = len(first_list)

            null_count = 0

            # Get the flattened raw values
            raw = col
            offset_arrays = []
            while pa.types.is_list(raw.type):
                offset_arrays.append(raw.offsets)
                null_count += raw.null_count
                raw = raw.flatten()

            # TODO handle lists with null values
            if null_count > 0:
                continue

            # Convert values to numpy
            values = raw.to_numpy(zero_copy_only=False)  # string might copy
            offsets_list = [o.to_numpy() for o in offset_arrays]

            # Compute the length of each list in the array
            value_offsets = offsets_list.pop()
            value_lengths = value_offsets[1:] - value_offsets[:-1]

            # Separate the arguments into individual columns
            for i in range(num_arguments):
                arg_name = "{}.{}.{}".format(name_split[0], i, name_split[1])
                arg_lengths = value_lengths[i::num_arguments]

                # Fixed length arrays can be sliced
                if not is_nested_list or len(np.unique(arg_lengths)) == 1:
                    num_elements = len(first_list[i]) if is_nested_list else 1

                    # Only 1 element so leave in primitive array
                    if not is_nested_list or num_elements == 1:
                        arg_values = values[i::num_arguments]
                        arg_array = pa.array(arg_values)
                    # Multiple elements so put back in a list array
                    else:
                        arg_values = values.reshape(
                            [len(col) * num_arguments, num_elements])
                        # Rows alternate across arguments, so argument i owns
                        # every num_arguments-th row
                        arg_values = arg_values[i::num_arguments]
                        arg_values = arg_values.flatten()
                        arg_offsets = np.cumsum(arg_lengths)
                        arg_offsets = np.insert(arg_offsets, 0, 0)
                        arg_array = pa.ListArray.from_arrays(
                            arg_offsets, arg_values)
                else:
                    # TODO Argument properties with variable length arrays not currently
                    #  supported
                    continue

                flattened_arguments.append((arg_array, arg_name))
            drop_cols.append(name)

    # Add the flattened argument columns
    for arg_array, arg_name in flattened_arguments:
        table = table.append_column(arg_name, arg_array)

    # Drop columns that have been flattened
    table = table.drop(drop_cols)

    return table.to_pandas()
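A usage sketch with a minimal "relations" payload (field names follow the
Watson NLU relations format; values are invented, and util.make_table is
assumed to flatten the argument structs into "arguments.*" list columns).
Both arguments here have fixed-length location lists, so they flatten cleanly:

text = "Ada works at Acme."
relations = [
    {"type": "employedBy", "sentence": text, "score": 0.86,
     "arguments": [{"text": "Ada", "location": [0, 3]},
                   {"text": "Acme", "location": [13, 17]}]},
]
relations_df = _make_relations_dataframe_zero_copy(relations)
# The list-valued "arguments.*" columns are replaced by per-argument columns
# such as "arguments.0.text" and "arguments.0.location".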
Example #4
def _make_relations_dataframe(relations, original_text, sentence_span_series):
    if len(relations) == 0:
        return pd.DataFrame()

    table = util.make_table(relations)

    # Type: Dict[int, Tuple[Union[pa.Array, pa.ChunkedArray], str]]
    location_cols = {}

    # Separate each argument into a column
    flattened_arguments = []
    drop_cols = []
    for name in table.column_names:
        if name.lower().startswith("arguments"):
            col = pa.concat_arrays(table.column(name).iterchunks())
            assert pa.types.is_list(col.type)

            name_split = name.split('.', maxsplit=1)
            num_arguments = len(col[0])

            value_series = col.values.to_pandas()

            # Separate the arguments into individual columns
            for i in range(num_arguments):
                arg_name = "{}.{}.{}".format(name_split[0], i, name_split[1])
                arg_series = value_series[i::num_arguments]

                arg_array = pa.array(arg_series)

                # If list array is fixed length with 1 element, it can be flattened
                temp = arg_array
                while pa.types.is_list(temp.type):
                    temp = temp.flatten()
                    if len(temp) == len(arg_array):
                        # TODO also need to verify each offset inc by 1?
                        arg_array = temp

                if name.lower().endswith("location"):
                    location_cols[i] = (arg_array,
                                        "{}.{}".format(name_split[0], i))

                flattened_arguments.append((arg_array, arg_name))
            drop_cols.append(name)

    # Add the flattened argument columns
    for arg_array, arg_name in flattened_arguments:
        table = table.append_column(arg_name, arg_array)

    # Replace argument location and text columns with spans
    arg_span_cols = {}
    for arg_i, (location_col, arg_prefix) in location_cols.items():
        text_col, text_name = util.find_column(table,
                                               "{}.text".format(arg_prefix))
        arg_span_cols["{}.span".format(arg_prefix)] = util.make_char_span(
            location_col, text_col, original_text)
        drop_cols.extend(["{}.location".format(arg_prefix), text_name])

    add_cols = arg_span_cols.copy()

    # Build the sentence span and drop plain text sentence col
    sentence_col, sentence_name = util.find_column(table, "sentence")
    arg_col_names = list(arg_span_cols.keys())
    if len(arg_col_names) > 0:
        first_arg_span_array = arg_span_cols[arg_col_names[0]]

        sentence_matches = []
        for i, arg_span in enumerate(first_arg_span_array):
            arg_begin = arg_span.begin
            arg_end = arg_span.end
            # Start in the middle of the sentence list and step one sentence
            # at a time until we reach the sentence overlapping this argument
            j = len(sentence_span_series) // 2
            found = False
            while not found:
                sentence_span = sentence_span_series[j]
                if arg_begin >= sentence_span.end:
                    j += 1
                elif arg_end <= sentence_span.begin:
                    j -= 1
                else:
                    contains = [
                        sentence_span.contains(a[i])
                        for a in arg_span_cols.values()
                    ]
                    sentence_text = sentence_col[i].as_py()
                    if not (all(contains)
                            and sentence_span.covered_text == sentence_text):
                        msg = f"Mismatched sentence span for: {sentence_span}"
                        if not all(contains):
                            msg += f"\nContains Args: {all(contains)}"
                        if sentence_span.covered_text != sentence_text:
                            msg += f"\nSpanText: '{sentence_span.covered_text}'" \
                                   f"\nSentence: '{sentence_text}'"
                        warnings.warn(msg)
                    sentence_matches.append(j)
                    found = True

        relations_sentence = sentence_span_series[sentence_matches]
        add_cols["sentence_span"] = relations_sentence.reset_index(drop=True)
        drop_cols.append(sentence_name)
    else:
        warnings.warn("Could not make sentence span column for relations")

    # Drop columns that have been flattened or replaced by spans
    table = table.drop(drop_cols)

    df = table.to_pandas()

    # Insert additional columns
    for col_name, col in add_cols.items():
        df[col_name] = col

    return df
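A sketch of how this variant could be wired up with the syntax helper from
Example #1, which supplies the sentence spans. Here response stands for a full
parsed NLU payload (a hypothetical variable, not defined in these snippets):

text = "Ada works at Acme."
token_df, sentence_df = _make_syntax_dataframes(response["syntax"], text)
relations_df = _make_relations_dataframe(
    response["relations"], text, sentence_df["span"])
# Each relation row gains "arguments.<i>.span" columns plus a "sentence_span"
# column matched against the sentence spans from the syntax output.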