def __init__(self, tokens: Union[SpanArray, Sequence[SpanArray]],
             begin_tokens: Union[pd.Series, np.ndarray, Sequence[int]] = None,
             end_tokens: Union[pd.Series, np.ndarray, Sequence[int]] = None):
    """
    Initialize the array from token-level span offsets.

    :param tokens: Character-level span information about the underlying
     tokens. Can be a single set of tokens, covering all spans, or a separate
     `SpanArray` pointer for every span (as a sequence or a Numpy array).
    :param begin_tokens: Array of begin offsets measured in tokens. Although
     the signature defaults this to `None`, `None` is rejected below.
    :param end_tokens: Array of end offsets measured in tokens. Although the
     signature defaults this to `None`, `None` is rejected below.

    :raises TypeError: if `begin_tokens` or `end_tokens` is not a Series,
     ndarray or list; if `tokens` is not a `SpanArray`, sequence or ndarray;
     or if `tokens` is an ndarray whose first element is neither `None` nor a
     `SpanArray`.
    :raises ValueError: if `begin_tokens` and `end_tokens` differ in length,
     if a single `SpanArray` of tokens spans more than one document, or if a
     per-row `tokens` collection does not have one entry per begin offset.
    """
    # Superclass constructor expects values for things that the subclass doesn't
    # use.
    super().__init__(_NOT_A_DOCUMENT_TEXT, _EMPTY_INT_ARRAY, _EMPTY_INT_ARRAY)

    if not isinstance(begin_tokens, (pd.Series, np.ndarray, list)):
        raise TypeError(f"begin_tokens is of unsupported type {type(begin_tokens)}. "
                        f"Supported types are Series, ndarray and List[int].")
    if not isinstance(end_tokens, (pd.Series, np.ndarray, list)):
        raise TypeError(f"end_tokens is of unsupported type {type(end_tokens)}. "
                        f"Supported types are Series, ndarray and List[int].")

    # Every span needs both a begin and an end offset; reject mismatched
    # inputs up front instead of building an inconsistent array.
    if len(begin_tokens) != len(end_tokens):
        raise ValueError(f"Received {len(begin_tokens)} begin offsets and "
                         f"{len(end_tokens)} end offsets. "
                         f"Lengths should be equal.")
    num_spans = len(begin_tokens)

    if isinstance(tokens, SpanArray):
        if not tokens.is_single_document:
            raise ValueError("Token spans come from more than one document.")
        # Can't just pass a SpanArray to np.full() or np.array(), because Numpy will
        # interpret it as an array-like of Span values.
        # Every row points at the same shared SpanArray object.
        tokens_array = np.empty(num_spans, dtype=object)
        for i in range(num_spans):
            tokens_array[i] = tokens
        tokens = tokens_array
    elif isinstance(tokens, collections.abc.Sequence):
        if len(tokens) != num_spans:
            raise ValueError(f"Received {len(tokens)} arrays of tokens and "
                             f"{len(begin_tokens)} begin offsets. "
                             f"Lengths should be equal.")
        # Can't just pass a SpanArray to np.array(), because Numpy will interpret it
        # as an array-like of Span values.
        tokens_array = np.empty(num_spans, dtype=object)
        for i, row_tokens in enumerate(tokens):
            tokens_array[i] = row_tokens
        tokens = tokens_array
    elif isinstance(tokens, np.ndarray):
        if len(tokens) != num_spans:
            raise ValueError(f"Received {len(tokens)} arrays of tokens and "
                             f"{len(begin_tokens)} begin offsets. "
                             f"Lengths should be equal.")
        # Only the first row is inspected, as a cheap sanity check; a None
        # entry in row 0 is accepted.
        if (len(tokens) > 0
                and tokens[0] is not None
                and not isinstance(tokens[0], SpanArray)):
            raise TypeError(f"Tokens object for row 0 is of unexpected type "
                            f"{type(tokens[0])}. Type should be SpanArray.")
    else:
        raise TypeError(f"Expected SpanArray or list of SpanArray as tokens "
                        f"but got {type(tokens)}")

    # Per-row object array of token SpanArrays, plus token offsets converted
    # by to_int_array().
    self._tokens = tokens
    self._begin_tokens = to_int_array(begin_tokens)
    self._end_tokens = to_int_array(end_tokens)
def __init__(self,
             text: Union[str, Sequence[str], np.ndarray,
                         Tuple[StringTable, np.ndarray]],
             begins: Union[pd.Series, np.ndarray, Sequence[int]],
             ends: Union[pd.Series, np.ndarray, Sequence[int]]):
    """
    Initialize the array from target text and character offsets.

    :param text: Target text from which the spans of this array are drawn,
     or a sequence of texts if different spans can have different targets.
     A `(string_table, text_ids)` tuple is also accepted, for internal use
     only.
    :param begins: Begin offsets of spans (closed)
    :param ends: End offsets (open)

    :raises TypeError: if `begins` or `ends` is not a Series, ndarray or
     list, or if `text` is of an unsupported type.
    :raises ValueError: if `begins` and `ends` differ in length, or if a
     sequence of texts does not have one entry per span.
    """
    if not isinstance(begins, (pd.Series, np.ndarray, list)):
        raise TypeError(
            f"begins is of unsupported type {type(begins)}. "
            f"Supported types are Series, ndarray and List[int].")
    if not isinstance(ends, (pd.Series, np.ndarray, list)):
        raise TypeError(
            f"ends is of unsupported type {type(ends)}. "
            f"Supported types are Series, ndarray and List[int].")
    if len(begins) != len(ends):
        raise ValueError(
            f"Received {len(begins)} begin offsets and {len(ends)} "
            f"end offsets. Lengths should be equal.")
    begins = to_int_array(begins)
    ends = to_int_array(ends)

    if isinstance(text, str):
        # With a single string, every row gets string ID 0
        string_table = StringTable.create_single(text)  # type: StringTable
        text_ids = np.zeros_like(begins)  # type: np.ndarray
    elif isinstance(text, tuple):
        # INTERNAL USE ONLY: String table specified directly.
        # Note that this branch MUST come before the branch that checks for
        # sequences of strings, because tuples are sequences.
        string_table, text_ids = text
    elif isinstance(text, (collections.abc.Sequence, np.ndarray)):
        if len(text) != len(begins):
            # Checked len(begins) == len(ends) earlier
            raise ValueError(
                f"Received {len(text)} target text values and "
                f"{len(begins)} begin offsets. Lengths should be equal.")
        string_table, text_ids = StringTable.merge_things(text)
    else:
        raise TypeError(
            f"Text argument is of unsupported type {type(text)}")

    # Begin and end offsets in characters
    self._begins = begins  # type: np.ndarray
    self._ends = ends  # type: np.ndarray

    self._string_table = string_table  # type: Union[StringTable, None]
    self._text_ids = text_ids

    # Cached list of other SpanArrays that are exactly the same as this
    # one. Each element is the result of calling id()
    self._equivalent_arrays = []  # type: List[int]

    # Version numbers of elements in self._equivalent_arrays, to ensure that
    # a change hasn't made the arrays no longer equal
    self._equiv_array_versions = []  # type: List[int]

    # Monotonically increasing version number for tracking changes and
    # invalidating caches
    self._version = 0

    # Flag that tells whether to display details of offsets in Jupyter notebooks
    self._repr_html_show_offsets = True  # type: bool