def length(batch: DataPanel, columns: list):
    """Return one length per example in ``batch``.

    The lengths are taken from the objects that ``lookup`` returns for
    ``SpacyOp`` on ``columns``. If that path raises ``AttributeError``,
    each length is instead the number of whitespace-separated pieces of
    the text stored in the first listed column.
    """
    try:
        # Reuse the previously stored spaCy information when it exists.
        stored_docs = lookup(batch, SpacyOp, columns)
        return [len(stored_doc) for stored_doc in stored_docs]
    except AttributeError:
        # Nothing usable was stored: fall back to splitting the raw text.
        raw_texts = batch[columns[0]]
        return [len(raw_text.split()) for raw_text in raw_texts]
def fn(): nonlocal dp from robustnessgym import lookup dp = op(dp=dp, columns=["question"]) # Look it up when you need it capitalized_text = lookup(dp, op, ["question"]) return capitalized_text
def stanza_example(dp):
    """Render the Streamlit walkthrough for running ``StanzaOp`` on ``dp``.

    The operation is applied to the "question" column, its output column is
    fetched with ``lookup``, and a series of ``format_code`` panels show each
    code snippet next to the value it produces.
    """
    # Passed through unchanged as the ``columns`` argument of ``format_code``.
    panel_columns = [0.45, 0.05, 0.50]
    st.header("Run Stanza Workflow")

    from robustnessgym import lookup
    from robustnessgym.ops import StanzaOp

    # Apply the Stanza pipeline to the 'question' column of the dataset.
    stanza = StanzaOp()
    dp = stanza(dp=dp, columns=["question"])
    # The new column is auto-named "StanzaOp(columns=['question'])".

    # Fetch the Stanza column from the DataPanel through lookup.
    stanza_column = lookup(dp, stanza, ["question"])

    # Snippet displayed next to the rendered column; mirrors the code above.
    pipeline_snippet = """
from robustnessgym import lookup
from robustnessgym.ops import StanzaOp

# Run the Stanza pipeline on the 'question' column of the dataset
stanza = StanzaOp()
dp = stanza(dp=dp, columns=['question'])
# adds a new column that is auto-named "StanzaOp(columns=['question'])"

# Grab the Stanza column from the DataPanel using the lookup
stanza_column = lookup(dp, stanza, ['question'])
"""
    format_code(
        pipeline_snippet,
        stanza_column._repr_pandas_(),
        columns=panel_columns,
    )

    st.subheader("Columns contain Cells")
    cell_snippet = """
cell = stanza_column[0]
cell
"""
    format_code(cell_snippet, stanza_column[0], columns=panel_columns)

    st.subheader("Cells can be treated like stanza objects")
    text_snippet = """
cell = stanza_column[0]
cell.text
"""
    format_code(text_snippet, stanza_column[0].text, columns=panel_columns)

    entities_snippet = """
cell = stanza_column[0]
cell.entities
"""
    format_code(
        entities_snippet,
        stanza_column[0].entities,
        columns=panel_columns,
    )
def test_apply(self):
    """Apply ``SpacyOp`` to the testbed's "text" column and check its output."""
    # Expected values for the six testbed examples.
    expected_sentences = [
        ["The man is walking."],
        ["The man is running."],
        ["The woman is sprinting."],
        ["The woman is resting."],
        ["The hobbit is flying."],
        ["The hobbit is swimming."],
    ]
    expected_tokens = [
        ["The", "man", "is", "walking", "."],
        ["The", "man", "is", "running", "."],
        ["The", "woman", "is", "sprinting", "."],
        ["The", "woman", "is", "resting", "."],
        ["The", "hobbit", "is", "flying", "."],
        ["The", "hobbit", "is", "swimming", "."],
    ]
    expected_entities = [[], [], [], [], [], []]
    expected_num_tokens = [5, 5, 5, 5, 5, 5]

    # Build the Spacy cached operation and run it over the dataset.
    spacy_op = SpacyOp()
    processed = spacy_op(self.testbed.dataset, ["text"])
    print(processed.column_names)

    # Pull out each piece of information with its own lookup.
    sentences = [doc.sents for doc in lookup(processed, spacy_op, ["text"])]
    tokens = [list(doc) for doc in lookup(processed, spacy_op, ["text"])]
    entities = [doc.ents for doc in lookup(processed, spacy_op, ["text"])]
    num_tokens = [
        len(list(doc)) for doc in lookup(processed, spacy_op, ["text"])
    ]

    self.assertEqual(sentences, expected_sentences)
    self.assertEqual(tokens, expected_tokens)
    self.assertEqual(entities, expected_entities)
    self.assertEqual(num_tokens, expected_num_tokens)
def fn(): nonlocal dp from robustnessgym import lookup from robustnessgym.ops import LazyTextBlobOp # Run the TextBlob pipeline on the 'passage' column of the dataset textblob = LazyTextBlobOp() dp = textblob(dp=dp, columns=["question"]) # adds a new column that is auto-named "LazyTextBlobOp(columns=['question'])" # Grab the TextBlob column from the DataPanel using the lookup textblob_column = lookup(dp, textblob, ["question"]) return textblob_column
def fn(): nonlocal dp, op from robustnessgym import lookup upper_text = lookup(dp, op, ["question"], "upper") return upper_text
def fn(): nonlocal dp, op from robustnessgym import lookup capitalized_text = lookup(dp, op, ["question"], "capitalize") return capitalized_text
def spacy_example(dp):
    """Render the Streamlit walkthrough for running ``SpacyOp`` on ``dp``.

    The operation is applied to the "passage" column, its output column is
    fetched with ``lookup``, and ``format_code`` panels show each code snippet
    next to the value it produces.

    The explanatory comments, both in this function and in the snippet shown
    to the user, name the 'passage' column, which is the column the code
    actually processes.
    """
    # Passed through unchanged as the ``columns`` argument of ``format_code``.
    columns = [0.45, 0.05, 0.50]

    # NOTE(review): ``st.beta_container`` is a beta-prefixed Streamlit API;
    # confirm it still exists in the Streamlit version this app is pinned to.
    with st.beta_container():
        st.header("Run Operation: `SpacyOp`")
        st.write(
            """
spaCy is a popular text processing library that provides tokenization, tagging and other capabilities.
"""
        )

        from robustnessgym import lookup
        from robustnessgym.ops import SpacyOp

        # Run the Spacy pipeline on the 'passage' column of the dataset
        spacy = SpacyOp()
        dp = spacy(dp=dp, columns=["passage"])
        # adds a new column that is auto-named
        # "SpacyOp(lang=en_core_web_sm, neuralcoref=False, columns=['passage'])"

        format_code(
            """
from robustnessgym import lookup
from robustnessgym.ops import SpacyOp

# Run the Spacy pipeline on the 'passage' column of the dataset
spacy = SpacyOp()
dp = spacy(dp=dp, columns=['passage'])
# adds a new column that is auto-named
# "SpacyOp(lang=en_core_web_sm, neuralcoref=False, columns=['passage'])"
""",
            dp.streamlit(),
            columns=columns,
        )

    st.write("------")

    with st.beta_container():
        # Fetch the Spacy column from the DataPanel through lookup.
        spacy_column = lookup(dp, spacy, ["passage"])
        format_code(
            """
lookup(dp, spacy, ['passage'])
""",
            spacy_column._repr_pandas_(),
            columns=columns,
        )

    with st.beta_container():
        st.subheader("Columns contain Cells")
        # The demo inspects the cell at index 1 of the column.
        cell = spacy_column[1]
        format_code(
            """
cell = spacy_column[1]
cell
""",
            cell,
            columns=columns,
        )
        format_code(
            """
list(cell)
""",
            list(cell),
            columns=columns,
        )
        format_code(
            """
cell.ents
""",
            cell.ents,
            columns=columns,
        )