from typing import List

# NOTE: DataPanel, SpacyColumn, clean_text, load_nlp, the aligner classes, and
# _run_aligners are assumed to be importable from the surrounding project.


def run_workflow(jsonl_path: str,
                 doc_column: str = None,
                 reference_column: str = None,
                 summary_columns: List[str] = None,
                 bert_aligner_threshold: float = 0.5,
                 bert_aligner_top_k: int = 3,
                 embedding_aligner_threshold: float = 0.5,
                 embedding_aligner_top_k: int = 3,
                 processed_dataset_path: str = None,
                 n_samples: int = None,
                 no_clean: bool = False):
    if not jsonl_path:
        raise ValueError("'jsonl_path' is required")

    if not processed_dataset_path:
        raise ValueError("Please specify a path to save the dataset.")

    # Load the dataset
    dataset = DataPanel.from_jsonl(jsonl_path)

    if doc_column is None:
        # Assume `doc_column` is called "document"
        doc_column = 'document'
        assert doc_column in dataset.columns, \
            f"`doc_column={doc_column}` is not a column in datapanel."
        print("Assuming `doc_column` is called 'document'.")

    if reference_column is None:
        # Assume `reference_column` is called "summary:reference"
        reference_column = 'summary:reference'
        print("Assuming `reference_column` is called 'summary:reference'.")

    if reference_column not in dataset.columns:
        print("No reference summary loaded")
        reference_column = None

    if summary_columns is None or len(summary_columns) == 0:
        # Assume `summary_columns` are prefixed by "summary:"
        summary_columns = []
        for col in dataset.columns:
            if col.startswith("summary:") and col != "summary:reference":
                summary_columns.append(col)
        print(f"Reading summary columns from datapanel. Found {summary_columns}.")

    if len(summary_columns) == 0 and reference_column is None:
        raise ValueError("At least one summary is required")

    # Restrict to the first `n_samples`
    if n_samples:
        print(f"Restricting to {n_samples} samples.")
        dataset = dataset.head(n_samples)

    print("size of dataset:", len(dataset))

    # Combine the text columns into one list
    text_columns = ([doc_column] +
                    ([reference_column] if reference_column else []) +
                    summary_columns)

    # Preprocess all the text columns. The original code read a module-level
    # `args.no_clean`, which is undefined inside this function; it is replaced
    # here by the `no_clean` parameter.
    print("Preprocessing text columns")
    dataset = dataset.update(
        lambda x: {
            f'preprocessed_{k}': x[k] if no_clean else clean_text(x[k])
            for k in text_columns
        })

    # Run the spaCy pipeline on all preprocessed text columns
    nlp = load_nlp()
    nlp.add_pipe('sentencizer', before="parser")
    print("Running spacy processing")
    for col in text_columns:
        dataset.add_column(
            f'spacy:{col}',
            SpacyColumn.from_docs(nlp.pipe(dataset[f'preprocessed_{col}'])))

    # Run the 3 align pipelines
    bert_aligner = BertscoreAligner(
        threshold=bert_aligner_threshold,
        top_k=bert_aligner_top_k,
    )

    embedding_aligner = StaticEmbeddingAligner(
        threshold=embedding_aligner_threshold,
        top_k=embedding_aligner_top_k,
    )

    ngram_aligner = NGramAligner()

    dataset = _run_aligners(
        dataset=dataset,
        aligners=[bert_aligner, embedding_aligner, ngram_aligner],
        doc_column=f'spacy:{doc_column}',
        reference_column=f'spacy:{reference_column}' if reference_column else None,
        summary_columns=[f'spacy:{col}' for col in summary_columns],
    )

    # Save the dataset
    dataset.write(processed_dataset_path)

    return dataset
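# Usage sketch (illustrative, not the project's actual CLI): the argument
# names below are assumptions, and the JSONL file is expected to carry a
# "document" column plus one or more "summary:"-prefixed columns.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--file", type=str, required=True,
                        help="Path to the input .jsonl file")
    parser.add_argument("--processed_dataset_path", type=str, required=True,
                        help="Where to write the processed dataset")
    parser.add_argument("--n_samples", type=int, default=None)
    parser.add_argument("--no_clean", action="store_true",
                        help="Skip text cleaning during preprocessing")
    cli_args = parser.parse_args()

    run_workflow(
        jsonl_path=cli_args.file,
        processed_dataset_path=cli_args.processed_dataset_path,
        n_samples=cli_args.n_samples,
        no_clean=cli_args.no_clean,
    )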
def __init__(
    self,
    spacy,
):
    # NGramAlignerCap.__init__: delegates to the parent class, pairing a
    # fresh NGramAligner with the supplied spaCy pipeline.
    super(NGramAlignerCap, self).__init__(aligner=NGramAligner(), spacy=spacy)
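# A minimal instantiation sketch, assuming the parent class accepts the
# `aligner` and `spacy` keyword arguments seen above, and that `load_nlp`
# is the same helper used in `run_workflow`:
nlp = load_nlp()
ngram_cap = NGramAlignerCap(spacy=nlp)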
def show_main(example):
    # Get user input
    semantic_sim_type = st.sidebar.radio(
        "Semantic similarity type:",
        ["Contextual embedding", "Static embedding"])
    semantic_sim_threshold = st.sidebar.slider(
        "Semantic similarity threshold:",
        min_value=MIN_SEMANTIC_SIM_THRESHOLD,
        max_value=1.0,
        step=0.1,
        value=0.2,
    )
    semantic_sim_top_k = st.sidebar.slider(
        "Semantic similarity top-k:",
        min_value=1,
        max_value=MAX_SEMANTIC_SIM_TOP_K,
        step=1,
        value=10,
    )

    document, summaries = select_comparison(example)
    layout = st.sidebar.radio("Layout:", ["Vertical", "Horizontal"]).lower()
    # if layout == "horizontal":
    #     scroll = st.sidebar.checkbox(label="Scroll sections", value=True)
    # else:
    scroll = True
    grey_stopwords = st.sidebar.checkbox(label="Grey out stopwords", value=True)

    # Gather data: prefer cached alignments, falling back to computing them
    # on the fly for this example.
    try:
        lexical_alignments = [
            NGramAlignerCap.decode(
                example.data[
                    Identifier(NGramAlignerCap.__name__)(
                        columns=[
                            f'preprocessed_{document._.column}',
                            f'preprocessed_{summary._.column}',
                        ])])[0]
            for summary in summaries
        ]
        lexical_alignments = [
            {k: [(pair[0], int(pair[1])) for pair in v] for k, v in d.items()}
            for d in lexical_alignments
        ]
    except KeyError:
        lexical_alignments = NGramAligner().align(document, summaries)

    if semantic_sim_type == "Static embedding":
        try:
            semantic_alignments = [
                StaticEmbeddingAlignerCap.decode(
                    example.data[
                        Identifier(StaticEmbeddingAlignerCap.__name__)(
                            threshold=MIN_SEMANTIC_SIM_THRESHOLD,
                            top_k=MAX_SEMANTIC_SIM_TOP_K,
                            columns=[
                                f'preprocessed_{document._.column}',
                                f'preprocessed_{summary._.column}',
                            ])])[0]
                for summary in summaries
            ]
        except KeyError:
            semantic_alignments = StaticEmbeddingAligner(
                semantic_sim_threshold,
                semantic_sim_top_k).align(document, summaries)
        else:
            # Cached alignments were computed with the loosest settings;
            # narrow them to the user's current threshold and top-k.
            semantic_alignments = [
                filter_alignment(alignment, semantic_sim_threshold,
                                 semantic_sim_top_k)
                for alignment in semantic_alignments
            ]
    else:
        try:
            semantic_alignments = [
                BertscoreAlignerCap.decode(
                    example.data[
                        Identifier(BertscoreAlignerCap.__name__)(
                            threshold=MIN_SEMANTIC_SIM_THRESHOLD,
                            top_k=MAX_SEMANTIC_SIM_TOP_K,
                            columns=[
                                f'preprocessed_{document._.column}',
                                f'preprocessed_{summary._.column}',
                            ])])[0]
                for summary in summaries
            ]
        except KeyError:
            semantic_alignments = BertscoreAligner(
                semantic_sim_threshold,
                semantic_sim_top_k).align(document, summaries)
        else:
            semantic_alignments = [
                filter_alignment(alignment, semantic_sim_threshold,
                                 semantic_sim_top_k)
                for alignment in semantic_alignments
            ]

    show_html(
        *main_view(
            document,
            summaries,
            semantic_alignments,
            lexical_alignments,
            layout,
            scroll,
            grey_stopwords,
        ),
        height=850,
    )
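# `filter_alignment` is referenced above but not defined in this snippet.
# Cached alignments are computed once with the loosest settings
# (MIN_SEMANTIC_SIM_THRESHOLD, MAX_SEMANTIC_SIM_TOP_K) and then narrowed to
# the user's current slider values. A plausible sketch, assuming each
# alignment maps a key to a list of (match, score) pairs; the project's
# actual data layout may differ:
def filter_alignment(alignment, threshold, top_k):
    # Drop matches below `threshold`, then keep the `top_k` best per key.
    filtered = {}
    for key, matches in alignment.items():
        kept = [(match, score) for match, score in matches if score >= threshold]
        if kept:
            filtered[key] = sorted(kept, key=lambda m: m[1], reverse=True)[:top_k]
    return filtered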
def show_main(example):
    # Get user input
    semantic_sim_type = st.sidebar.radio(
        "Semantic similarity type:",
        ["Contextual embedding", "Static embedding"])
    semantic_sim_threshold = st.sidebar.slider(
        "Semantic similarity threshold:",
        min_value=MIN_SEMANTIC_SIM_THRESHOLD,
        max_value=1.0,
        step=0.1,
        value=0.2,
    )
    semantic_sim_top_k = st.sidebar.slider(
        "Semantic similarity top-k:",
        min_value=1,
        max_value=MAX_SEMANTIC_SIM_TOP_K,
        step=1,
        value=10,
    )

    document, summaries = select_comparison(example)
    layout = st.sidebar.radio("Layout:", ["Vertical", "Horizontal"]).lower()
    scroll = True
    gray_out_stopwords = st.sidebar.checkbox(label="Gray out stopwords",
                                             value=True)

    # Gather data: prefer alignments precomputed by the workflow, falling
    # back to computing them on the fly for this example.
    try:
        lexical_alignments = [
            example.data[
                f'{NGramAligner.__name__}:spacy:{document._.column}:spacy:{summary._.column}']
            for summary in summaries
        ]
    except KeyError:
        lexical_alignments = NGramAligner().align(document, summaries)

    if semantic_sim_type == "Static embedding":
        try:
            semantic_alignments = [
                example.data[
                    f'{StaticEmbeddingAligner.__name__}:spacy:{document._.column}:spacy:{summary._.column}']
                for summary in summaries
            ]
        except KeyError:
            semantic_alignments = StaticEmbeddingAligner(
                semantic_sim_threshold,
                semantic_sim_top_k).align(document, summaries)
    else:
        try:
            semantic_alignments = [
                example.data[
                    f'{BertscoreAligner.__name__}:spacy:{document._.column}:spacy:{summary._.column}']
                for summary in summaries
            ]
        except KeyError:
            semantic_alignments = BertscoreAligner(
                semantic_sim_threshold,
                semantic_sim_top_k).align(document, summaries)

    MainView(
        document,
        summaries,
        semantic_alignments,
        lexical_alignments,
        layout,
        scroll,
        gray_out_stopwords,
    ).show(height=720)
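# This second `show_main` variant skips the cached-operation decode step and
# reads alignments straight from columns keyed by aligner class name and the
# two spacy columns, matching the naming that `run_workflow`'s `_run_aligners`
# call implies. A sketch of the convention, with a hypothetical helper name:
def alignment_column_name(aligner_cls, doc_column, summary_column):
    # e.g. alignment_column_name(NGramAligner, "document", "summary:bart")
    #   -> "NGramAligner:spacy:document:spacy:summary:bart"
    return f"{aligner_cls.__name__}:spacy:{doc_column}:spacy:{summary_column}"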