Example 1
def run_workflow(jsonl_path: str,
                 doc_column: Optional[str] = None,
                 reference_column: Optional[str] = None,
                 summary_columns: Optional[List[str]] = None,
                 bert_aligner_threshold: float = 0.5,
                 bert_aligner_top_k: int = 3,
                 embedding_aligner_threshold: float = 0.5,
                 embedding_aligner_top_k: int = 3,
                 processed_dataset_path: Optional[str] = None,
                 n_samples: Optional[int] = None,
                 no_clean: bool = False):  # if True, skip `clean_text` preprocessing
    if not jsonl_path:
        raise ValueError("'jsonl_path' is required")

    if not processed_dataset_path:
        raise ValueError("Please specify a path to save the dataset.")

    # Load the dataset
    dataset = DataPanel.from_jsonl(jsonl_path)

    if doc_column is None:
        # Assume `doc_column` is called "document"
        doc_column = 'document'
        print("Assuming `doc_column` is called 'document'.")
        assert doc_column in dataset.columns, \
            f"`doc_column={doc_column}` is not a column in the datapanel."

    if reference_column is None:
        # Assume `reference_column` is called "summary:reference"
        reference_column = 'summary:reference'
        print("Assuming `reference_column` is called 'summary:reference'.")
        if reference_column not in dataset.columns:
            print("No reference summary loaded")
            reference_column = None

    if summary_columns is None or len(summary_columns) == 0:
        # Assume `summary_columns` are prefixed by "summary:"
        summary_columns = [
            col for col in dataset.columns
            if col.startswith("summary:") and col != "summary:reference"
        ]
        print(
            f"Reading summary columns from the datapanel. Found {summary_columns}."
        )

    if len(summary_columns) == 0 and reference_column is None:
        raise ValueError(
            "At least one summary column or a reference column is required.")

    # Restrict to the first `n_samples`
    if n_samples:
        print(f"Restricting to {n_samples} samples.")
        dataset = dataset.head(n_samples)

    print("size of dataset:", len(dataset))

    # Combine the text columns into one list
    text_columns = ([doc_column] +
                    ([reference_column] if reference_column else []) +
                    summary_columns)

    # Preprocess all the text columns
    print("Preprocessing text columns")
    dataset = dataset.update(
        lambda x: {
            f'preprocessed_{k}': x[k] if no_clean else clean_text(x[k])
            for k in text_columns
        })

    # Run the spaCy pipeline on all preprocessed text columns
    nlp = load_nlp()

    nlp.add_pipe('sentencizer', before="parser")

    print("Running spacy processing")
    for col in text_columns:
        dataset.add_column(
            f'spacy:{col}',
            SpacyColumn.from_docs(nlp.pipe(dataset[f'preprocessed_{col}'])))

    # Run the three aligners
    bert_aligner = BertscoreAligner(
        threshold=bert_aligner_threshold,
        top_k=bert_aligner_top_k,
    )

    embedding_aligner = StaticEmbeddingAligner(
        threshold=embedding_aligner_threshold,
        top_k=embedding_aligner_top_k,
    )

    ngram_aligner = NGramAligner()

    dataset = _run_aligners(
        dataset=dataset,
        aligners=[bert_aligner, embedding_aligner, ngram_aligner],
        doc_column=f'spacy:{doc_column}',
        reference_column=f'spacy:{reference_column}'
        if reference_column else None,
        summary_columns=[f'spacy:{col}' for col in summary_columns],
    )

    # Save the dataset
    dataset.write(processed_dataset_path)

    return dataset
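
For reference, a minimal call to `run_workflow` might look like the sketch below; the file paths are hypothetical and all aligner settings are left at their defaults:

dataset = run_workflow(
    jsonl_path="data/articles.jsonl",           # hypothetical input file
    processed_dataset_path="data/processed",    # hypothetical output path
    n_samples=100,                              # optional: keep only the first 100 rows
)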
Example 2
def __init__(
    self,
    spacy,
):
    super().__init__(aligner=NGramAligner(), spacy=spacy)
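
Assuming `NGramAlignerCap` is constructed with a loaded spaCy pipeline (as the `spacy` argument suggests), usage would look roughly like this, reusing the `load_nlp` helper from Example 1:

nlp = load_nlp()
aligner_cap = NGramAlignerCap(spacy=nlp)  # pairs NGramAligner with the pipeline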
Example 3
def show_main(example):
    # Get user input

    semantic_sim_type = st.sidebar.radio(
        "Semantic similarity type:",
        ["Contextual embedding", "Static embedding"])
    semantic_sim_threshold = st.sidebar.slider(
        "Semantic similarity threshold:",
        min_value=MIN_SEMANTIC_SIM_THRESHOLD,
        max_value=1.0,
        step=0.1,
        value=0.2,
    )
    semantic_sim_top_k = st.sidebar.slider(
        "Semantic similarity top-k:",
        min_value=1,
        max_value=MAX_SEMANTIC_SIM_TOP_K,
        step=1,
        value=10,
    )

    document, summaries = select_comparison(example)
    layout = st.sidebar.radio("Layout:", ["Vertical", "Horizontal"]).lower()
    scroll = True
    grey_stopwords = st.sidebar.checkbox(label="Grey out stopwords",
                                         value=True)

    # Gather data: prefer alignments precomputed and cached on the example,
    # falling back to computing them on the fly if the cache entry is missing.
    try:
        lexical_alignments = [
            NGramAlignerCap.decode(example.data[Identifier(
                NGramAlignerCap.__name__)(columns=[
                    f'preprocessed_{document._.column}',
                    f'preprocessed_{summary._.column}',
                ])])[0] for summary in summaries
        ]
        lexical_alignments = [{
            k: [(pair[0], int(pair[1])) for pair in v]
            for k, v in d.items()
        } for d in lexical_alignments]
    except KeyError:
        lexical_alignments = NGramAligner().align(document, summaries)

    if semantic_sim_type == "Static embedding":
        try:
            semantic_alignments = [
                StaticEmbeddingAlignerCap.decode(example.data[Identifier(
                    StaticEmbeddingAlignerCap.__name__)(
                        threshold=MIN_SEMANTIC_SIM_THRESHOLD,
                        top_k=MAX_SEMANTIC_SIM_TOP_K,
                        columns=[
                            f'preprocessed_{document._.column}',
                            f'preprocessed_{summary._.column}',
                        ])])[0] for summary in summaries
            ]
        except KeyError:
            semantic_alignments = StaticEmbeddingAligner(
                semantic_sim_threshold,
                semantic_sim_top_k).align(document, summaries)
        else:
            # The cached alignments were computed with the loosest settings
            # (MIN threshold, MAX top-k), so filter them down to the
            # user-selected threshold and top-k.
            semantic_alignments = [
                filter_alignment(alignment, semantic_sim_threshold,
                                 semantic_sim_top_k)
                for alignment in semantic_alignments
            ]
    else:
        try:
            semantic_alignments = [
                BertscoreAlignerCap.decode(example.data[Identifier(
                    BertscoreAlignerCap.__name__)(
                        threshold=MIN_SEMANTIC_SIM_THRESHOLD,
                        top_k=MAX_SEMANTIC_SIM_TOP_K,
                        columns=[
                            f'preprocessed_{document._.column}',
                            f'preprocessed_{summary._.column}',
                        ])])[0] for summary in summaries
            ]
        except KeyError:
            semantic_alignments = BertscoreAligner(semantic_sim_threshold,
                                                   semantic_sim_top_k).align(
                                                       document, summaries)
        else:
            # As above, filter the cached alignments down to the
            # user-selected threshold and top-k.
            semantic_alignments = [
                filter_alignment(alignment, semantic_sim_threshold,
                                 semantic_sim_top_k)
                for alignment in semantic_alignments
            ]

    show_html(*main_view(document, summaries, semantic_alignments,
                         lexical_alignments, layout, scroll, grey_stopwords),
              height=850)
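
`filter_alignment` is used above but not defined here. A minimal sketch, assuming each semantic alignment is a dict mapping a key to a list of `(match, score)` pairs, could look like this:

def filter_alignment(alignment, threshold, top_k):
    # Keep only matches scoring at or above `threshold`, then truncate
    # each list to its `top_k` highest-scoring matches.
    return {
        key: sorted(
            (m for m in matches if m[1] >= threshold),
            key=lambda m: m[1],
            reverse=True,
        )[:top_k]
        for key, matches in alignment.items()
    }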
Example 4
def show_main(example):
    # Get user input

    semantic_sim_type = st.sidebar.radio(
        "Semantic similarity type:",
        ["Contextual embedding", "Static embedding"])
    semantic_sim_threshold = st.sidebar.slider(
        "Semantic similarity threshold:",
        min_value=MIN_SEMANTIC_SIM_THRESHOLD,
        max_value=1.0,
        step=0.1,
        value=0.2,
    )
    semantic_sim_top_k = st.sidebar.slider(
        "Semantic similarity top-k:",
        min_value=1,
        max_value=MAX_SEMANTIC_SIM_TOP_K,
        step=1,
        value=10,
    )

    document, summaries = select_comparison(example)
    layout = st.sidebar.radio("Layout:", ["Vertical", "Horizontal"]).lower()
    scroll = True
    gray_out_stopwords = st.sidebar.checkbox(label="Gray out stopwords",
                                             value=True)

    # Gather data: read the precomputed alignment columns written during
    # preprocessing, falling back to computing alignments on the fly.
    try:
        lexical_alignments = [
            example.data[
                f'{NGramAligner.__name__}:spacy:{document._.column}:spacy:{summary._.column}']
            for summary in summaries
        ]
    except KeyError:
        lexical_alignments = NGramAligner().align(document, summaries)

    if semantic_sim_type == "Static embedding":
        try:
            semantic_alignments = [
                example.data[
                    f'{StaticEmbeddingAligner.__name__}:spacy:{document._.column}:spacy:{summary._.column}']
                for summary in summaries
            ]
        except KeyError:
            semantic_alignments = StaticEmbeddingAligner(
                semantic_sim_threshold,
                semantic_sim_top_k).align(document, summaries)
    else:
        try:
            semantic_alignments = [
                example.data[
                    f'{BertscoreAligner.__name__}:spacy:{document._.column}:spacy:{summary._.column}']
                for summary in summaries
            ]
        except KeyError:
            semantic_alignments = BertscoreAligner(semantic_sim_threshold,
                                                   semantic_sim_top_k).align(
                                                       document, summaries)

    MainView(
        document,
        summaries,
        semantic_alignments,
        lexical_alignments,
        layout,
        scroll,
        gray_out_stopwords,
    ).show(height=720)
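
Note that the lookup keys used here mirror the `spacy:`-prefixed columns created by `run_workflow` in Example 1 before it calls `_run_aligners`. For a document column named `document` and a summary column named `summary:model` (illustrative names), the expected key for the BERTScore alignments would be:

key = f'{BertscoreAligner.__name__}:spacy:document:spacy:summary:model'
# i.e. 'BertscoreAligner:spacy:document:spacy:summary:model'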