Example 1

from typing import Optional, Tuple

import spacy
import streamlit as st

def visualize_similarity(
    nlp: spacy.language.Language,
    default_texts: Tuple[str, str] = ("apple", "orange"),
    *,
    threshold: float = 0.5,
    title: Optional[str] = "Vectors & Similarity",
    key: Optional[str] = None,
) -> None:
    """Visualizer for semantic similarity using word vectors."""
    meta = nlp.meta.get("vectors", {})
    if title:
        st.header(title)
    if not meta.get("width", 0):
        st.warning("No vectors available in the model.")
    else:
        cols = st.columns(2)  # formerly st.beta_columns in Streamlit < 1.0
        text1 = cols[0].text_input("Text or word 1",
                                   default_texts[0],
                                   key=f"{key}_similarity_text1")
        text2 = cols[1].text_input("Text or word 2",
                                   default_texts[1],
                                   key=f"{key}_similarity_text2")
        doc1 = nlp.make_doc(text1)
        doc2 = nlp.make_doc(text2)
        similarity = doc1.similarity(doc2)
        similarity_text = f"**Score:** `{similarity}`"
        if similarity > threshold:
            st.success(similarity_text)
        else:
            st.error(similarity_text)

        exp = st.expander("Vector information")  # formerly st.beta_expander
        exp.code(meta)
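
A minimal way to try the visualizer, assuming a model with word vectors such as en_core_web_md is installed; the script name app.py, the load_model helper, and the st.cache_resource caching are illustrative additions, not part of the original:

# app.py - illustrative driver for visualize_similarity
import spacy
import streamlit as st


@st.cache_resource  # cache the loaded model across Streamlit reruns
def load_model(name: str = "en_core_web_md") -> spacy.language.Language:
    return spacy.load(name)


nlp = load_model()
visualize_similarity(nlp, ("cat", "banana"), threshold=0.6, key="demo")

Run it with: streamlit run app.py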
Example 2

from typing import Dict, List

import spacy


def segment_documents(nlp: spacy.language.Language, texts: List[str],
                      section_titles: List[str]) -> Dict[str, list]:
    """Segment section texts (e.g. parsed from paper XML) into sentences, grouped per section."""

    docs = nlp.pipe(texts)

    sentence_id = 1

    out = {'inputs': [], 'section_names': [], 'section_lengths': []}

    # iterate over sections in paper
    for section_name, doc in zip(section_titles, docs):

        sentcount = 0

        # iterate over sentences in section
        for sent in doc.sents:
            tokens = [tok.text for tok in sent]
            input_doc = {
                "text": sent.text,
                "tokens": tokens,
                "sentence_id": sentence_id,
                "word_count": len(tokens)
            }

            out['inputs'].append(input_doc)

            sentence_id += 1
            sentcount += 1

        # end iterating over sentences
        out['section_lengths'].append(sentcount)
        out['section_names'].append(section_name)

    return out
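
A quick sanity check for segment_documents, sketched with a blank pipeline plus spaCy's rule-based sentencizer (spaCy v3 API); the section texts and titles below are invented:

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")  # rule-based sentence boundaries, no trained model needed

texts = [
    "We study segmentation. Prior work is limited.",
    "We collected data. We trained a model. We evaluated it.",
]
titles = ["Introduction", "Methods"]

result = segment_documents(nlp, texts, titles)
print(result["section_names"])    # ['Introduction', 'Methods']
print(result["section_lengths"])  # [2, 3]
print(result["inputs"][0]["tokens"])  # ['We', 'study', 'segmentation', '.']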
Example 3

# spaCy v2-style training loop (create_pipe / begin_training / nlp.update)
import random
from typing import Any, Dict

import spacy
from spacy.util import minibatch, compounding

def train_pipeline(nlp: spacy.language.Language) -> None:
    """Train a `spacy.language.Language` instance."""
    if TEXTCAT not in nlp.pipe_names:
        textcat = nlp.create_pipe(TEXTCAT, config={"exclusive_classes": False})
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe(TEXTCAT)

    for category in CATEGORIES:
        textcat.add_label(category.value)

    pipe_exceptions = {TEXTCAT, "trf_wordpiecer", "trf_tok2vec"}
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        all_data = list(get_classification_training_data())
        random.shuffle(all_data)

        training_data = all_data[:-2]    # all but the last two examples
        validation_data = all_data[-2:]  # held out (not used further in this snippet)

        optimizer = nlp.begin_training()
        for itn in range(20):
            losses: Dict[str, Any] = {}
            random.shuffle(training_data)
            batches = minibatch(training_data,
                                size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=0.2,
                           losses=losses)
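
The loop above relies on names defined elsewhere in its module: TEXTCAT, CATEGORIES, and get_classification_training_data. A hedged sketch of what they could look like under the spaCy v2 training API; the labels and example texts are invented for illustration:

import enum
from typing import Iterable, Tuple

TEXTCAT = "textcat"  # name of spaCy's text classification pipe


class CATEGORIES(enum.Enum):
    POSITIVE = "POSITIVE"
    NEGATIVE = "NEGATIVE"


def get_classification_training_data() -> Iterable[Tuple[str, dict]]:
    # spaCy v2 textcat annotations: (text, {"cats": {label: 0.0 or 1.0}})
    # In practice this would yield many labelled examples.
    yield ("great product, works perfectly",
           {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})
    yield ("arrived broken, total waste",
           {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}})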
Example 4

from typing import Optional, Tuple

import spacy
import streamlit as st

def visualize_similarity(
    nlp: spacy.language.Language,
    default_texts: Tuple[str, str] = ("apple", "orange"),
    *,
    threshold: float = 0.5,
    title: Optional[str] = "Vectors & Similarity",
) -> None:
    """Visualizer for semantic similarity using word vectors."""
    meta = nlp.meta.get("vectors", {})
    if title:
        st.header(title)
    if not meta.get("width", 0):
        st.warning("No vectors available in the model.")
        return
    st.code(meta)
    text1 = st.text_input("Text or word 1", default_texts[0])
    text2 = st.text_input("Text or word 2", default_texts[1])
    doc1 = nlp.make_doc(text1)
    doc2 = nlp.make_doc(text2)
    similarity = doc1.similarity(doc2)
    similarity_text = f"Score: {similarity}"
    if similarity > threshold:
        st.success(similarity_text)
    else:
        st.error(similarity_text)
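
This older variant runs the same way as Example 1: call it from a Streamlit script with a vector-bearing model loaded and launch via streamlit run app.py (script name assumed). It differs from Example 1 mainly in using full-width inputs instead of columns and in omitting the key argument.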