def visualize_similarity(
    nlp: spacy.language.Language,
    default_texts: Tuple[str, str] = ("apple", "orange"),
    *,
    threshold: float = 0.5,
    title: Optional[str] = "Vectors & Similarity",
    key: Optional[str] = None,
) -> None:
    """Render a Streamlit widget comparing the vector similarity of two texts.

    Shows two text inputs side by side, scores ``doc1.similarity(doc2)`` on
    ``nlp.make_doc`` results, and colours the score green (success) when it
    exceeds *threshold*, red (error) otherwise. If the loaded model ships no
    word vectors, a warning is shown instead and nothing else is rendered.
    The *key* prefix keeps the widget state unique across multiple instances.
    """
    vectors_meta = nlp.meta.get("vectors", {})
    if title:
        st.header(title)
    # A zero/absent vector width means the model has no vectors to compare.
    if not vectors_meta.get("width", 0):
        st.warning("No vectors available in the model.")
        return
    col1, col2 = st.beta_columns(2)
    first = col1.text_input("Text or word 1", default_texts[0], key=f"{key}_similarity_text1")
    second = col2.text_input("Text or word 2", default_texts[1], key=f"{key}_similarity_text2")
    doc_a = nlp.make_doc(first)
    doc_b = nlp.make_doc(second)
    score = doc_a.similarity(doc_b)
    message = f"**Score:** `{score}`"
    (st.success if score > threshold else st.error)(message)
    info = st.beta_expander("Vector information")
    info.code(vectors_meta)
def segment_documents(nlp: spacy.language.Language, texts: List[str], section_titles: List[str]):
    """Segment XML sentences and generate a list of sentences.

    Runs the pipeline over *texts* (one text per section) and returns a dict:
      - ``'inputs'``: one record per sentence with ``text``, ``tokens``,
        ``sentence_id`` and ``word_count``
      - ``'section_names'``: the section titles, in input order
      - ``'section_lengths'``: sentence counts per section, aligned with names
    Sentence ids are 1-based and run continuously across all sections.
    """
    result = {'inputs': [], 'section_names': [], 'section_lengths': []}
    next_id = 1
    # Walk sections in lockstep with their processed docs.
    for section_title, doc in zip(section_titles, nlp.pipe(texts)):
        sentences_in_section = 0
        for sentence in doc.sents:
            words = [token.text for token in sentence]
            result['inputs'].append({
                "text": sentence.text,
                "tokens": words,
                "sentence_id": next_id,
                "word_count": len(words),
            })
            next_id += 1
            sentences_in_section += 1
        # Record per-section bookkeeping once all its sentences are emitted.
        result['section_lengths'].append(sentences_in_section)
        result['section_names'].append(section_title)
    return result
def train_pipeline(nlp: spacy.language.Language) -> None:
    """Train the text-categorizer component of *nlp* in place.

    Ensures a non-exclusive TEXTCAT pipe exists (documents may carry several
    labels at once), registers every label from ``CATEGORIES``, then runs 20
    epochs of minibatched updates with all other pipes disabled so only the
    categorizer's weights are modified. Uses the spaCy v2 training API
    (``begin_training`` / ``nlp.update``).
    """
    if TEXTCAT not in nlp.pipe_names:
        textcat = nlp.create_pipe(TEXTCAT, config={"exclusive_classes": False})
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe(TEXTCAT)
    for category in CATEGORIES:
        textcat.add_label(category.value)

    # Freeze everything except the categorizer (and the transformer helpers,
    # which must stay enabled for trf-based models).
    pipe_exceptions = {TEXTCAT, "trf_wordpiecer", "trf_tok2vec"}
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        all_data = list(get_classification_training_data())
        random.shuffle(all_data)
        # Hold out the last two shuffled examples for validation.
        training_data = all_data[:-2]
        # NOTE(review): validation_data is never evaluated — wire up an eval
        # step or drop the hold-out; kept here to preserve the intent.
        validation_data = all_data[-2:]
        optimizer = nlp.begin_training()
        for _ in range(20):
            losses: Dict[str, Any] = {}
            random.shuffle(training_data)
            batches = minibatch(training_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
def visualize_similarity(
    nlp: spacy.language.Language,
    default_texts: Tuple[str, str] = ("apple", "orange"),
    *,
    threshold: float = 0.5,
    title: Optional[str] = "Vectors & Similarity",
) -> None:
    """Visualizer for semantic similarity using word vectors.

    Renders two text inputs, scores ``doc1.similarity(doc2)``, and shows the
    result as a green (success) message when it exceeds *threshold*, red
    (error) otherwise. When the model has no vectors, only a warning and the
    (empty) vector metadata are shown.
    """
    meta = nlp.meta.get("vectors", {})
    if title:
        st.header(title)
    if not meta.get("width", 0):
        # Bug fix: previously the code fell through after this warning and
        # still rendered a similarity score, which is meaningless without
        # vectors (spaCy returns 0.0 and emits W008). Stop here instead.
        st.warning("No vectors available in the model.")
        st.code(meta)
        return
    st.code(meta)
    text1 = st.text_input("Text or word 1", default_texts[0])
    text2 = st.text_input("Text or word 2", default_texts[1])
    doc1 = nlp.make_doc(text1)
    doc2 = nlp.make_doc(text2)
    similarity = doc1.similarity(doc2)
    # Format the score as markdown, consistent with the keyed sibling
    # visualizer, rather than passing a bare float to st.success/st.error.
    similarity_text = f"**Score:** `{similarity}`"
    if similarity > threshold:
        st.success(similarity_text)
    else:
        st.error(similarity_text)