def most_similar(tok, num_similar=20):
    """ FIXME: Use annoy to index word vectors and find most similar words to token str, id, or vector """
    raise NotImplementedError("Work in Progress, FIXME!")
    if _parse is None:
        nlp("hello", lang='en_core_web_lg')  # FIXME: is the _parse variable updated by this, will WV and other globals work?
    stem = None
    idx = None
    vec = None
    if isinstance(tok, str):
        # TODO: if tok is long or has punctuation/whitespace in it, use the spacy pipeline to tokenize it and compute the docvec
        stem = tok
        while stem not in WV_IDS:
            stem = stem[:-1]  # FIXME: fails if no prefix of tok is in the vocabulary (stem shrinks to '')
        idx = WORD2ID[stem]
    elif isinstance(tok, int):
        idx = tok
        stem = VOCAB[idx].text
    else:
        vec = np.array(tok)
        if len(vec.shape) == 2:
            # average a 2-D array of word vectors into a single query vector, whichever way it is oriented
            if vec.shape[0] == WV.shape[1]:
                vec = vec.mean(axis=1)  # (dim, num_words): vectors are columns, average across columns
            else:
                vec = vec.mean(axis=0)  # (num_words, dim): vectors are rows, average across rows
        idx = ANN.get_nns_by_vector(vec, 1)[0]
        stem = VOCAB[idx].text
    if idx is not None:
        vec = WV[idx]
        stem = VOCAB[idx].text
    return ANN.get_nns_by_vector(vec, num_similar)
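
# The FIXME above depends on a module-level AnnoyIndex (`ANN`) built over the word-vector
# matrix `WV`. The helper below is a hedged sketch of how such an index could be built
# with the documented annoy API; `build_annoy_index` is a hypothetical name, not part of
# the original module.
def build_annoy_index(word_vectors, n_trees=10, metric='angular'):
    """ Index the rows of a (num_words, dim) word-vector array with annoy (hypothetical helper) """
    from annoy import AnnoyIndex

    dim = word_vectors.shape[1]
    index = AnnoyIndex(dim, metric)       # 'angular' metric approximates cosine distance
    for i, vec in enumerate(word_vectors):
        index.add_item(i, list(vec))      # item id i == row number in the word-vector matrix
    index.build(n_trees)                  # more trees => better recall, slower build and larger index
    return index
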
def split_sentences_spacy(text, language_model='en'):
    r""" Split text into sentences using the SpaCy parser's sentence segmenter

    You must first download a SpaCy language model with `python -m spacy download en`

    The default English language model for SpaCy tends to be a lot more aggressive than NLTK's punkt:

    >>> split_sentences_nltk("Hi Ms. Lovelace.\nI'm a wanna-\nbe human @ I.B.M. ;) --Watson 2.0")
    ['Hi Ms. Lovelace.', "I'm a wanna-\nbe human @ I.B.M.", ';) --Watson 2.0']
    >>> split_sentences_spacy("Hi Ms. Lovelace.\nI'm a wanna-\nbe human @ I.B.M. ;) --Watson 2.0")
    ['Hi Ms. Lovelace.', "I'm a wanna-", 'be human @', 'I.B.M. ;) --Watson 2.0']

    >>> split_sentences_spacy("Hi Ms. Lovelace. I'm at I.B.M. --Watson 2.0")
    ['Hi Ms. Lovelace.', "I'm at I.B.M. --Watson 2.0"]
    >>> split_sentences_nltk("Hi Ms. Lovelace. I'm at I.B.M. --Watson 2.0")
    ['Hi Ms. Lovelace.', "I'm at I.B.M.", '--Watson 2.0']
    """
    doc = nlp(text)
    if not hasattr(doc, 'sents'):
        logger.warning("Using NLTK sentence tokenizer because SpaCy language model hasn't been loaded")
        return split_sentences_nltk(text)
    sentences = []
    for span in doc.sents:
        sent = ''.join(doc[i].string for i in range(span.start, span.end)).strip()
        if len(sent):
            sentences.append(sent)
    return sentences
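
# `split_sentences_spacy` relies on a module-level `nlp` callable. The helper below is a
# hedged sketch (assuming SpaCy v3's API) of one way to obtain such a callable;
# `load_sentence_splitter` is a hypothetical name, not this module's actual loader,
# which wraps SpaCy and accepts a `lang=` keyword.
def load_sentence_splitter(model='en_core_web_sm'):
    """ Return a SpaCy pipeline whose `doc.sents` works (hypothetical helper) """
    import spacy

    try:
        return spacy.load(model)      # requires: python -m spacy download en_core_web_sm
    except OSError:
        blank = spacy.blank('en')     # tokenizer-only pipeline, no statistical parser
        blank.add_pipe('sentencizer')  # rule-based sentence boundaries so doc.sents works
        return blank
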
def get_anki_vocab(lang=['eng'], limit=None, filename='anki_en_vocabulary.csv'):
    """ Get all the vocab words + tags + word vectors for the tokens in the Anki translation corpus

    Returns a DataFrame with columns: word, pos, tag, dep, ent, ent_iob, sentiment, vector
    """
    texts = get_anki_phrases(lang=lang, limit=limit)
    docs = nlp(texts, lang=lang)
    vocab = get_vocab(docs)
    vocab['vector'] = get_word_vectors(vocab)  # TODO: turn this into a KeyedVectors object
    if filename:
        vocab.to_csv(os.path.join(BIGDATA_PATH, filename))
    return vocab
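
# A possible follow-up to the TODO above: pack the vocab DataFrame's words and vectors into
# a gensim KeyedVectors object. This is a hedged sketch assuming gensim >= 4.0 and that the
# DataFrame has a `word` column alongside the `vector` column; `vocab_to_keyedvectors` is a
# hypothetical name, not part of the original module.
def vocab_to_keyedvectors(vocab):
    """ Build a gensim KeyedVectors index from a vocab DataFrame (hypothetical helper) """
    import numpy as np
    from gensim.models import KeyedVectors

    vectors = np.vstack(vocab['vector'].values)      # stack per-word vectors into a (num_words, dim) matrix
    kv = KeyedVectors(vector_size=vectors.shape[1])
    kv.add_vectors(vocab['word'].tolist(), vectors)  # register each word with its vector
    return kv
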