Example #1
def most_similar(tok, num_similar=20):
    """ FIXME: Use annoy to index word vectors and find most similar words to token str, id, or vector """
    raise NotImplementedError("Work in Progress, FIXME!")

    if _parse is None:
        nlp("hello", lang='en_core_web_lg')
        # FIXME: does this call update the module-level _parse, and does it populate WV and the other globals?
    stem = None
    idx = None
    vec = None

    if isinstance(tok, str):
        # TODO: if tok is long or contains punctuation/whitespace, tokenize it with the spacy pipeline and use the doc vector instead
        stem = tok
        # back off to progressively shorter prefixes until one is found in the vocabulary
        while stem and stem not in WV_IDS:
            stem = stem[:-1]
        idx = WORD2ID[stem]
    elif isinstance(tok, int):
        idx = tok
        stem = VOCAB[idx].text
    else:
        vec = np.array(tok)
        if vec.ndim == 2:
            # average a matrix of token vectors down to a single word vector
            if vec.shape[0] == WV.shape[1]:
                # vec is (dim, n_tokens): average across the token columns
                vec = vec.mean(axis=1)
            else:
                # vec is (n_tokens, dim): average across the token rows
                vec = vec.mean(axis=0)
        idx = ANN.get_nns_by_vector(vec, 1)[0]
        stem = VOCAB[idx].text
    if idx is not None:
        # snap to the nearest vocabulary entry so vec and stem stay consistent
        vec = WV[idx]
        stem = VOCAB[idx].text
    # returns annoy ids of the nearest neighbors; map them through VOCAB to recover the words themselves
    return ANN.get_nns_by_vector(vec, num_similar)
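
The stub above assumes an annoy index and vector/vocabulary globals built elsewhere in the module. Below is a minimal, hypothetical sketch of how those globals (WV, VOCAB, WORD2ID, WV_IDS, ANN) could be constructed with spacy and annoy; the names mirror the function above, but this is an assumption for illustration, not nlpia's actual setup code.

# Hypothetical construction of the globals used by most_similar() (an assumption, not nlpia's code)
import numpy as np
import spacy
from annoy import AnnoyIndex

nlp = spacy.load('en_core_web_lg')                     # large English model ships with 300-d word vectors
VOCAB = [lex for lex in nlp.vocab if lex.has_vector]   # lexemes that have a word vector
WV = np.array([lex.vector for lex in VOCAB])           # (num_words, 300) matrix of word vectors
WORD2ID = {lex.text: i for i, lex in enumerate(VOCAB)}
WV_IDS = set(WORD2ID)

ANN = AnnoyIndex(WV.shape[1], 'angular')               # cosine-like metric for word vectors
for i, vec in enumerate(WV):
    ANN.add_item(i, vec)
ANN.build(10)                                          # 10 trees; more trees = better recall, bigger index

# ids of the 20 nearest neighbors of "queen", mapped back to words
ids = ANN.get_nns_by_vector(WV[WORD2ID['queen']], 20)
print([VOCAB[i].text for i in ids])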
Example #2
def split_sentences_spacy(text, language_model='en'):
    r""" You must download a spacy language model with python -m download 'en'

    The default English language model for spacy tends to be a lot more agressive than NLTK's punkt:

    >>> split_sentences_nltk("Hi Ms. Lovelace.\nI'm a wanna-\nbe human @ I.B.M. ;) --Watson 2.0")
    ['Hi Ms. Lovelace.', "I'm a wanna-\nbe human @ I.B.M.", ';) --Watson 2.0']
    >>> split_sentences_spacy("Hi Ms. Lovelace.\nI'm a wanna-\nbe human @ I.B.M. ;) --Watson 2.0")
    ['Hi Ms. Lovelace.', "I'm a wanna-", 'be human @', 'I.B.M. ;) --Watson 2.0']

    >>> split_sentences_spacy("Hi Ms. Lovelace. I'm at I.B.M. --Watson 2.0")
    ['Hi Ms. Lovelace.', "I'm at I.B.M. --Watson 2.0"]
    >>> split_sentences_nltk("Hi Ms. Lovelace. I'm at I.B.M. --Watson 2.0")
    ['Hi Ms. Lovelace.', "I'm at I.B.M.", '--Watson 2.0']
    """
    doc = nlp(text)
    sentences = []
    if not hasattr(doc, 'sents'):
        logger.warning(
            "Using NLTK sentence tokenizer because SpaCy language model hasn't been loaded"
        )
        return split_sentences_nltk(text)
    for span in doc.sents:
        # token.string is the token text plus trailing whitespace (use token.text_with_ws in spaCy 3+)
        sent = ''.join(doc[i].string
                       for i in range(span.start, span.end)).strip()
        if len(sent):
            sentences.append(sent)
    return sentences
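
split_sentences_spacy relies on a module-level nlp pipeline, a logger, and the split_sentences_nltk fallback defined elsewhere in the module. A minimal sketch of those pieces follows; it assumes any spacy pipeline with a parser (or sentencizer) and NLTK's punkt tokenizer are acceptable stand-ins for the module's own setup.

# Sketch of the module-level dependencies assumed by split_sentences_spacy()
# (names match the code above; the construction here is an assumption)
import logging
import nltk.data
import spacy

logger = logging.getLogger(__name__)
nlp = spacy.load('en_core_web_sm')   # any pipeline with a parser or sentencizer provides doc.sents

def split_sentences_nltk(text):
    """ punkt fallback; run nltk.download('punkt') once beforehand """
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    return tokenizer.tokenize(text)

print(split_sentences_spacy("Hi Ms. Lovelace. I'm at I.B.M. --Watson 2.0"))
# ['Hi Ms. Lovelace.', "I'm at I.B.M. --Watson 2.0"]   (expected output per the doctest above)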
Example #3
def get_anki_vocab(lang=['eng'], limit=None, filename='anki_en_vocabulary.csv'):
    """ Get all the vocab words+tags+wordvectors for the tokens in the Anki translation corpus

    Returns a DataFrame with columns: word, pos, tag, dep, ent, ent_iob, sentiment, vector
    """
    texts = get_anki_phrases(lang=lang, limit=limit)
    docs = nlp(texts, lang=lang)
    vocab = get_vocab(docs)
    vocab['vector'] = get_word_vectors(vocab)  # TODO: turn this into a KeyedVectors object
    if filename:
        vocab.to_csv(os.path.join(BIGDATA_PATH, filename))
    return vocab
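
One way the TODO in get_anki_vocab could be addressed is to pack the word and vector columns into a gensim KeyedVectors object. The sketch below is an illustration, not part of the library: it assumes a gensim 4.x install and that vocab has the 'word' and 'vector' columns described in the docstring.

# Hedged sketch of the "turn this into a KeyedVectors object" TODO
# (assumes gensim 4.x and the vocab DataFrame layout described in the docstring above)
import numpy as np
from gensim.models import KeyedVectors

vocab = get_anki_vocab(limit=1000)                  # small limit for a quick test
vectors = np.vstack(vocab['vector'].values)         # one row per word
kv = KeyedVectors(vector_size=vectors.shape[1])
kv.add_vectors(vocab['word'].tolist(), vectors)
print(kv.most_similar(vocab['word'].iloc[0], topn=5))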