Example #1
def parse_clusters(kmeans, w2v, n_get=50, n_try=20, stemmeq=True):
    """
    Parses a kmeans model by returning the words whose vectors lie
    closest to each cluster center. If stemmeq is True, keep searching
    through the n_get closest words until up to n_try words with
    distinct stems have been collected. Returns a dict mapping cluster
    index to a comma-joined string (stemmeq=False) or to a list of
    words (stemmeq=True).
    """
    centers = kmeans.cluster_centers_
    if not stemmeq:
        # Label each cluster with the n_try words nearest its center.
        parsed = {
            i: ','.join(word for word, _ in
                        w2v.most_similar([center], topn=n_try))
            for i, center in enumerate(centers)
        }
    else:
        parsed = {}
        for i, center in enumerate(centers):
            terms = []
            for word, _ in w2v.most_similar([center], topn=n_get):
                if len(terms) == n_try:
                    break
                # Keep the word only if its stem is new to this cluster.
                if stemmer(word) not in [stemmer(t) for t in terms]:
                    terms.append(word)
            parsed[i] = terms
    return parsed
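
The snippet above assumes a module-level stemmer and a gensim model whose most_similar accepts raw vectors. A minimal setup-and-call sketch, assuming an NLTK Snowball stemmer and a scikit-learn KMeans; the model path, cluster count, and stemmer choice are illustrative assumptions, not confirmed by the example:

# Hypothetical setup assumed by parse_clusters.
from gensim.models import KeyedVectors
from nltk.stem.snowball import SnowballStemmer
from sklearn.cluster import KMeans

_snowball = SnowballStemmer('english')

def stemmer(word):
    # Reduce a word to its stem, e.g. 'running' -> 'run'.
    return _snowball.stem(word)

w2v = KeyedVectors.load('patent_vectors.kv')      # hypothetical path
kmeans = KMeans(n_clusters=25).fit(w2v.vectors)   # cluster the vocabulary
labels = parse_clusters(kmeans, w2v, n_try=10)
print(labels[0])  # ten stem-distinct words nearest cluster 0's center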
Example #2
def tfidf_weighted_avg(pno, w2v_model, db):
    """
    Computes the tf-idf-weighted average representation of a doc
    in a given word2vec model.
    This is poorly implemented in that it makes two database queries. Ugh.
    """
    text = db.pat_text.find_one({'_id': pno}).get('patText', '')
    if text == '':
        raise RuntimeError('doc has no text.')
    words = _tokenizer.tokenize(text)
    stemmed_words = [stemmer(word) for word in words]
    # The second query fetches per-stem tf-idf stats for this patent.
    doc = db.patns.find_one({'pno': pno})
    if doc is None:
        raise RuntimeError("No patent {} in {}".format(pno, db.patns))
    bow = doc.get('text', {})
    # One tf-idf weight per token; 0 if the stem does not occur in the doc.
    tfidfs = [bow.get(stem, {}).get('tf-idf', 0) for stem in stemmed_words]

    def getvec(word, model):
        # Fall back to a zero vector for out-of-vocabulary words; 'dna' is
        # an arbitrary in-vocabulary word used to read off the dimensionality.
        try:
            return model[word]
        except KeyError:
            return np.zeros(len(model['dna']))

    vecs = [getvec(word, w2v_model) for word in words]
    weighted_vecs = np.array(
        [vec * tfidf for (vec, tfidf) in zip(vecs, tfidfs)])
    assert (len(tfidfs) == len(words) == len(stemmed_words) == len(vecs) ==
            len(weighted_vecs))
    return 1. / len(words) * np.sum(weighted_vecs, axis=0)
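
The return line is just the arithmetic mean of the weighted vectors: each word vector is scaled by its tf-idf score and the results are averaged over all tokens. An equivalent vectorized form, assuming the same vecs and tfidfs as above (a sketch, not a drop-in from the source):

import numpy as np

# Equivalent to the function's return value: stack the word vectors into
# an (N, d) matrix, scale each row by its tf-idf weight, average over rows.
docvec = (np.asarray(vecs) * np.asarray(tfidfs)[:, None]).mean(axis=0)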
Example #3
def tfidf_weighted_avg(pno, w2v_model, db):
    """
    Computes the tf-idf-weighted average representation of a doc
    in a given word2vec model. #TODO: write this in a way that doesn't
    make two database queries.
    """
    text = db.pat_text.find_one({'_id': pno}).get('patText', '')
    if text == '':
        raise RuntimeError('doc has no text.')
    # words is a list of words in the text.
    words = _tokenizer.tokenize(text)
    stemmed_words = [stemmer(word) for word in words]
    # bow contains tf-idf stats for each word in the doc.
    doc = db.patns.find_one({'pno': pno})
    if doc is None:
        raise RuntimeError("No patent {} in {}".format(pno, db.patns))
    bow = doc.get('text', {})
    # tf-idf weight for each word. Lines up with words and stemmed_words;
    # 0 indicates the term does not occur.
    tfidfs = [bow.get(stem, {}).get('tf-idf', 0) for stem in stemmed_words]

    # We don't want any zero vectors in the db.
    if not any(tfidfs):
        return []

    def getvec(word, model):
        # Fall back to a zero vector for out-of-vocabulary words; 'dna' is
        # an arbitrary in-vocabulary word used to read off the dimensionality.
        try:
            return model[word]
        except KeyError:
            return np.zeros(len(model['dna']))

    vecs = [getvec(word, w2v_model) for word in words]
    weighted_vecs = np.array(
        [vec * tfidf for (vec, tfidf) in zip(vecs, tfidfs)])
    assert (len(tfidfs) == len(words) == len(stemmed_words)
            == len(vecs) == len(weighted_vecs))
    docvec = 1. / len(words) * np.sum(weighted_vecs, axis=0)
    if not np.any(docvec):
        return []
    return docvec
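
Since this variant signals a degenerate doc by returning an empty list rather than an ndarray, callers need a length check before storing or comparing results. A hedged caller sketch; the Mongo URI, database name, model path, and patent numbers are all illustrative assumptions, and the module is assumed to define _tokenizer and stemmer as in the examples above:

# Hypothetical caller: collect vectors, skipping docs with no usable
# tf-idf weights rather than storing all-zero vectors.
from pymongo import MongoClient
from gensim.models import KeyedVectors

db = MongoClient('mongodb://localhost:27017')['patents']  # assumed DB name
model = KeyedVectors.load('patent_vectors.kv')            # assumed path
pnos = [7654321, 7654322]                                 # illustrative ids

doc_vectors = {}
for pno in pnos:
    vec = tfidf_weighted_avg(pno, model, db)
    if len(vec) == 0:
        continue  # skip docs with no usable tf-idf weights
    doc_vectors[pno] = vec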