def parse_clusters(kmeans, w2v, n_get=50, n_try=20, stemmeq=True):
    """Summarize a fitted k-means model by the words nearest each centroid.

    Parameters
    ----------
    kmeans : fitted clustering model exposing ``cluster_centers_``.
    w2v : word-vector model exposing gensim-style ``most_similar``.
    n_get : int
        Number of candidate neighbors to fetch per center when filtering
        by stem (only used when ``stemmeq`` is True).
    n_try : int
        Number of terms to keep per cluster.
    stemmeq : bool
        If True, keep searching through the ``n_get`` closest words,
        skipping any whose stem duplicates an already-kept term, until
        ``n_try`` distinct-stem terms are collected.

    Returns
    -------
    dict
        cluster index -> comma-joined string of words (``stemmeq=False``)
        or list of up to ``n_try`` words (``stemmeq=True``).
    """
    centers = kmeans.cluster_centers_

    if not stemmeq:
        # No stem filtering: take the top-n_try neighbors verbatim.
        return {
            i: ','.join(word for word, _ in w2v.most_similar([center], topn=n_try))
            for i, center in enumerate(centers)
        }

    # FIX: original used defaultdict() with no default_factory, which behaves
    # exactly like a plain dict but misleads readers; use {} instead.
    parsed = {}
    for i, center in enumerate(centers):
        terms = []
        for word, _score in w2v.most_similar([center], topn=n_get):
            if len(terms) == n_try:
                break
            # Keep a candidate only if its stem is new among kept terms.
            # (Dead `else: continue` from the original removed.)
            if stemmer(word) not in [stemmer(t) for t in terms]:
                terms.append(word)
        parsed[i] = terms
    return parsed
def parse_clusters(kmeans, w2v, n_get=50, n_try=20, stemmeq=True):
    """Parse a k-means model by returning the words closest to each center.

    If ``stemmeq`` is True, scan through up to ``n_get`` nearest neighbors
    per center and collect the first ``n_try`` whose stems are pairwise
    distinct; otherwise simply join the ``n_try`` nearest words.

    Parameters
    ----------
    kmeans : fitted model exposing ``cluster_centers_``.
    w2v : model exposing gensim-style ``most_similar([vector], topn=...)``.
    n_get : int, candidate pool size used for stem-deduplication.
    n_try : int, number of terms kept per cluster.
    stemmeq : bool, enable stem-based deduplication.

    Returns
    -------
    dict : cluster index -> comma-joined word string (``stemmeq=False``)
        or list of words (``stemmeq=True``).
    """
    centers = kmeans.cluster_centers_

    if not stemmeq:
        return {
            idx: ','.join(pair[0] for pair in w2v.most_similar([c], topn=n_try))
            for idx, c in enumerate(centers)
        }

    # FIX: replaced defaultdict() (no default_factory -> identical to dict,
    # but misleading) with a plain dict, and dropped the no-op
    # `else: continue` at the end of the candidate loop.
    parsed = {}
    for idx, c in enumerate(centers):
        kept = []
        kept_stems = []
        for word, _sim in w2v.most_similar([c], topn=n_get):
            if len(kept) == n_try:
                break
            s = stemmer(word)
            if s not in kept_stems:
                kept.append(word)
                kept_stems.append(s)
        parsed[idx] = kept
    return parsed
def tfidf_weighted_avg(pno, w2v_model, db):
    """Compute the tf-idf-weighted average word2vec representation of a doc.

    Still makes two database queries (one for raw text, one for the tf-idf
    bag-of-words record); TODO: collapse into one.

    Parameters
    ----------
    pno : patent identifier; ``_id`` in ``db.pat_text`` and ``pno`` in
        ``db.patns`` (pymongo-style collections — assumed, confirm).
    w2v_model : word-vector model supporting ``model[word]`` lookup that
        raises KeyError for out-of-vocabulary words.
    db : database handle exposing ``pat_text`` and ``patns`` collections.

    Returns
    -------
    numpy array: (1 / n_words) * sum_i tfidf_i * vec_i.

    Raises
    ------
    RuntimeError
        If the doc has no text, yields no tokens, or has no tf-idf record.
    """
    # FIX: find_one returns None for a missing doc; the original would have
    # raised an uncaught AttributeError here.
    doc = db.pat_text.find_one({'_id': pno})
    text = doc.get('patText', '') if doc else ''
    if text == '':
        raise RuntimeError('doc has no text.')

    words = _tokenizer.tokenize(text)
    # FIX: guard against whitespace-only text, which previously caused a
    # ZeroDivisionError in the final average.
    if not words:
        raise RuntimeError('doc has no tokenizable words.')
    stemmed_words = [stemmer(word) for word in words]

    # FIX: bare `except:` replaced with an explicit None check — the only
    # failure mode the original guarded was find_one returning None.
    record = db.patns.find_one({'pno': pno})
    if record is None:
        raise RuntimeError("No patent {} in {}".format(pno, db.patns))
    bow = record.get('text', {})

    # tf-idf per token (aligned with words/stemmed_words); 0 when the stem
    # is absent from the doc's bag-of-words record.
    tfidfs = [bow.get(stem, {}).get('tf-idf', 0) for stem in stemmed_words]

    def getvec(word, model):
        # FIX: catch only KeyError (out-of-vocabulary word) instead of a
        # bare except; fall back to a zero vector of the model's dimension.
        try:
            return model[word]
        except KeyError:
            return np.zeros(len(model['dna']))

    vecs = [getvec(word, w2v_model) for word in words]
    weighted_vecs = np.array([v * w for v, w in zip(vecs, tfidfs)])
    # All four lists are built from `words`, so lengths agree by
    # construction (the original asserted this; assert is stripped by -O).
    return np.sum(weighted_vecs, axis=0) / len(words)
def tfidf_weighted_avg(pno, w2v_model, db):
    """Return the tf-idf-weighted average word2vec vector for one document.

    Implementation note: this makes two database round-trips (raw text,
    then the tf-idf record); acceptable but worth consolidating.

    Parameters
    ----------
    pno : document id (``_id`` in ``db.pat_text``, ``pno`` in ``db.patns``).
    w2v_model : mapping-style word-vector model; ``model[word]`` raises
        KeyError for out-of-vocabulary words.
    db : handle with pymongo-style ``find_one`` collections (assumed —
        confirm against caller).

    Returns
    -------
    numpy array equal to ``sum_i(tfidf_i * vec_i) / n_words``.

    Raises
    ------
    RuntimeError on missing/empty text or missing tf-idf record.
    """
    # FIX: handle find_one() -> None explicitly instead of crashing with
    # AttributeError on .get.
    text_doc = db.pat_text.find_one({'_id': pno})
    text = text_doc.get('patText', '') if text_doc else ''
    if text == '':
        raise RuntimeError('doc has no text.')

    words = _tokenizer.tokenize(text)
    # FIX: previously a whitespace-only document reached the final division
    # with len(words) == 0 and raised ZeroDivisionError.
    if not words:
        raise RuntimeError('doc has no tokenizable words.')
    stemmed_words = [stemmer(w) for w in words]

    # FIX: replaced a bare `except:` (which hid real errors) with the one
    # condition it was guarding — a missing patent record.
    patn_doc = db.patns.find_one({'pno': pno})
    if patn_doc is None:
        raise RuntimeError("No patent {} in {}".format(pno, db.patns))
    bow = patn_doc.get('text', {})

    # Per-token tf-idf weights, aligned with words/stemmed_words;
    # 0 marks a stem absent from the bag-of-words record.
    tfidfs = [bow.get(stem, {}).get('tf-idf', 0) for stem in stemmed_words]

    def getvec(word, model):
        # FIX: narrow bare `except:` to KeyError (out-of-vocabulary word);
        # use a zero vector of the model's dimensionality as the fallback.
        try:
            return model[word]
        except KeyError:
            return np.zeros(len(model['dna']))

    vecs = [getvec(w, w2v_model) for w in words]
    weighted = np.array([v * t for v, t in zip(vecs, tfidfs)])
    # Lengths of words/stemmed_words/tfidfs/vecs agree by construction
    # (replaces the original runtime assert, which -O would strip).
    return 1. / len(words) * np.sum(weighted, axis=0)
def tfidf_weighted_avg(pno, w2v_model, db):
    """Compute the tf-idf-weighted average word2vec representation of a doc.

    Unlike earlier versions, this variant returns ``[]`` (an empty list
    sentinel) instead of a zero vector when every tf-idf weight is zero or
    the weighted average itself is all zeros — callers use this to avoid
    storing zero vectors in the db.

    #TODO: write this in a way that doesn't make two database queries.

    Parameters
    ----------
    pno : document id (``_id`` in ``db.pat_text``, ``pno`` in ``db.patns``).
    w2v_model : word-vector model; ``model[word]`` raises KeyError for
        out-of-vocabulary words.
    db : handle exposing pymongo-style ``find_one`` collections.

    Returns
    -------
    numpy array of the weighted average, or ``[]`` when it would be all
    zeros (or no term has a nonzero tf-idf).

    Raises
    ------
    RuntimeError on missing/empty text or missing tf-idf record.
    """
    # FIX: find_one may return None; the original would crash with an
    # uncaught AttributeError on .get.
    text_doc = db.pat_text.find_one({'_id': pno})
    text = text_doc.get('patText', '') if text_doc else ''
    if text == '':
        raise RuntimeError('doc has no text.')

    # words is the token list of the text; stemmed_words lines up with it.
    words = _tokenizer.tokenize(text)
    stemmed_words = [stemmer(word) for word in words]

    # FIX: bare `except:` replaced with an explicit None check — the only
    # case it guarded is a missing patent record.
    record = db.patns.find_one({'pno': pno})
    if record is None:
        raise RuntimeError("No patent {} in {}".format(pno, db.patns))
    # bow contains tf-idf stats for each stem in the doc.
    bow = record.get('text', {})

    # tf-idf per token; 0 indicates the term does not occur in the record.
    tfidfs = [bow.get(stem, {}).get('tf-idf', 0) for stem in stemmed_words]
    # No nonzero weight (this also covers the empty-token case, since
    # any() of an empty sequence is False): don't store a zero vector.
    # (Replaces `all(x == 0 ...) or len(tfidfs) == 0`, where the length
    # check was redundant — all() of an empty sequence is already True.)
    if not any(tfidfs):
        return []

    def getvec(word, model):
        # FIX: narrow bare `except:` to KeyError (out-of-vocabulary word).
        try:
            return model[word]
        except KeyError:
            return np.zeros(len(model['dna']))

    vecs = [getvec(word, w2v_model) for word in words]
    weighted_vecs = np.array([v * t for v, t in zip(vecs, tfidfs)])
    # Lengths agree by construction (replaces the original runtime assert,
    # which -O would strip).
    docvec = 1. / len(words) * np.sum(weighted_vecs, axis=0)
    # All weighted word vectors cancelled/were zero: return the sentinel.
    if not docvec.any():
        return []
    return docvec