def test(n_docs=100):
    """Pretty-print the ``wordvec_clusters`` update for a sample of ``traits``.

    Args:
        n_docs: Upper bound passed to the cursor's ``limit``.

    Only documents matching the query below are read: ``doc_vec`` must exist
    and is filtered with ``$nin`` against a single list of 300 zeros, and
    ``top_tf-idf`` is filtered with ``$nin`` against the empty list.
    Nothing is written back; each update document is only printed.
    """
    patents = MongoClient().patents
    w2v, kmeans = model_loader(300, 200)

    zero_vec = [0] * 300
    query = {
        'doc_vec': {'$exists': True, '$nin': [zero_vec]},
        'top_tf-idf': {'$nin': [[]]},
    }

    cursor = patents.traits.find(query).limit(n_docs)
    for record in cursor:
        distances = cluster_distances(patents, record['_id'], w2v, kmeans)
        pprint({'$set': {'wordvec_clusters': distances}})
def test():
    """Load the 300/200 model pair and compute distances for ``_friendly_patents``.

    Returns:
        A 4-tuple ``(w2v, kmeans, dists, parsed)`` where ``dists`` maps each
        name in ``_friendly_patents`` to the result of ``cluster_distances``
        for its patent number, and ``parsed`` is ``parse_clusters(kmeans, w2v)``.
    """
    patents = MongoClient().patents
    w2v, kmeans = model_loader(300, 200)

    dists = {}
    for label, patent_no in _friendly_patents:
        dists[label] = cluster_distances(patents, patent_no, w2v, kmeans)

    parsed = parse_clusters(kmeans, w2v)
    return w2v, kmeans, dists, parsed
def main():
    """Run ``parallelMap`` over ``traits`` to fill in ``wordvec_clusters``.

    The per-document function builds a ``$set`` update whose value comes from
    ``cluster_distances``. ``traits`` is used as both the input and the output
    collection, and only ``_id`` is requested from the matching documents.
    """
    patents = MongoClient().patents
    w2v, kmeans = model_loader(300, 200)

    def part_func(doc):
        distances = cluster_distances(patents, doc['_id'], w2v, kmeans)
        return {'$set': {'wordvec_clusters': distances}}

    # Filter: doc_vec present and $nin a 300-zero list; top_tf-idf $nin [].
    zero_vec = [0] * 300
    spec = {
        'doc_vec': {'$exists': True, '$nin': [zero_vec]},
        'top_tf-idf': {'$nin': [[]]},
    }
    find_args = {'spec': spec, 'fields': {'_id': 1}}

    parallelMap(
        part_func,
        in_collection=patents.traits,
        out_collection=patents.traits,
        findArgs=find_args,
        updateFreq=500,
        bSize=1000,
    )
def test():
    """Return the loaded models plus distances for every friendly patent.

    Returns:
        ``(w2v, kmeans, dists, parse_clusters(kmeans, w2v))``; ``dists`` is a
        dict keyed by the names found in ``_friendly_patents``.
    """
    conn = MongoClient().patents
    w2v, kmeans = model_loader(300, 200)

    # Build the name -> distances mapping from (name, value) pairs.
    dists = dict(
        (name, cluster_distances(conn, pno, w2v, kmeans))
        for (name, pno) in _friendly_patents
    )

    return w2v, kmeans, dists, parse_clusters(kmeans, w2v)
def test(n_docs=100):
    """Pretty-print the ``wordvec_clusters`` update for the first ``traits`` docs.

    Args:
        n_docs: Upper bound passed to the cursor's ``limit``.

    The collection is read without a filter. For each document the update is
    built from ``cluster_distances``; when that call fails the update carries
    an empty list instead. Nothing is written back to the database.
    """
    db = MongoClient().patents
    w2v, kmeans = model_loader(300, 200)

    def part_func(doc):
        try:
            clusters = cluster_distances(db, doc["_id"], w2v, kmeans)
        except Exception:
            # Best-effort fallback for a document that cannot be processed.
            # Deliberately not a bare ``except:`` so that KeyboardInterrupt
            # and SystemExit still stop the run.
            clusters = []
        return {"$set": {"wordvec_clusters": clusters}}

    for doc in db.traits.find().limit(n_docs):
        pprint(part_func(doc))
def main():
    """Apply the ``wordvec_clusters`` update to matching ``traits`` documents.

    Delegates the iteration to ``parallelMap``, reading from and writing to
    ``db.traits`` with ``updateFreq=500`` and ``bSize=1000``.
    """
    database = MongoClient().patents
    w2v, kmeans = model_loader(300, 200)

    def part_func(doc):
        update = {
            'wordvec_clusters': cluster_distances(database, doc['_id'], w2v, kmeans),
        }
        return {'$set': update}

    # doc_vec must exist and is checked with $nin against one 300-zero list.
    doc_vec_filter = {'$exists': True, '$nin': [[0] * 300]}
    # top_tf-idf is checked with $nin against the empty list.
    tfidf_filter = {'$nin': [[]]}

    parallelMap(
        part_func,
        in_collection=database.traits,
        out_collection=database.traits,
        findArgs={
            'spec': {'doc_vec': doc_vec_filter, 'top_tf-idf': tfidf_filter},
            'fields': {'_id': 1},
        },
        updateFreq=500,
        bSize=1000,
    )
def main():
    """Run ``parallelMap`` over every ``traits`` document to set ``wordvec_clusters``.

    The find arguments carry an empty ``spec`` and empty ``fields``. For each
    document the update value comes from ``cluster_distances``; when that call
    fails the document receives an empty list instead of aborting the run.
    """
    db = MongoClient().patents
    w2v, kmeans = model_loader(300, 200)

    def part_func(doc):
        try:
            clusters = cluster_distances(db, doc["_id"], w2v, kmeans)
        except Exception:
            # Best-effort fallback for a document that cannot be processed.
            # Deliberately not a bare ``except:`` so that KeyboardInterrupt
            # and SystemExit are not swallowed while the job is running.
            clusters = []
        return {"$set": {"wordvec_clusters": clusters}}

    parallelMap(
        part_func,
        in_collection=db.traits,
        out_collection=db.traits,
        findArgs={"spec": {}, "fields": {}},
        updateFreq=500,
        bSize=1000,
    )
def test(n_docs=100):
    """Print the update document computed for up to ``n_docs`` matching docs.

    Args:
        n_docs: Upper bound passed to the cursor's ``limit``.

    Reads ``db.traits`` with a filter on ``doc_vec`` (must exist, ``$nin`` a
    single 300-zero list) and ``top_tf-idf`` (``$nin`` the empty list), then
    pretty-prints one ``$set`` document per result without saving it.
    """
    db = MongoClient().patents
    w2v, kmeans = model_loader(300, 200)

    def part_func(doc):
        value = cluster_distances(db, doc['_id'], w2v, kmeans)
        return {'$set': {'wordvec_clusters': value}}

    criteria = {
        'doc_vec': {'$exists': True, '$nin': [[0] * 300]},
        'top_tf-idf': {'$nin': [[]]},
    }
    matches = db.traits.find(criteria).limit(n_docs)

    # map() is lazy, so each document is still processed and printed in turn.
    for update in map(part_func, matches):
        pprint(update)
# NOTE(review): the statements up to the blank lines arrive here without an
# enclosing header -- the line starts mid-definition. They read pnos,
# w2v_model, cluster_model, outdir and parsed_clusters, none of which is bound
# in the visible text. Presumably this is the tail of the function the script
# entry point calls as model_report; TODO confirm and re-nest under that header.
dists = {name: cluster_distances(db, pno, w2v_model, cluster_model) for name, pno in pnos}
cluster_parse_fn = '/'.join([outdir, 'parsed_clusters.p'])
dist_fn = '/'.join([outdir, 'patent_dists.p'])
tsne_fn = '/'.join([outdir, 'embedding_fig_tsne.png'])
save_dict(cluster_parse_fn, parsed_clusters)
save_dict(dist_fn, dists)
embedding_fig(w2v_model, cluster_model, savefn=tsne_fn, n=150)


def test():
    """Load the 300/200 model pair and compute distances for ``_friendly_patents``.

    Returns:
        ``(w2v, kmeans, dists, parse_clusters(kmeans, w2v))`` where ``dists``
        maps each name in ``_friendly_patents`` to its ``cluster_distances``
        result.
    """
    db = MongoClient().patents
    w2v, kmeans = model_loader(300, 200)
    dists = {
        name: cluster_distances(db, pno, w2v, kmeans)
        for (name, pno) in _friendly_patents
    }
    return w2v, kmeans, dists, parse_clusters(kmeans, w2v)


if __name__ == '__main__':
    # Expect exactly three arguments after the script name.
    if len(sys.argv) != 4:
        # sys.exit rather than the interactive-session helper exit().
        sys.exit("Usage: python {} <wordvec_size> <n_clusters> <out directory>".format(sys.argv[0]))

    db = MongoClient().patents
    w2v_dim = int(sys.argv[1])
    n_clusters = int(sys.argv[2])
    outdir = sys.argv[3]

    try:
        w2v, kmeans = model_loader(w2v_dim, n_clusters)
    except Exception:
        # Not a bare ``except:`` so Ctrl-C during loading is not reported as
        # a missing model.
        sys.exit("Model with vector dimension {} and {} clusters not found.".format(w2v_dim, n_clusters))

    model_report(db, _friendly_patents, w2v, kmeans, outdir)
# NOTE(review): the three calls below arrive here without an enclosing header
# -- the line starts mid-definition. They read cluster_parse_fn,
# parsed_clusters, dist_fn, dists, w2v_model, cluster_model and tsne_fn, none
# of which is bound in the visible text. Presumably this is the tail of the
# function the script entry point calls as model_report; TODO confirm and
# re-nest under that header.
save_dict(cluster_parse_fn, parsed_clusters)
save_dict(dist_fn, dists)
embedding_fig(w2v_model, cluster_model, savefn=tsne_fn, n=150)


def test():
    """Return the loaded models plus distances for every friendly patent.

    Returns:
        ``(w2v, kmeans, dists, parse_clusters(kmeans, w2v))``; ``dists`` is a
        dict keyed by the names found in ``_friendly_patents``.
    """
    db = MongoClient().patents
    w2v, kmeans = model_loader(300, 200)
    dists = {
        name: cluster_distances(db, pno, w2v, kmeans)
        for (name, pno) in _friendly_patents
    }
    return w2v, kmeans, dists, parse_clusters(kmeans, w2v)


if __name__ == '__main__':
    # Expect exactly three arguments after the script name.
    if len(sys.argv) != 4:
        # sys.exit rather than the interactive-session helper exit().
        sys.exit(
            "Usage: python {} <wordvec_size> <n_clusters> <out directory>".format(
                sys.argv[0]))

    db = MongoClient().patents
    w2v_dim = int(sys.argv[1])
    n_clusters = int(sys.argv[2])
    outdir = sys.argv[3]

    try:
        w2v, kmeans = model_loader(w2v_dim, n_clusters)
    except Exception:
        # Not a bare ``except:`` so Ctrl-C during loading is not reported as
        # a missing model.
        sys.exit(
            "Model with vector dimension {} and {} clusters not found.".format(
                w2v_dim, n_clusters))

    model_report(db, _friendly_patents, w2v, kmeans, outdir)