def test(n_docs=100):
    """Pretty-print the update document for up to ``n_docs`` trait records.

    Opens the ``patents`` database on a default ``MongoClient``, loads the
    models with ``model_loader(300, 200)`` and, for every matching record in
    ``traits``, prints the ``$set`` update that would store the result of
    ``cluster_distances`` under ``wordvec_clusters``. Nothing is written back.

    Args:
        n_docs: Maximum number of records fetched from the cursor.
    """
    patents_db = MongoClient().patents
    word_vectors, clusterer = model_loader(300, 200)

    def build_update(record):
        """Return the ``$set`` update for one record, keyed by its ``_id``."""
        distances = cluster_distances(patents_db, record['_id'],
                                      word_vectors, clusterer)
        return {'$set': {'wordvec_clusters': distances}}

    # Only records that have a doc_vec which is not the 300-element all-zero
    # list, and whose top_tf-idf is not an empty list.
    query = {
        'doc_vec': {'$exists': True, '$nin': [[0] * 300]},
        'top_tf-idf': {'$nin': [[]]},
    }
    for record in patents_db.traits.find(query).limit(n_docs):
        pprint(build_update(record))
Ejemplo n.º 2
0
def test():
    """Compute cluster distances for every entry of ``_friendly_patents``.

    Returns:
        A 4-tuple ``(w2v, kmeans, dists, parsed)`` where ``w2v`` and ``kmeans``
        are the two objects returned by ``model_loader(300, 200)``, ``dists``
        maps each name in ``_friendly_patents`` to the result of
        ``cluster_distances`` for its paired value, and ``parsed`` is the
        result of ``parse_clusters(kmeans, w2v)``.
    """
    patents_db = MongoClient().patents
    word_vectors, clusterer = model_loader(300, 200)

    # _friendly_patents yields (name, pno) pairs; later duplicates of a name
    # overwrite earlier ones, exactly as in a dict comprehension.
    distances = {}
    for friendly_name, pno in _friendly_patents:
        distances[friendly_name] = cluster_distances(
            patents_db, pno, word_vectors, clusterer)

    parsed = parse_clusters(clusterer, word_vectors)
    return word_vectors, clusterer, distances, parsed
Ejemplo n.º 3
0
def main():
    """Run ``part_func`` over the ``traits`` collection via ``parallelMap``.

    ``part_func`` produces a ``$set`` update that stores the result of
    ``cluster_distances`` under ``wordvec_clusters``. ``traits`` is passed as
    both the input and the output collection.
    NOTE(review): presumably parallelMap applies each returned update to the
    matching document in the output collection -- confirm against its source.
    """
    patents_db = MongoClient().patents
    word_vectors, clusterer = model_loader(300, 200)

    def part_func(doc):
        """Return the ``$set`` update for one document, keyed by its ``_id``."""
        distances = cluster_distances(patents_db, doc['_id'], word_vectors,
                                      clusterer)
        return {'$set': {'wordvec_clusters': distances}}

    # Select documents that have a doc_vec other than the 300-element
    # all-zero list and a top_tf-idf other than the empty list.
    selection = {
        'doc_vec': {'$exists': True, '$nin': [[0] * 300]},
        'top_tf-idf': {'$nin': [[]]},
    }
    # Only the _id is requested; part_func reads nothing else from the doc.
    projection = {'_id': 1}
    find_args = {'spec': selection, 'fields': projection}

    parallelMap(part_func,
                in_collection=patents_db.traits,
                out_collection=patents_db.traits,
                findArgs=find_args,
                updateFreq=500,
                bSize=1000)
Ejemplo n.º 4
0
def test():
    """Load the models and evaluate them on ``_friendly_patents``.

    Returns:
        ``(w2v, kmeans, dists, parsed_clusters)``: the two objects returned by
        ``model_loader(300, 200)``, a dict mapping each name in
        ``_friendly_patents`` to its ``cluster_distances`` result, and the
        result of ``parse_clusters(kmeans, w2v)``.
    """
    database = MongoClient().patents
    vectors, clusters = model_loader(300, 200)

    def distances_for(pno):
        """Return ``cluster_distances`` for one entry using the loaded models."""
        return cluster_distances(database, pno, vectors, clusters)

    per_patent = {}
    for label, pno in _friendly_patents:
        per_patent[label] = distances_for(pno)

    return vectors, clusters, per_patent, parse_clusters(clusters, vectors)
Ejemplo n.º 5
0
def test(n_docs=100):
    """Pretty-print the update document for the first ``n_docs`` trait records.

    Opens the ``patents`` database on a default ``MongoClient``, loads the
    models with ``model_loader(300, 200)`` and prints, for each record of
    ``traits`` (no filter), the ``$set`` update that would store the
    ``cluster_distances`` result under ``wordvec_clusters``. Nothing is
    written back to the database.

    Args:
        n_docs: Maximum number of records fetched from the cursor.
    """
    db = MongoClient().patents
    w2v, kmeans = model_loader(300, 200)

    def part_func(doc):
        """Return the ``$set`` update for ``doc``; fall back to ``[]`` on error."""
        try:
            distances = cluster_distances(db, doc["_id"], w2v, kmeans)
        except Exception:
            # Deliberate best-effort: a record that cannot be processed gets
            # an empty list. Catching Exception (not a bare except) lets
            # KeyboardInterrupt and SystemExit still stop the run.
            return {"$set": {"wordvec_clusters": []}}
        return {"$set": {"wordvec_clusters": distances}}

    for doc in db.traits.find().limit(n_docs):
        pprint(part_func(doc))
def main():
    """Compute ``wordvec_clusters`` updates for ``traits`` via ``parallelMap``.

    The worker callback returns a ``$set`` update holding the result of
    ``cluster_distances`` for the document's ``_id``. ``traits`` is used as
    both the input and the output collection.
    NOTE(review): how parallelMap consumes the returned update is not visible
    here -- confirm against its implementation.
    """
    patents_db = MongoClient().patents
    word_vectors, clusterer = model_loader(300, 200)

    def part_func(doc):
        """Return the ``$set`` update for one document."""
        return {
            '$set': {
                'wordvec_clusters': cluster_distances(
                    patents_db, doc['_id'], word_vectors, clusterer),
            },
        }

    # Exclude documents lacking doc_vec, documents whose doc_vec is the
    # 300-element all-zero list, and documents whose top_tf-idf is empty.
    zero_vector = [0] * 300
    find_args = {
        'spec': {
            'doc_vec': {'$exists': True, '$nin': [zero_vector]},
            'top_tf-idf': {'$nin': [[]]},
        },
        'fields': {'_id': 1},
    }

    parallelMap(
        part_func,
        in_collection=patents_db.traits,
        out_collection=patents_db.traits,
        findArgs=find_args,
        updateFreq=500,
        bSize=1000,
    )
Ejemplo n.º 7
0
def main():
    """Compute ``wordvec_clusters`` updates for every ``traits`` document.

    Loads the models with ``model_loader(300, 200)`` and hands ``part_func``
    to ``parallelMap`` with ``traits`` as both input and output collection and
    an empty ``spec``/``fields`` (no filter, no projection).
    NOTE(review): how parallelMap consumes the returned update is not visible
    here -- confirm against its implementation.
    """
    db = MongoClient().patents
    w2v, kmeans = model_loader(300, 200)

    def part_func(doc):
        """Return the ``$set`` update for ``doc``; fall back to ``[]`` on error."""
        try:
            distances = cluster_distances(db, doc["_id"], w2v, kmeans)
        except Exception:
            # Deliberate best-effort: a document that cannot be processed is
            # given an empty list instead of aborting the whole run. Catching
            # Exception (not a bare except) keeps KeyboardInterrupt and
            # SystemExit able to stop the job.
            return {"$set": {"wordvec_clusters": []}}
        return {"$set": {"wordvec_clusters": distances}}

    parallelMap(
        part_func,
        in_collection=db.traits,
        out_collection=db.traits,
        findArgs={"spec": {}, "fields": {}},
        updateFreq=500,
        bSize=1000,
    )
Ejemplo n.º 8
0
def test(n_docs=100):
    """Print the would-be ``wordvec_clusters`` update for a sample of records.

    Uses the ``patents`` database of a default ``MongoClient`` and the models
    returned by ``model_loader(300, 200)``. Each matching ``traits`` record is
    turned into a ``$set`` update and pretty-printed; the database is only
    read, never written.

    Args:
        n_docs: Limit applied to the ``traits`` cursor.
    """
    database = MongoClient().patents
    vectors, clusters = model_loader(300, 200)

    def make_update(entry):
        """Wrap the ``cluster_distances`` result for ``entry`` in a ``$set``."""
        return {
            '$set': {
                'wordvec_clusters':
                cluster_distances(database, entry['_id'], vectors, clusters),
            },
        }

    # doc_vec must exist and differ from the 300-element all-zero list.
    doc_vec_filter = {'$exists': True, '$nin': [[0] * 300]}
    # top_tf-idf must not be an empty list.
    tfidf_filter = {'$nin': [[]]}

    cursor = database.traits.find({
        'doc_vec': doc_vec_filter,
        'top_tf-idf': tfidf_filter,
    }).limit(n_docs)

    for entry in cursor:
        pprint(make_update(entry))
Ejemplo n.º 9
0
    dists = {name: cluster_distances(db, pno, w2v_model, cluster_model)
             for name,pno in pnos}
    cluster_parse_fn = '/'.join([outdir, 'parsed_clusters.p'])
    dist_fn = '/'.join([outdir, 'patent_dists.p'])
    tsne_fn = '/'.join([outdir, 'embedding_fig_tsne.png'])
    save_dict(cluster_parse_fn, parsed_clusters)
    save_dict(dist_fn, dists)
    embedding_fig(w2v_model, cluster_model, savefn = tsne_fn, n=150)

def test():
    """Evaluate the 300/200 models on ``_friendly_patents``.

    Returns:
        ``(w2v, kmeans, dists, parsed)``: the pair returned by
        ``model_loader(300, 200)``, a dict from each name in
        ``_friendly_patents`` to its ``cluster_distances`` result, and the
        value of ``parse_clusters(kmeans, w2v)``.
    """
    patents_db = MongoClient().patents
    loaded = model_loader(300, 200)
    word_vectors, clusterer = loaded

    distances = {}
    for entry in _friendly_patents:
        # Each entry must unpack into exactly (name, pno).
        (friendly_name, pno) = entry
        distances[friendly_name] = cluster_distances(
            patents_db, pno, word_vectors, clusterer)

    return word_vectors, clusterer, distances, parse_clusters(clusterer, word_vectors)
    
if __name__ == '__main__':
    # Command-line entry point: load the requested models and pass them to
    # model_report together with _friendly_patents and the output directory.
    usage = "Usage: python {} <wordvec_size> <n_clusters> <out directory>".format(sys.argv[0])
    if len(sys.argv) != 4:
        # sys.exit instead of the site-provided exit(), which is not
        # guaranteed to exist (e.g. under `python -S` or when frozen).
        sys.exit(usage)
    db = MongoClient().patents
    try:
        w2v_dim = int(sys.argv[1])
        n_clusters = int(sys.argv[2])
    except ValueError:
        # Non-integer sizes are a usage error, not a crash with a traceback.
        sys.exit(usage)
    outdir = sys.argv[3]
    try:
        w2v, kmeans = model_loader(w2v_dim, n_clusters)
    except Exception:
        # Catch Exception rather than everything, so that Ctrl-C during
        # loading is not misreported as a missing model.
        sys.exit("Model with vector dimension {} and {} clusters not found.".format(w2v_dim, n_clusters))
    model_report(db, _friendly_patents, w2v, kmeans, outdir)
    
Ejemplo n.º 10
0
    save_dict(cluster_parse_fn, parsed_clusters)
    save_dict(dist_fn, dists)
    embedding_fig(w2v_model, cluster_model, savefn=tsne_fn, n=150)


def test():
    """Load the 300/200 models and compute distances for ``_friendly_patents``.

    Returns:
        A tuple ``(w2v, kmeans, dists, parsed)`` made of the two objects from
        ``model_loader(300, 200)``, a name-to-``cluster_distances`` dict built
        from the ``(name, pno)`` pairs in ``_friendly_patents``, and the
        result of ``parse_clusters(kmeans, w2v)``.
    """
    database = MongoClient().patents
    vectors, clusters = model_loader(300, 200)

    results = {}
    for label, pno in _friendly_patents:
        results[label] = cluster_distances(database, pno, vectors, clusters)

    parsed = parse_clusters(clusters, vectors)
    return vectors, clusters, results, parsed


if __name__ == '__main__':
    # Command-line entry point: load the requested models and pass them to
    # model_report together with _friendly_patents and the output directory.
    usage = "Usage: python {} <wordvec_size> <n_clusters> <out directory>".format(
        sys.argv[0])
    if len(sys.argv) != 4:
        # sys.exit instead of the site-provided exit(), which is not
        # guaranteed to exist (e.g. under `python -S` or when frozen).
        sys.exit(usage)
    db = MongoClient().patents
    try:
        w2v_dim = int(sys.argv[1])
        n_clusters = int(sys.argv[2])
    except ValueError:
        # Non-integer sizes are a usage error, not a crash with a traceback.
        sys.exit(usage)
    outdir = sys.argv[3]
    try:
        w2v, kmeans = model_loader(w2v_dim, n_clusters)
    except Exception:
        # Catch Exception rather than everything, so that Ctrl-C during
        # loading is not misreported as a missing model.
        sys.exit(
            "Model with vector dimension {} and {} clusters not found.".format(
                w2v_dim, n_clusters))
    model_report(db, _friendly_patents, w2v, kmeans, outdir)