def test(limit=100):
    # Get filenames.
    indir = '/Users/jmenick/Desktop/alife_refactor/output/lda_model_200'
    name = 'lda_200'
    pnofn = '/'.join([indir, 'pnos.p'])
    ldafn = '/'.join([indir, name+'.lda'])
    corpusfn = '/'.join([indir, 'corpus_'+name+'.svmlight'])
    vocabfn = '/'.join([indir, 'vocab_'+name+'.dict'])

    # Load persisted data from disk. 
    print "loading data..."
    vocab = load_vocab(vocabfn)
    corpus = load_corpus(corpusfn)
    lda = load_lda(ldafn)
    pnos = load_obj(pnofn)
    pno2id = {p: i for i, p in enumerate(pnos)}

    #produce visualization... commented out for now. keeps crashing the machine. 
#    print "producing visualization..."
#    visfn = '/'.join([indir, 'vis.html'])
#    vis_data = prepare(lda, corpus, vocab)
#    print "saving visualization..."
#    pyLDAvis.save_html(vis_data, visfn)

    # put doc topics in db. 
    print "Getting doc topics..."
    assert(len(corpus) == len(pnos))
    db = MongoClient().patents
    def partfunc(doc):
        topics = lda[corpus[pno2id[doc['_id']]]]
        return {'$set': {'lda_topics': topics}}
    pats_test = db.traits.find().limit(limit)
    for p in pats_test:
        pprint(partfunc(p))
    print "\nDone."
Example 2
def load_pops(start_time, limit = None):
    date_str = dt_as_str(start_time)
    popfn = '/'.join([_pop_dir, date_str+'.p'])
    doc = load_obj(popfn)
    if limit is None:
        return doc['new_ancestors'], doc['descendants']
    else:
        return doc['new_ancestors'][:limit], doc['descendants'][:limit]
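# Hypothetical usage sketch (not part of the original): cap each population at
# 500 patents; passing limit=None returns the full lists, since slicing with
# None keeps every element.
#   new_ancestors, descendants = load_pops(start_time, limit=500)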
Example 4
def main():
    db = MongoClient().patents
    family_names = [
        "stents",
        "zeolites",
        "bubblejet",
        "cellphone",
        "pcr",
        "microarrays",
        "semiconductors",
        "nonwovenwebs",
        "rsa",
        "browser",
    ]
    family_pnos = [4655771, 4061724, 4723129, 5103459, 4683202, 5143854, 4064521, 4340563, 4405829, 5572643]
    family_thresholds = [350, 60, 75, 225, 150, 175, 125, 100, 400, 250]
    lilfriend_names = [
        "skate",
        "murphybed",
        "hummingbirdfeeder",
        "telescopicumbrella",
        "hybridengine",
        "minesweeper",
        "humanoidrobot",
        "recumbentbike",
        "hangglider",
        "ziplock",
    ]
    lilfriend_pnos = [6000704, 4766623, 5454348, 4880023, 5191766, 3938459, 6377014, 5284351, 4417707, 6004032]
    lilfriend_thresholds = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    bigfriend_names = [
        "dentallaser",
        "ballisticvest",
        "hungryhippos",
        "sharkprod",
        "gatlinggun",
        "nuclearwastetreatment",
        "gfp",
        "roughterrainchasis",
        "bowflex",
        "radaraltimeter",
    ]
    bigfriend_pnos = [5616141, 4287607, 4119312, 4667431, 4154143, 4274976, 5491084, 4061199, 4725057, 4945360]
    bigfriend_thresholds = [25, 25, 10, 12, 8, 9, 25, 30, 15, 10]
    names = family_names + lilfriend_names + bigfriend_names
    pnos = family_pnos + lilfriend_pnos + bigfriend_pnos
    thresholds = family_thresholds + lilfriend_thresholds + bigfriend_thresholds
    for pno, threshold, name in zip(pnos, thresholds, names):
        print "getting lineage for patent {} ({}), with threhold {}.".format(pno, name, threshold)
        lineage = crawl_lineage(
            db, pno, n_generations=5, enforce_func=lambda x: len(x.get("citedby", [])) >= threshold, flatten=True
        )
        adj = subnet_adj_dict(lineage)
        dict_fn = "/Users/jmenick/Desktop/sandbox/jacobs_pca_dicts/{}_pca_dict.p".format(name)
        colordict = load_obj(dict_fn)
        savefn = "{}_{}_force_pca_test.pdf".format(pno, name)
        network_plot(pno, adj, colordict, False, savefn)
        print "done with {}".format(name)
Example 5
def test2():
    db = MongoClient().patents
    family_names = [
        'stents', 'zeolites', 'bubblejet', 'cellphone', 'pcr', 'microarrays',
        'semiconductors', 'nonwovenwebs', 'rsa', 'browser'
    ]
    family_pnos = [
        4655771, 4061724, 4723129, 5103459, 4683202, 5143854, 4064521, 4340563,
        4405829, 5572643
    ]
    family_thresholds = [350, 60, 75, 225, 150, 175, 125, 100, 400, 250]
    lilfriend_names = [
        'skate', 'murphybed', 'hummingbirdfeeder', 'telescopicumbrella',
        'hybridengine', 'minesweeper', 'humanoidrobot', 'recumbentbike',
        'hangglider', 'ziplock'
    ]
    lilfriend_pnos = [
        6000704, 4766623, 5454348, 4880023, 5191766, 3938459, 6377014, 5284351,
        4417707, 6004032
    ]
    lilfriend_thresholds = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    bigfriend_names = [
        'dentallaser', 'ballisticvest', 'hungryhippos', 'sharkprod',
        'gatlinggun', 'nuclearwastetreatment', 'gfp', 'roughterrainchasis',
        'bowflex', 'radaraltimeter'
    ]
    bigfriend_pnos = [
        5616141, 4287607, 4119312, 4667431, 4154143, 4274976, 5491084, 4061199,
        4725057, 4945360
    ]
    bigfriend_thresholds = [25, 25, 10, 12, 8, 9, 25, 30, 15, 10]
    names = family_names + lilfriend_names + bigfriend_names
    pnos = family_pnos + lilfriend_pnos + bigfriend_pnos
    thresholds = family_thresholds + lilfriend_thresholds + bigfriend_thresholds
    names = names[:1]
    pnos = pnos[:1]
    thresholds = thresholds[:1]
    for pno, threshold, name in zip(pnos, thresholds, names):
        print "getting lineage for patent {} ({}), with threhold {}.".format(
            pno, name, threshold)
        lineage = crawl_lineage(
            db,
            pno,
            n_generations=5,
            enforce_func=lambda x: len(x.get('citedby', [])) >= threshold,
            flatten=True)
        adj = subnet_adj_dict(lineage)
        dict_fn = '/Users/jmenick/Desktop/sandbox/jacobs_pca_dicts/{}_pca_dict.p'.format(
            name)
        colordict = load_obj(dict_fn)
        savefn = '{}_{}_force_pca_test.pdf'.format(pno, name)
        print "getting plot..."
        network_plot(pno, adj, colordict, False, savefn)
        print "done with {}".format(name)
Example 6
def _load_df():
    """ Loads a dictionary of document frequencies from disk. Assumes It lives at a particular location on disk. 
    TODO: throw this function (and all such functions) into alife.data
    """
    _data_dir = '/Users/jmenick/Desktop/alife_refactor/output/aggregate_stats'
    _df_fn = '/'.join([_data_dir, 'tfidf_doc_freq.p'])
    try:
        df_dict = load_obj(_df_fn)
    except:
        raise RuntimeError("A document frequency dictionary is not stored in {}.".format(_df_fn))
    return sorted(df_dict.items() ,key = lambda x: x[1])
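# Hypothetical usage sketch (not part of the original): the items come back
# sorted ascending by document frequency, so the head of the list holds the
# rarest terms.
#   rarest_terms = _load_df()[:10]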
def main():
    # Get filenames.
    indir = '/Users/jmenick/Desktop/alife_refactor/output/lda_model_200'
    name = 'lda_200'
    pnofn = '/'.join([indir, 'pnos.p'])
    ldafn = '/'.join([indir, name + '.lda'])
    corpusfn = '/'.join([indir, 'corpus_' + name + '.svmlight'])
    vocabfn = '/'.join([indir, 'vocab_' + name + '.dict'])

    # Load persisted data from disk.
    print "loading data..."
    vocab = load_vocab(vocabfn)
    corpus = load_corpus(corpusfn)
    lda = load_lda(ldafn)
    pnos = load_obj(pnofn)
    pno2id = {p: i for i, p in enumerate(pnos)}

    #produce visualization... commented out for now due to crashing. Ugh PCA again...
    #    visfn = '/'.join([indir, 'vis.html'])
    #    vis_data = prepare(lda, corpus, vocab)
    #    pyLDAvis.save_html(vis_data, visfn)

    # put doc topics in db.
    print "inserting doc topics..."
    db = MongoClient().patents
    print "len(corpus): {}, len(pnos): {}".format(len(pnos), len(corpus))

    def partfunc(doc):
        pno = doc['_id']
        try:
            corpus_idx = pno2id[pno]
            bow = corpus[corpus_idx]
            topics = lda[bow]
            return {'$set': {'lda_topics': topics}}
        except Exception:
            logging.warning("no topics for {}".format(pno))
            return {'$set': {'no_topics': True}}

    parallelMap(partfunc,
                in_collection=db.traits,
                out_collection=db.traits,
                findArgs={
                    'spec': {},
                    'fields': {
                        '_id': 1
                    }
                },
                bSize=1000,
                updateFreq=500)
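    # For reference, a minimal serial sketch of what the call above is assumed to
    # do (parallelMap's actual internals are not shown here): apply each partial
    # update document returned by partfunc to the matching patent, using the
    # pymongo 2-style update call implied by the 'spec'/'fields' findArgs.
    #    for doc in db.traits.find({}, {'_id': 1}):
    #        db.traits.update({'_id': doc['_id']}, partfunc(doc))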
Example 8
def test():
    db = MongoClient().patents
    pno = 4723129
    threshold = 75
    print "getting lineage..."
    lineage = crawl_lineage(
        db, pno, n_generations=5, enforce_func=lambda x: len(x.get("citedby", [])) >= threshold, flatten=True
    )
    adj = subnet_adj_dict(lineage)
    bubblejet_color_dict_fn = "/Users/jmenick/Desktop/sandbox/jacobs_pca_dicts/bubblejet_pca_dict.p"
    bubblejet_colors = load_obj(bubblejet_color_dict_fn)
    savefn = "{}_force_pca_test.pdf".format(pno)
    print "making plot..."
    network_plot(pno, adj, bubblejet_colors, True, savefn)
    return adj, bubblejet_colors
def main():
    # Get filenames. 
    indir = '/Users/jmenick/Desktop/alife_refactor/output/lda_model_200'
    name = 'lda_200'
    pnofn = '/'.join([indir, 'pnos.p'])
    ldafn = '/'.join([indir, name+'.lda'])
    corpusfn = '/'.join([indir, 'corpus_'+name+'.svmlight'])
    vocabfn = '/'.join([indir, 'vocab_'+name+'.dict'])
        
    # Load persisted data from disk.
    print "loading data..."
    vocab = load_vocab(vocabfn)
    corpus = load_corpus(corpusfn)
    lda = load_lda(ldafn)
    pnos = load_obj(pnofn)
    pno2id = {p: i for i, p in enumerate(pnos)}

    #produce visualization... commented out for now due to crashing. Ugh PCA again...
#    visfn = '/'.join([indir, 'vis.html'])
#    vis_data = prepare(lda, corpus, vocab)
#    pyLDAvis.save_html(vis_data, visfn)

    # put doc topics in db. 
    print "inserting doc topics..."
    db = MongoClient().patents
    print "len(corpus): {}, len(pnos): {}".format(len(pnos), len(corpus))
    def partfunc(doc):
        pno = doc['_id']
        try: 
            corpus_idx = pno2id[pno]
            bow = corpus[corpus_idx]
            topics = lda[bow]
            return {'$set': {'lda_topics': topics}}
        except Exception:
            logging.warning("no topics for {}".format(pno))
            return {'$set': {'no_topics': True}}
    parallelMap(
        partfunc,
        in_collection = db.traits,
        out_collection = db.traits,
        findArgs = {
            'spec': {},
            'fields': {'_id':1}
        },
        bSize = 1000,
        updateFreq = 500
    )
Example 10
def test():
    db = MongoClient().patents
    pno = 4723129
    threshold = 75
    print "getting lineage..."
    lineage = crawl_lineage(
        db,
        pno,
        n_generations=5,
        enforce_func=lambda x: len(x.get('citedby', [])) >= threshold,
        flatten=True)
    adj = subnet_adj_dict(lineage)
    bubblejet_color_dict_fn = '/Users/jmenick/Desktop/sandbox/jacobs_pca_dicts/bubblejet_pca_dict.p'
    bubblejet_colors = load_obj(bubblejet_color_dict_fn)
    savefn = '{}_force_pca_test.pdf'.format(pno)
    print "making plot..."
    network_plot(pno, adj, bubblejet_colors, True, savefn)
    return adj, bubblejet_colors
def test(limit=100):
    # Get filenames.
    indir = '/Users/jmenick/Desktop/alife_refactor/output/lda_model_200'
    name = 'lda_200'
    pnofn = '/'.join([indir, 'pnos.p'])
    ldafn = '/'.join([indir, name + '.lda'])
    corpusfn = '/'.join([indir, 'corpus_' + name + '.svmlight'])
    vocabfn = '/'.join([indir, 'vocab_' + name + '.dict'])

    # Load persisted data from disk.
    print "loading data..."
    vocab = load_vocab(vocabfn)
    corpus = load_corpus(corpusfn)
    lda = load_lda(ldafn)
    pnos = load_obj(pnofn)
    pno2id = {p: i for i, p in enumerate(pnos)}

    #produce visualization... commented out for now. keeps crashing the machine.
    #    print "producing visualization..."
    #    visfn = '/'.join([indir, 'vis.html'])
    #    vis_data = prepare(lda, corpus, vocab)
    #    print "saving visualization..."
    #    pyLDAvis.save_html(vis_data, visfn)

    # put doc topics in db.
    print "Getting doc topics..."
    assert (len(corpus) == len(pnos))
    db = MongoClient().patents

    def partfunc(doc):
        topics = lda[corpus[pno2id[doc['_id']]]]
        return {'$set': {'lda_topics': topics}}

    pats_test = db.traits.find().limit(limit)
    for p in pats_test:
        pprint(partfunc(p))
    print "\nDone."
Example 12
def test():
    gpes = load_obj('gpes_tfidf.p')
    plot_gpe(gpes, savefn='gpes_tfidf_test.pdf')
Example 14
def load_anc_dec(start_date, indir):
    """ Load two lists of patents; ancestral and descendant poulation respectively. """
    filename = '/'.join([indir, dt_as_str(start_date)+'.p'])
    pop_dict = load_obj(filename)
    assert(start_date == pop_dict['start']) # Make sure the date we think we're loading matches the stored date.
    return pop_dict['ancestors'], pop_dict['descendants']
Example 15
def load_pop(start_date):
    """ Load a list of patents (dictionaries) occuring in the month following the given start_date."""
    filename = '/'.join([_pop_dir, dt_as_str(start_date)+'.p'])
    pop_dict = load_obj(filename)
    assert(start_date == pop_dict['start']) # Make sure the date we think we're loading matches the stored date.
    return pop_dict['descendants']
Example 16
from alife.util.general import load_obj
import sys
import os
import csv

if __name__ == '__main__':
    if len(sys.argv) != 2:
        sys.exit("Usage: python {} <path to .p file>".format(sys.argv[0]))
    else:
        infn = sys.argv[1]
    inbase = os.path.basename(infn)
    outfn = inbase.split('.')[0] + '.csv'

    gpes = load_obj(infn)
    with open(outfn, 'wb') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['trait', 'time_step', 't1', 't2', 't3', 'total'])
        for trait, series in gpes.items():
            for step, term_list in enumerate(series):
                writer.writerow([trait, step] + list(term_list))
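# Assumed input shape (hypothetical example, not taken from the original): gpes
# maps each trait to a per-time-step sequence whose entries fill the t1, t2, t3
# and total columns, e.g.
#   gpes = {'stent': [(0.1, 0.2, 0.3, 0.6), (0.0, 0.4, 0.1, 0.5)]}
# would produce the rows:
#   stent,0,0.1,0.2,0.3,0.6
#   stent,1,0.0,0.4,0.1,0.5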