Example #1
def main_both():
    """ Runs the GPE calculation for the whole population, for each docvec trait."""
    db = MongoClient().patents
    oneweek = timedelta(days=7)
    mindate = list(
        db.traits.find({
            'isd': {
                '$exists': True
            }
        }).sort('isd', 1).limit(1))[0]['isd']
    maxdate = list(
        db.traits.find({
            'isd': {
                '$exists': True
            }
        }).sort('isd', -1).limit(1))[0]['isd']
    old_ancestors = []
    gpes_tfidf = {}
    gpes_docvec = {}
    for (t1, _, _) in step_through_time(mindate, maxdate, oneweek):
        logging.info(
            "computing gpe for time {}, both tf-idf and w2v".format(t1))
        gpe_dict_w2v, new_ancestors = gpe_multi_threaded(
            t1, 'w2v', _docvec_traits, None, old_ancestors)
        gpe_dict_tfidf, _ = gpe_multi(
            t1, 'tf-idf', _tfidf_traits, None,
            old_ancestors)  # Multithreading is not worth the overhead for tf-idf.
        gpes_docvec[t1] = gpe_dict_w2v[t1]
        gpes_tfidf[t1] = gpe_dict_tfidf[t1]
        old_ancestors = old_ancestors + new_ancestors
    return gpes_tfidf, gpes_docvec
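
All of the examples on this page come from the same module, so they share a handful of imports and module-level names that are not repeated in each snippet. A minimal sketch of that shared context follows; the project-specific names are listed only as assumptions, since their definitions are not shown here.

import logging
from datetime import timedelta

from pymongo import MongoClient

# Names referenced by the examples but defined elsewhere in the project
# (their concrete implementations are assumptions, not shown on this page):
#   _docvec_traits, _tfidf_traits            -- lists of trait names per trait type
#   step_through_time(start, end, delta)     -- weekly time-stepping helper
#   gpe_multi(...), gpe_multi_threaded(...)  -- single- and multi-threaded GPE computation
#   get_populations(...), pickle_obj(...), dt_as_str(...)  -- helpers used in Example #5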
Example #2
def main_docvec():
    """ Runs the GPE calculation for the whole population, for each docvec trait."""
    db = MongoClient().patents
    oneweek = timedelta(days=7)
    mindate = list(
        db.traits.find({
            'isd': {
                '$exists': True
            }
        }).sort('isd', 1).limit(1))[0]['isd']
    maxdate = list(
        db.traits.find({
            'isd': {
                '$exists': True
            }
        }).sort('isd', -1).limit(1))[0]['isd']
    trait_type = 'w2v'
    traits = _docvec_traits
    old_ancestors = []
    gpes = {}
    for (t1, _, _) in step_through_time(mindate, maxdate, oneweek):
        logging.info("computing gpe for time {}".format(t1))
        gpe_dict, new_ancestors = gpe_multi(t1, trait_type, traits, None,
                                            old_ancestors)
        gpes[t1] = gpe_dict[t1]
        old_ancestors = old_ancestors + new_ancestors
    return gpes
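
A hypothetical driver for main_docvec, assuming the pickle_obj helper from Example #5 is available; the output filename is made up for illustration.

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    gpes = main_docvec()
    # Persist the per-week GPE dictionaries; 'gpes_docvec.p' is a hypothetical filename.
    pickle_obj('gpes_docvec.p', gpes)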
Example #3
def test_docvec(time_limit=20, pop_limit=50000):
    """ Runs the GPE calculation on a subset of each population in the time window 1976-2014, for the tf-idf trait 'dna'. """
    db = MongoClient().patents
    oneweek = timedelta(days=7)
    mindate = list(
        db.traits.find({
            'isd': {
                '$exists': True
            }
        }).sort('isd', 1).limit(1))[0]['isd']
    maxdate = list(
        db.traits.find({
            'isd': {
                '$exists': True
            }
        }).sort('isd', -1).limit(1))[0]['isd']
    trait_type = 'w2v'
    traits = _docvec_traits
    old_ancestors = []
    gpes = {}
    for (t1, _, _) in step_through_time(mindate, maxdate,
                                        oneweek)[:time_limit]:
        logging.info("computing gpe for time {}".format(t1))
        gpe_dict, new_ancestors = gpe_multi(t1, trait_type, traits, pop_limit,
                                            old_ancestors)
        gpes[t1] = gpe_dict[t1]
        old_ancestors = old_ancestors + new_ancestors
    return gpes
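
The [:time_limit] slice above implies that step_through_time returns a list rather than a generator, and the tuple unpacking shows it yields triples of consecutive time points. A minimal sketch consistent with those call sites (the real implementation is not shown on this page and may differ):

from datetime import timedelta

def step_through_time(start, end, delta=timedelta(days=7)):
    """ Return a list of (t - delta, t, t + delta) triples covering [start, end]. """
    triples = []
    t = start + delta
    while t + delta <= end:
        triples.append((t - delta, t, t + delta))
        t += delta
    return triples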
Example #4
def tester_mp():
    """ Tests the multithreaded GPE calculation on the first 10 weekly time steps, for each docvec trait. """
    db = MongoClient().patents
    oneweek = timedelta(days=7)
    mindate = list(
        db.traits.find({
            'isd': {
                '$exists': True
            }
        }).sort('isd', 1).limit(1))[0]['isd']
    maxdate = list(
        db.traits.find({
            'isd': {
                '$exists': True
            }
        }).sort('isd', -1).limit(1))[0]['isd']
    trait_type = 'w2v'
    traits = list(set(_docvec_traits))
    old_ancestors = []
    gpes = {}
    for (t1, _, _) in step_through_time(mindate, maxdate, oneweek)[:10]:
        logging.info("computing gpe for time {}".format(t1))
        gpe_dict, new_ancestors = gpe_multi_threaded(t1, trait_type, traits,
                                                     None, old_ancestors)
        gpes[t1] = gpe_dict[t1]
        old_ancestors = old_ancestors + new_ancestors
    return gpes
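
From the call sites in the examples above, gpe_multi and gpe_multi_threaded appear to share the same signature and return value. A stub of that assumed contract (the actual implementations are defined elsewhere in the project):

def gpe_multi(t1, trait_type, traits, pop_limit, old_ancestors):
    """ Assumed contract, inferred from the call sites:
    returns (gpe_dict, new_ancestors), where gpe_dict maps the time step t1 to
    its per-trait GPE values and new_ancestors lists the individuals to add to
    the ancestral population for later time steps. pop_limit=None means the
    whole population is used; gpe_multi_threaded takes the same arguments.
    """
    raise NotImplementedError("implemented elsewhere in the project")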
Example #5
def dump_populations(db,
                     start,
                     end,
                     outdir,
                     delta=timedelta(days=7),
                     lim=100,
                     debug=True):
    """ Step through time, maintaining the known ancestral population at each time step. Save each 
    set of populations as a pickled dictionary."""
    for (tm1, t, tp1) in step_through_time(start, end, delta):
        new_ancestors, descendants = get_populations(db, tm1, t, tp1, lim)
        precompute_doc = {
            '_id': tm1,
            'new_ancestors': list(new_ancestors),
            'descendants': list(descendants)
        }
        if debug:
            precompute_doc['new_ancestors'] = len(
                precompute_doc['new_ancestors'])
            precompute_doc['descendants'] = len(precompute_doc['descendants'])
            pprint(precompute_doc)
        else:
            popfn = '/'.join([outdir, dt_as_str(tm1) + '.p'])
            print "pickling population for time {} as {}".format(tm1, popfn)
            print "#new ancestors: {}, #descendants:{}".format(
                len(precompute_doc['new_ancestors']),
                len(precompute_doc['descendants']))
            pickle_obj(popfn, precompute_doc)
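
A hypothetical invocation of dump_populations, reusing the min/max 'isd' query from the earlier examples; the output directory is made up for illustration.

db = MongoClient().patents
mindate = list(db.traits.find({'isd': {'$exists': True}}).sort('isd', 1).limit(1))[0]['isd']
maxdate = list(db.traits.find({'isd': {'$exists': True}}).sort('isd', -1).limit(1))[0]['isd']
# debug=False actually writes the pickles; '/tmp/populations' is a hypothetical path.
dump_populations(db, mindate, maxdate, '/tmp/populations', debug=False)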