def main_both():
    """Run the GPE calculation over the whole population at every weekly
    time step, for both the docvec (w2v) traits and the tf-idf traits.

    Returns a pair ``(gpes_tfidf, gpes_docvec)`` of dicts keyed by time step.
    """
    db = MongoClient().patents
    week = timedelta(days=7)
    # Earliest and latest issue dates present in the traits collection.
    first_isd = list(db.traits.find({'isd': {'$exists': True}})
                     .sort('isd', 1).limit(1))[0]['isd']
    last_isd = list(db.traits.find({'isd': {'$exists': True}})
                    .sort('isd', -1).limit(1))[0]['isd']
    ancestors_so_far = []
    gpes_tfidf, gpes_docvec = {}, {}
    for t1, _, _ in step_through_time(first_isd, last_isd, week):
        logging.info(
            "computing gpe for time {}, both tf-idf and w2v".format(t1))
        w2v_gpe, fresh_ancestors = gpe_multi_threaded(
            t1, 'w2v', _docvec_traits, None, ancestors_so_far)
        # multithreading Not worth overhead for tfidf.
        tfidf_gpe, _ = gpe_multi(
            t1, 'tf-idf', _tfidf_traits, None, ancestors_so_far)
        gpes_docvec[t1] = w2v_gpe[t1]
        gpes_tfidf[t1] = tfidf_gpe[t1]
        ancestors_so_far = ancestors_so_far + fresh_ancestors
    return gpes_tfidf, gpes_docvec
def main_docvec():
    """Run the GPE calculation for the whole population, for each docvec
    (w2v) trait, stepping one week at a time over the full date range.

    Returns a dict mapping each time step to its per-trait GPE results.
    """
    db = MongoClient().patents
    week = timedelta(days=7)
    # Bounds of the time window: min/max issue dates in the traits collection.
    earliest = list(db.traits.find({'isd': {'$exists': True}})
                    .sort('isd', 1).limit(1))[0]['isd']
    latest = list(db.traits.find({'isd': {'$exists': True}})
                  .sort('isd', -1).limit(1))[0]['isd']
    known_ancestors = []
    results = {}
    for t1, _, _ in step_through_time(earliest, latest, week):
        logging.info("computing gpe for time {}".format(t1))
        step_gpe, fresh = gpe_multi(t1, 'w2v', _docvec_traits,
                                    None, known_ancestors)
        results[t1] = step_gpe[t1]
        known_ancestors = known_ancestors + fresh
    return results
def test_docvec(time_limit=20, pop_limit=50000):
    """Run the GPE calculation on a subset of each population, for the
    docvec (w2v) traits, limited to the first `time_limit` weekly steps.

    NOTE(review): the original docstring claimed this ran "for the tf-idf
    trait 'dna'", but the code computes w2v over `_docvec_traits` — the
    docstring has been corrected; the code is unchanged.

    time_limit -- number of weekly time steps to process
    pop_limit  -- per-step population cap forwarded to gpe_multi

    Returns a dict mapping each processed time step to its GPE results.
    """
    db = MongoClient().patents
    oneweek = timedelta(days=7)
    # Min/max issue dates bound the overall time window.
    mindate = list(
        db.traits.find({
            'isd': {'$exists': True}
        }).sort('isd', 1).limit(1))[0]['isd']
    maxdate = list(
        db.traits.find({
            'isd': {'$exists': True}
        }).sort('isd', -1).limit(1))[0]['isd']
    trait_type = 'w2v'
    traits = _docvec_traits
    old_ancestors = []
    gpes = {}
    for (t1, _, _) in step_through_time(mindate, maxdate, oneweek)[:time_limit]:
        logging.info("computing gpe for time {}".format(t1))
        gpe_dict, new_ancestors = gpe_multi(t1, trait_type, traits, pop_limit,
                                            old_ancestors)
        gpes[t1] = gpe_dict[t1]
        old_ancestors = old_ancestors + new_ancestors
    return gpes
def tester_mp():
    """Smoke-test the multithreaded GPE path on the first ten weekly steps,
    using the deduplicated docvec trait list.

    Returns a dict mapping each processed time step to its GPE results.
    """
    db = MongoClient().patents
    week = timedelta(days=7)
    lo = list(db.traits.find({'isd': {'$exists': True}})
              .sort('isd', 1).limit(1))[0]['isd']
    hi = list(db.traits.find({'isd': {'$exists': True}})
              .sort('isd', -1).limit(1))[0]['isd']
    # Deduplicate the trait list before fanning out to threads.
    unique_traits = list(set(_docvec_traits))
    ancestors = []
    out = {}
    for t1, _, _ in step_through_time(lo, hi, week)[:10]:
        logging.info("computing gpe for time {}".format(t1))
        gpe_for_step, fresh = gpe_multi_threaded(t1, 'w2v', unique_traits,
                                                 None, ancestors)
        out[t1] = gpe_for_step[t1]
        ancestors = ancestors + fresh
    return out
def dump_populations(db, start, end, outdir, delta=timedelta(days=7), lim=100, debug=True): """ Step through time, maintaining the known ancestral population at each time step. Save each set of populations as a pickled dictionary.""" for (tm1, t, tp1) in step_through_time(start, end): new_ancestors, descendants = get_populations(db, tm1, t, tp1, lim) precompute_doc = { '_id': tm1, 'new_ancestors': list(new_ancestors), 'descendants': list(descendants) } if debug: precompute_doc['new_ancestors'] = len( precompute_doc['new_ancestors']) precompute_doc['descendants'] = len(precompute_doc['descendants']) pprint(precompute_doc) else: popfn = '/'.join([outdir, dt_as_str(tm1) + '.p']) print "pickling population for time {} as {}".format(tm1, popfn) print "#new ancestors: {}, #descendants:{}".format( len(precompute_doc['new_ancestors']), len(precompute_doc['descendants'])) pickle_obj(popfn, precompute_doc)
def dump_populations(db, start, end, outdir, delta=timedelta(days=7),lim=100, debug=True): """ Step through time, maintaining the known ancestral population at each time step. Save each set of populations as a pickled dictionary.""" for (tm1, t, tp1) in step_through_time(start, end): new_ancestors, descendants = get_populations(db, tm1, t, tp1, lim) precompute_doc = {'_id': tm1, 'new_ancestors': list(new_ancestors), 'descendants': list(descendants)} if debug: precompute_doc['new_ancestors'] = len(precompute_doc['new_ancestors']) precompute_doc['descendants'] = len(precompute_doc['descendants']) pprint(precompute_doc) else: popfn = '/'.join([outdir, dt_as_str(tm1)+'.p']) print "pickling population for time {} as {}".format(tm1, popfn) print "#new ancestors: {}, #descendants:{}".format(len(precompute_doc['new_ancestors']), len(precompute_doc['descendants'])) pickle_obj(popfn, precompute_doc)
def tester_mp():
    """Exercise the multithreaded GPE computation on the first ten weekly
    time steps, over the deduplicated docvec trait list.

    Returns a dict keyed by time step with each step's GPE results.
    """
    db = MongoClient().patents
    step = timedelta(days=7)
    date_filter = {'isd': {'$exists': True}}
    start = list(db.traits.find(date_filter).sort('isd', 1).limit(1))[0]['isd']
    stop = list(db.traits.find(date_filter).sort('isd', -1).limit(1))[0]['isd']
    # Drop duplicate trait names before dispatching work.
    trait_list = list(set(_docvec_traits))
    accumulated = []
    gpe_by_step = {}
    for t1, _, _ in step_through_time(start, stop, step)[:10]:
        logging.info("computing gpe for time {}".format(t1))
        step_result, newly_found = gpe_multi_threaded(
            t1, 'w2v', trait_list, None, accumulated)
        gpe_by_step[t1] = step_result[t1]
        accumulated = accumulated + newly_found
    return gpe_by_step
def main_docvec():
    """Run the GPE calculation for the full population over every weekly
    time step, for each docvec (w2v) trait.

    Returns a dict mapping time step -> per-trait GPE results.
    """
    db = MongoClient().patents
    one_week = timedelta(days=7)
    has_isd = {'isd': {'$exists': True}}
    # Time window bounds: smallest and largest issue dates on record.
    begin = list(db.traits.find(has_isd).sort('isd', 1).limit(1))[0]['isd']
    finish = list(db.traits.find(has_isd).sort('isd', -1).limit(1))[0]['isd']
    seen_ancestors = []
    gpe_results = {}
    for t1, _, _ in step_through_time(begin, finish, one_week):
        logging.info("computing gpe for time {}".format(t1))
        per_step, additions = gpe_multi(t1, 'w2v', _docvec_traits,
                                        None, seen_ancestors)
        gpe_results[t1] = per_step[t1]
        seen_ancestors = seen_ancestors + additions
    return gpe_results
def test_docvec(time_limit=20, pop_limit=50000):
    """Run the GPE calculation on a capped subset of each population, for
    the docvec (w2v) traits, over the first `time_limit` weekly steps.

    NOTE(review): the original docstring said "for the tf-idf trait 'dna'",
    which contradicts the code (w2v over `_docvec_traits`); only the
    documentation has been corrected.

    time_limit -- number of weekly time steps to process
    pop_limit  -- per-step population cap forwarded to gpe_multi

    Returns a dict mapping each processed time step to its GPE results.
    """
    db = MongoClient().patents
    oneweek = timedelta(days=7)
    # Overall window bounds from the traits collection.
    mindate = list(db.traits.find({'isd': {'$exists': True}})
                   .sort('isd', 1).limit(1))[0]['isd']
    maxdate = list(db.traits.find({'isd': {'$exists': True}})
                   .sort('isd', -1).limit(1))[0]['isd']
    trait_type = 'w2v'
    traits = _docvec_traits
    old_ancestors = []
    gpes = {}
    for (t1, _, _) in step_through_time(mindate, maxdate, oneweek)[:time_limit]:
        logging.info("computing gpe for time {}".format(t1))
        gpe_dict, new_ancestors = gpe_multi(t1, trait_type, traits,
                                            pop_limit, old_ancestors)
        gpes[t1] = gpe_dict[t1]
        old_ancestors = old_ancestors + new_ancestors
    return gpes
def main_both():
    """Compute GPE for the whole population at every weekly step, covering
    both the w2v docvec traits and the tf-idf traits in one pass.

    Returns (gpes_tfidf, gpes_docvec), each a dict keyed by time step.
    """
    db = MongoClient().patents
    weekly = timedelta(days=7)
    isd_exists = {'isd': {'$exists': True}}
    lo_date = list(db.traits.find(isd_exists).sort('isd', 1).limit(1))[0]['isd']
    hi_date = list(db.traits.find(isd_exists).sort('isd', -1).limit(1))[0]['isd']
    ancestry = []
    tfidf_out = {}
    docvec_out = {}
    for t1, _, _ in step_through_time(lo_date, hi_date, weekly):
        logging.info(
            "computing gpe for time {}, both tf-idf and w2v".format(t1))
        docvec_step, new_found = gpe_multi_threaded(
            t1, 'w2v', _docvec_traits, None, ancestry)
        # multithreading Not worth overhead for tfidf.
        tfidf_step, _ = gpe_multi(t1, 'tf-idf', _tfidf_traits, None, ancestry)
        docvec_out[t1] = docvec_step[t1]
        tfidf_out[t1] = tfidf_step[t1]
        ancestry = ancestry + new_found
    return tfidf_out, docvec_out