Example #1
def dump_populations(db,
                     start,
                     end,
                     outdir,
                     delta=timedelta(days=7),
                     lim=100,
                     debug=True):
    """ Step through time, maintaining the known ancestral population at each time step. Save each 
    set of populations as a pickled dictionary."""
    for (tm1, t, tp1) in step_through_time(start, end, delta):
        new_ancestors, descendants = get_populations(db, tm1, t, tp1, lim)
        precompute_doc = {
            '_id': tm1,
            'new_ancestors': list(new_ancestors),
            'descendants': list(descendants)
        }
        if debug:
            precompute_doc['new_ancestors'] = len(
                precompute_doc['new_ancestors'])
            precompute_doc['descendants'] = len(precompute_doc['descendants'])
            pprint(precompute_doc)
        else:
            popfn = '/'.join([outdir, dt_as_str(tm1) + '.p'])
            print "pickling population for time {} as {}".format(tm1, popfn)
            print "#new ancestors: {}, #descendants:{}".format(
                len(precompute_doc['new_ancestors']),
                len(precompute_doc['descendants']))
            pickle_obj(popfn, precompute_doc)
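Every example on this page calls a few shared helpers that are not shown (pickle_obj, dt_as_str, step_through_time). A minimal sketch of what they might look like, inferred only from how they are called here; the originals may differ:

import pickle
from datetime import timedelta

def pickle_obj(fn, obj):
    # Assumed helper: serialize obj to fn with pickle.
    with open(fn, 'wb') as outfile:
        pickle.dump(obj, outfile)

def dt_as_str(dt):
    # Assumed helper: filename-safe date string.
    return dt.strftime('%Y-%m-%d')

def step_through_time(start, end, delta=timedelta(days=7)):
    # Assumed helper: yield (t - delta, t, t + delta) windows covering [start, end].
    t = start + delta
    while t + delta <= end:
        yield (t - delta, t, t + delta)
        t += delta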
Example #2
def main(infn, outfn, verbose=True):
    trait_gpes = defaultdict(list)
    with open(infn, 'r') as infile:
        parsed = [x for x in (parseline(line) for line in infile) if x is not None]
    for trait, gpes in parsed:
        if verbose:
            print "{}: {}".format(trait, gpes)
        trait_gpes[trait].append(gpes)
    pickle_obj(outfn, trait_gpes)
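parseline is not shown; the comprehension above implies it returns a (trait, gpes) pair, or None for lines it cannot parse. A hypothetical version for a tab-separated input format (the real layout is not shown in these examples):

def parseline(line):
    # Hypothetical parser: 'trait<TAB>v1,v2,v3' -> (trait, (v1, v2, v3)), None if malformed.
    parts = line.strip().split('\t')
    if len(parts) != 2:
        return None
    trait, values = parts
    return trait, tuple(float(v) for v in values.split(','))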
Example #3
def get_and_save_community_colors(db, pnos, thresholds=None):
    community_colors_list = []
    if thresholds is None:
        thresholds = [0 for _ in pnos]
    for pno, threshold in zip(pnos, thresholds):
        viz_fn = 'viz_' + str(pno) + '.pdf'
        lookup_fn = 'lookup_' + str(pno) + '.p'
        color_lookup = community_colors(db, pno, threshold, show_vis=False, savefn=viz_fn)
        community_colors_list.append(color_lookup)
        pickle_obj(lookup_fn, color_lookup)
    return community_colors_list
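A hypothetical call, assuming db is the same MongoDB patents database used elsewhere on this page and pnos is a list of patent numbers (the values below are made up for illustration):

db = MongoClient().patents
# Thresholds default to 0 for every patent when omitted.
lookups = get_and_save_community_colors(db, pnos=[4723129, 5026349])
# Each color lookup is also pickled to lookup_<pno>.p, with the figure in viz_<pno>.pdf.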
Example #4
def save(self, outdir, just_lda=False):
    """ save all files"""
    if not just_lda:
        pnofn = '/'.join([outdir, 'pnos.p'])
        vocabfn = '/'.join([outdir, 'vocab_' + self.name + '.dict'])
        corpusfn = '/'.join([outdir, 'corpus_' + self.name + '.svmlight'])
        if self.pnos is not None:
            pickle_obj(pnofn, self.pnos)
        self.vocab.save(vocabfn)
        corpora.SvmLightCorpus.serialize(corpusfn, self.corpus)
    ldafn = '/'.join([outdir, self.name + '.lda'])
    self._lda_model.save(ldafn)
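The matching load path is not shown; a sketch of how these artifacts could be read back with gensim, assuming the same outdir and naming scheme used by save():

from gensim import corpora, models
import pickle

def load_saved(outdir, name):
    # Sketch: reload what save() wrote, under the same naming scheme.
    vocab = corpora.Dictionary.load('/'.join([outdir, 'vocab_' + name + '.dict']))
    corpus = corpora.SvmLightCorpus('/'.join([outdir, 'corpus_' + name + '.svmlight']))
    lda = models.LdaModel.load('/'.join([outdir, name + '.lda']))
    with open('/'.join([outdir, 'pnos.p']), 'rb') as infile:
        pnos = pickle.load(infile)
    return vocab, corpus, lda, pnos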
Example #5
def save(self, outdir, just_lda=False):
    """ save all files"""
    if not just_lda:
        pnofn = '/'.join([outdir, 'pnos.p'])
        vocabfn = '/'.join([outdir, 'vocab_' + self.name + '.dict'])
        corpusfn = '/'.join([outdir, 'corpus_' + self.name + '.svmlight'])
        if self.pnos is not None:
            pickle_obj(pnofn, self.pnos)
        self.vocab.save(vocabfn)
        corpora.SvmLightCorpus.serialize(corpusfn, self.corpus)
    ldafn = '/'.join([outdir, self.name + '.lda'])
    self._lda_model.save(ldafn)
Example #6
def test():
    db = MongoClient().patents
    all_pairs = db.just_cites.find()
    N = all_pairs.count() 
    # Get counters for each number of shared traits and store as pickled dict. 
    real_shares_pairs = (nshared_tfidf(*p) for p in get_cite_pairs(N))
    real_shares_pairs_counts = Counter(real_shares_pairs)
    real_N = sum(real_shares_pairs_counts.values())
    pickle_obj('real_shares_counter_pairs.p', dict(real_shares_pairs_counts))
    rand_shares_pairs = (nshared_tfidf(*p) for p in get_rand_pairs(N))
    rand_shares_pairs_counts = Counter(rand_shares_pairs)
    rand_N = sum(rand_shares_pairs_counts.values())
    pickle_obj('rand_shares_counter_pairs.p', dict(rand_shares_pairs_counts))
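The two pickled counters can then be compared as normalized distributions, e.g. (a sketch, assuming both pickles were written by test() above):

import pickle

with open('real_shares_counter_pairs.p', 'rb') as f:
    real_counts = pickle.load(f)
with open('rand_shares_counter_pairs.p', 'rb') as f:
    rand_counts = pickle.load(f)
real_total = float(sum(real_counts.values()))
rand_total = float(sum(rand_counts.values()))
for nshared in sorted(set(real_counts) | set(rand_counts)):
    print("{}: real {:.4f} vs random {:.4f}".format(
        nshared,
        real_counts.get(nshared, 0) / real_total,
        rand_counts.get(nshared, 0) / rand_total))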
Example #7
def main_noncum(name, mark=False):
    db = MongoClient().patents
    mindate = list(db.traits.find({'isd': {'$exists': True}}).sort('isd', 1).limit(1))[0]['isd']
    maxdate = list(db.traits.find({'isd': {'$exists': True}}).sort('isd', -1).limit(1))[0]['isd']
    
#    tfidf_traits = _tfidf_traits
    tfidf_traits = list(set(freq_prop_sample(3500) + _tfidf_traits))
 
    docvec_traits = range(200) # each cluster is a docvec trait

    # Runs the GPE calculation for TFIDF
    logging.info("starting with tfidf...")
    gpes_tfidf = run_gpe_parmap_noncum(db, 'tf-idf', tfidf_traits,
                                       mindate.year, maxdate.year, mark=mark)
    
    # Serialize the GPE results as a pickled python dictionary.
    pickle_fn = name + 'gpes_tfidf_3k.p'
    logging.info("done. pickling in {}...".format(pickle_fn))
    pickle_obj(pickle_fn, gpes_tfidf)

    # Save the computed GPE terms as csv.
    csv_fn = name + 'gpes_tfidf_3k.csv'
    logging.info("saving as csv in {}...".format(csv_fn))
    with open(csv_fn, 'wb') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['trait', 'time_step', 't1', 't2', 't3', 'total'])
        for trait, series in gpes_tfidf.items():
            for step, term_list in enumerate(series):
                writer.writerow([trait, step] + list(term_list))

    # Runs the GPE calculation for docvec
    logging.info("now for docvec...")
    gpes_docvec = run_gpe_parmap_noncum(db, 'w2v', docvec_traits,
                                        mindate.year, maxdate.year, mark=mark)

    # Serialize the GPE results as a pickled python dictionary.
    logging.info("saving as pickle...")
    pickle_obj(name + 'gpes_docvec.p', gpes_docvec)
    
    # Save the computed GPE terms as csv.
    logging.info("done. saving as csv.")
    with open(name + 'gpes_docvec.csv', 'wb') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['trait', 'time_step', 't1', 't2', 't3', 'total'])
        for trait, series in gpes_docvec.items():
            for step, term_list in enumerate(series):
                writer.writerow([trait, step] + list(term_list))

    return gpes_tfidf, gpes_docvec
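Reading one of those csv files back into a per-trait series is straightforward (a sketch assuming the column layout written above):

import csv
from collections import defaultdict

def load_gpe_csv(csv_fn):
    # Sketch: rebuild {trait: [(t1, t2, t3, total), ...]} from the csv written above.
    series = defaultdict(list)
    with open(csv_fn) as infile:
        for row in csv.DictReader(infile):
            series[row['trait']].append(
                tuple(float(row[k]) for k in ('t1', 't2', 't3', 'total')))
    return dict(series)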
Example #8
def dump_populations(db, start, end, outdir, delta=timedelta(days=7), lim=100, debug=True):
    """ Step through time, maintaining the known ancestral population at each time step. Save each 
    set of populations as a pickled dictionary."""
    for (tm1, t, tp1) in step_through_time(start, end, delta):
        new_ancestors, descendants = get_populations(db, tm1, t, tp1, lim)
        precompute_doc = {'_id': tm1, 'new_ancestors': list(new_ancestors), 'descendants': list(descendants)}
        if debug: 
            precompute_doc['new_ancestors'] = len(precompute_doc['new_ancestors'])
            precompute_doc['descendants'] = len(precompute_doc['descendants'])
            pprint(precompute_doc)
        else:
            popfn = '/'.join([outdir, dt_as_str(tm1) + '.p'])
            print "pickling population for time {} as {}".format(tm1, popfn)
            print "#new ancestors: {}, #descendants: {}".format(
                len(precompute_doc['new_ancestors']),
                len(precompute_doc['descendants']))
            pickle_obj(popfn, precompute_doc)
Example #9
def main():
    db = MongoClient().patents
    in_deg_counts, out_deg_counts = in_and_out_counts(db, None)
    pickle_obj('in_deg_counts.p', dict(in_deg_counts))
    pickle_obj('out_deg_counts.p', dict(out_deg_counts))
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
    f.set_size_inches(18.5, 10.5)
    ax1.hist(in_deg_counts.keys(), weights=in_deg_counts.values(), bins=100)
    ax1.set_xlabel('In-Degree')
    ax1.set_ylabel('Count')
    ax2.hist(out_deg_counts.keys(), weights=out_deg_counts.values(), bins=100)
    ax2.set_xlabel('Out-Degree')
    ax2.set_ylabel('Count')
    plt.suptitle('Degree Distributions')
    plt.savefig('degree_distributions.png')
Example #10
def main():
    db = MongoClient().patents
    in_deg_counts, out_deg_counts = in_and_out_counts(db, None)
    pickle_obj('in_deg_counts.p', dict(in_deg_counts))
    pickle_obj('out_deg_counts.p', dict(out_deg_counts))
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
    f.set_size_inches(18.5, 10.5)
    ax1.hist(in_deg_counts.keys(), weights=in_deg_counts.values(), bins=100)
    ax1.set_xlabel('In-Degree')
    ax1.set_ylabel('Count')
    ax2.hist(out_deg_counts.keys(), weights=out_deg_counts.values(), bins=100)
    ax2.set_xlabel('Out-Degree')
    ax2.set_ylabel('Count')
    plt.suptitle('Degree Distributions')
    plt.savefig('degree_distributions.png')
Example #11
def dump_pops_over_time(db, time_pairs, outdir, limit=None, mark=False):
    times = []
    popsizes = []
    for (time_0, time_1) in time_pairs:
        if mark:
            ancestors, new_descendants = map(list, get_anc_dec_mark(db, time_0, time_1, limit))
        else:
            ancestors, new_descendants = map(list, get_anc_dec_noncum(db, time_0, time_1, limit))
        precompute_doc = {'start': time_0, 'ancestors': ancestors, 'descendants': new_descendants}
        times.append(time_0)
        popsizes.append((len(ancestors), len(new_descendants)))
        popfn = '/'.join([outdir, dt_as_str(time_0) + '.p'])
        print "pickling population for time {} as {}".format(time_0, popfn)
        pickle_obj(popfn, precompute_doc)
    return times, popsizes
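The returned times and popsizes lend themselves to a quick population-size plot (a sketch; matplotlib is assumed, as in the degree-distribution examples above):

import matplotlib.pyplot as plt

def plot_popsizes(times, popsizes):
    # Sketch: popsizes holds (n_ancestors, n_new_descendants) pairs, one per time step.
    anc = [a for a, _ in popsizes]
    dec = [d for _, d in popsizes]
    plt.plot(times, anc, label='ancestors')
    plt.plot(times, dec, label='new descendants')
    plt.xlabel('time')
    plt.ylabel('population size')
    plt.legend()
    plt.savefig('populations_over_time.png')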
Example #12
def dump_descendants_over_time(db, time_pairs, outdir, limit=None, debug=True):
    # Also returns the population sizes over time as (times, popsizes).
    times = []
    popsizes = []
    for (time_0, time_1) in time_pairs:
        new_descendants = list(get_new_descendants(db, time_0, time_1, limit))
        precompute_doc = {'start': time_0, 'descendants': new_descendants}
        print "number of descendants at time {}: {}".format(time_0, len(new_descendants))
        times.append(time_0)
        popsizes.append(len(new_descendants))
        if debug:
            precompute_doc['descendants'] = len(new_descendants)
            pprint(precompute_doc)
        else:
            popfn = '/'.join([outdir, dt_as_str(time_0) + '.p'])
            print "pickling population for time {} as {}".format(time_0, popfn)
            pickle_obj(popfn, precompute_doc)
    return times, popsizes
Example #13
def main_both():
    """ Runs the GPE calculation for the whole population, for each docvec trait."""
    db = MongoClient().patents
    oneweek = timedelta(days=7)
    mindate = list(db.traits.find({'isd': {'$exists': True}}).sort('isd', 1).limit(1))[0]['isd']
    maxdate = list(db.traits.find({'isd': {'$exists': True}}).sort('isd', -1).limit(1))[0]['isd']
    old_ancestors = []
    gpes_tfidf = {}
    gpes_docvec = {}
    for (t1, _, _) in step_through_time(mindate, maxdate, oneweek):
        logging.info("computing gpe for time {}, both tf-idf and w2v".format(t1))
        gpe_dict_w2v, new_ancestors = gpe_multi_threaded(t1, 'w2v', _docvec_traits, None, old_ancestors)
        gpe_dict_tfidf, _ = gpe_multi(t1, 'tf-idf', _tfidf_traits, None, old_ancestors)  # multithreading not worth the overhead for tf-idf
        gpes_docvec[t1] = gpe_dict_w2v[t1]
        gpes_tfidf[t1] = gpe_dict_tfidf[t1]
        old_ancestors = old_ancestors + new_ancestors
    return gpes_tfidf, gpes_docvec
    
if __name__ == '__main__':
    gpes_tfidf, gpes_docvec = main_both()
    try:
        pickle_obj('gpes_docvec.p', gpes_docvec)
        pickle_obj('gpes_tfidf.p', gpes_tfidf)
    except Exception:
        logging.info("error.")
        logging.info("docvec gpes:")
        pprint(gpes_docvec)
        logging.info("tfidf gpes.")
        pprint(gpes_tfidf)
Example #14
def main_both():
    """ Runs the GPE calculation for the whole population, for each docvec trait."""
    db = MongoClient().patents
    oneweek = timedelta(days=7)
    mindate = list(
        db.traits.find({
            'isd': {
                '$exists': True
            }
        }).sort('isd', 1).limit(1))[0]['isd']
    maxdate = list(
        db.traits.find({
            'isd': {
                '$exists': True
            }
        }).sort('isd', -1).limit(1))[0]['isd']
    old_ancestors = []
    gpes_tfidf = {}
    gpes_docvec = {}
    for (t1, _, _) in step_through_time(mindate, maxdate, oneweek):
        logging.info(
            "computing gpe for time {}, both tf-idf and w2v".format(t1))
        gpe_dict_w2v, new_ancestors = gpe_multi_threaded(
            t1, 'w2v', _docvec_traits, None, old_ancestors)
        gpe_dict_tfidf, _ = gpe_multi(
            t1, 'tf-idf', _tfidf_traits, None,
            old_ancestors)  # multithreading not worth the overhead for tf-idf
        gpes_docvec[t1] = gpe_dict_w2v[t1]
        gpes_tfidf[t1] = gpe_dict_tfidf[t1]
        old_ancestors = old_ancestors + new_ancestors
    return gpes_tfidf, gpes_docvec


if __name__ == '__main__':
    gpes_tfidf, gpes_docvec = main_both()
    try:
        pickle_obj('gpes_docvec.p', gpes_docvec)
        pickle_obj('gpes_tfidf.p', gpes_tfidf)
    except Exception:
        logging.info("error.")
        logging.info("docvec gpes:")
        pprint(gpes_docvec)
        logging.info("tfidf gpes.")
        pprint(gpes_tfidf)