def walk_down_graph(pno, depth, threshold, trait='top_tf-idf'):
    """Walk the citation graph downwards from one record, generation by generation.

    Starting from the record whose ``_id`` is ``pno``, repeatedly fetch the
    records listed in each parent's ``citedby`` field.  A child is kept only
    when its own ``citedby`` list has at least ``threshold`` entries.

    NOTE(review): this source also contains a second ``walk_down_graph``
    definition; if both live in the same module the later one wins at import
    time -- confirm which one callers are meant to get.

    Args:
        pno: ``_id`` of the root record.
        depth: number of generations to collect, counting the root as the
            first one (the loop runs ``depth - 1`` times).
        threshold: minimum length of a child's ``citedby`` list for the child
            to be linked and kept.
        trait: name of the record field fetched from the collection and
            exposed to ``tm.sparse_matrix`` under the key ``'traits'``.

    Returns:
        A tuple ``(just_nodes, node_gens, links, trait_dict)`` where
        ``just_nodes`` is the list of ids in discovery order, ``node_gens``
        is the list of newly discovered ids per generation, ``links`` is a
        list of ``(parent_id, child_id)`` tuples and ``trait_dict`` maps each
        id to ``sparse[i]`` for its position ``i`` in ``just_nodes``.

    Raises:
        ValueError: if no record with ``_id == pno`` exists.
    """
    projection = {'_id': 1, 'citedby': 1, trait: 1}
    root = collection.find_one({'_id': pno}, projection)
    if root is None:
        # find_one returns None when nothing matches; fail with a clear message
        # instead of an opaque TypeError on the subscript below.
        raise ValueError('no record found with _id %r' % (pno,))

    gens = [[root]]
    just_nodes = [root['_id']]
    # Set companion of just_nodes: O(1) "already seen?" checks while the list
    # keeps the discovery order that the sparse matrix rows rely on.
    seen = {root['_id']}
    node_gens = [[root['_id']]]
    links = []
    without_citedby = 0

    for _ in range(1, depth):
        next_gen = []
        new_nodes = []
        for par in gens[-1]:
            # The root is never filtered, so it may lack 'citedby' entirely.
            children_pnos = par.get('citedby') or []
            if not children_pnos:
                continue
            children = collection.find({'_id': {"$in": children_pnos}}, projection)
            for child in children:
                cited = child.get('citedby')
                if cited is None:
                    without_citedby += 1
                    continue
                if len(cited) < threshold:
                    continue
                links.append((par['_id'], child['_id']))
                # add only previously unseen nodes
                if child['_id'] not in seen:
                    seen.add(child['_id'])
                    next_gen.append(child)
                    new_nodes.append(child['_id'])
                    just_nodes.append(child['_id'])
        gens.append(next_gen)
        node_gens.append(new_nodes)
    print(without_citedby, 'without citedby')

    # get rid of the gens, just a list of records
    recs = [rec for gen in gens for rec in gen]
    without_traits = 0
    for rec in recs:
        # rename '_id' to 'pno'
        rec['pno'] = rec.pop('_id')
        # rename trait to 'traits'
        if trait in rec:
            rec['traits'] = rec.pop(trait)
        else:
            without_traits += 1
            rec['traits'] = []
    percent = round(float(without_traits) / float(len(just_nodes)) * 100)
    print('%d%% without traits' % percent)

    recs = et.recs_by_pno(recs)
    sparse = tm.sparse_matrix(just_nodes, recs, 'traits')
    trait_dict = {node: sparse[i] for i, node in enumerate(just_nodes)}
    return (just_nodes, node_gens, links, trait_dict)
def walk_down_graph(pno, depth, threshold, trait=None):
    """Walk the citation graph downwards from one record, generation by generation.

    Starting from the record whose ``pno`` field equals ``pno``, repeatedly
    fetch the records listed in each parent's ``citedby`` field.  A child is
    kept only when its own ``citedby`` list has at least ``threshold``
    entries; children without a ``citedby`` field are skipped.

    Args:
        pno: value of the ``pno`` field of the root record.
        depth: number of generations to collect, counting the root as the
            first one (the loop runs ``depth - 1`` times).
        threshold: minimum length of a child's ``citedby`` list for the child
            to be linked and kept.
        trait: unused by this implementation; accepted so existing callers
            that pass it keep working.

    Returns:
        A tuple ``(just_nodes, node_gens, links, trait_dict)`` where
        ``just_nodes`` is the list of pnos in discovery order, ``node_gens``
        is the list of newly discovered pnos per generation, ``links`` is a
        list of ``(parent_pno, child_pno)`` tuples and ``trait_dict`` maps
        each pno to ``sparse[i]`` for its position ``i`` in ``just_nodes``.

    Raises:
        ValueError: if no record with the given ``pno`` exists.
    """
    root = collection.find_one({'pno': pno},
                               {'pno': 1, 'citedby': 1, 'sorted_text': 1})
    if root is None:
        # find_one returns None when nothing matches; fail with a clear message
        # instead of an opaque TypeError on the subscript below.
        raise ValueError('no record found with pno %r' % (pno,))

    child_projection = {'pno': 1, 'citedby': 1, 'sorted_text': 1, 'text': 1}
    gens = [[root]]
    just_nodes = [root['pno']]
    # Set companion of just_nodes: O(1) "already seen?" checks while the list
    # keeps the discovery order that the sparse matrix rows rely on.
    seen = {root['pno']}
    node_gens = [[root['pno']]]
    links = []

    for _ in range(1, depth):
        next_gen = []
        new_nodes = []
        for par in gens[-1]:
            # The root is never filtered, so it may lack 'citedby' entirely.
            children_pnos = par.get('citedby') or []
            if not children_pnos:
                continue
            children = collection.find({'pno': {"$in": children_pnos}},
                                       child_projection)
            for child in children:
                cited = child.get('citedby')
                # A record nobody cites may have no 'citedby' field at all;
                # skip it rather than crash with KeyError.
                if cited is None or len(cited) < threshold:
                    continue
                links.append((par['pno'], child['pno']))
                # add only previously unseen nodes
                if child['pno'] not in seen:
                    seen.add(child['pno'])
                    next_gen.append(child)
                    new_nodes.append(child['pno'])
                    just_nodes.append(child['pno'])
        gens.append(next_gen)
        node_gens.append(new_nodes)

    recs = [rec for gen in gens for rec in gen]
    # get the trait dict
    recs = et.recs_by_pno(recs)
    recs = et.trim_sorted_text(recs, 10)
    # NOTE(review): other code in this source reads the output of
    # et.trim_sorted_text through the 'words' key; confirm that 'traits' is
    # the field tm.sparse_matrix should use here.
    sparse = tm.sparse_matrix(just_nodes, recs, 'traits')
    trait_dict = {node: sparse[i] for i, node in enumerate(just_nodes)}
    return (just_nodes, node_gens, links, trait_dict)
# Prepare the inputs for traits_emp.Real_traits and run its trait assignment.
# This is an excerpt: `network`, `just_nodes`, `links` and `n` are defined
# outside the visible code.
# NOTE(review): walk_down_graph in this source returns trait_dict (not a list
# of records) at index 3 -- confirm what network[3] holds here.
recs = network[3]
primo = just_nodes[0]
recs = et.recs_by_pno(recs)
recs = et.trim_sorted_text(recs,n)
p_by_c = et.parents_by_child(links)
# first patent doesn't have any parents! we need to add it manually
p_by_c[primo] = []
# get the primogenitors traits
primo_traits = recs[primo]['words']
# the matrix is ordered by the order of pnos in just_nodes
s_matrix = tm.sparse_matrix(just_nodes,recs,'words')
traits_by_pno = tm.traits_by_pno(just_nodes,s_matrix)
# Bundle the arguments under the keyword names Real_traits is called with.
real_network = p_by_c
real_traits = [primo_traits,traits_by_pno]
traits_per_patent = n
assigner = traits_emp.Real_traits(real_network = real_network,real_traits = real_traits, traits_per_patent = traits_per_patent)
assigner.assign_traits()
# new traits are ordered by pno
# (just_nodes is re-sorted only after s_matrix was built in the original order)
just_nodes = sorted(just_nodes)
# sparse matrix with new traits
new_traits = assigner.phenomes