def walk_down_graph(pno, depth, threshold, trait='top_tf-idf'):
    """Breadth-first walk down the citation graph starting at patent `pno`.

    Expands up to `depth` generations of citing patents (a record's children
    are the patents listed in its 'citedby' field), keeping only children
    that are themselves cited at least `threshold` times.

    Args:
        pno: '_id' of the root patent in `collection`.
        depth: number of generations to expand (generation 0 is the root).
        threshold: minimum len(child['citedby']) for a child to be kept.
        trait: document field extracted as each node's trait list.

    Returns:
        Tuple (just_nodes, node_gens, links, trait_dict) where just_nodes is
        the ordered list of patent numbers seen, node_gens groups those pnos
        by generation, links is a list of (parent_pno, child_pno) edges, and
        trait_dict maps each pno to its row of the sparse trait matrix.

    Raises:
        ValueError: if `pno` is not found in the collection.
    """
    missing_citedby = 0
    p = collection.find_one({'_id': pno}, {'_id': 1, 'citedby': 1, trait: 1})
    if p is None:
        # Fail loudly instead of the opaque TypeError the unguarded
        # subscript would raise.
        raise ValueError('patent %r not found in collection' % (pno,))
    gens = [[p]]
    just_nodes = [p['_id']]
    seen = {p['_id']}  # O(1) membership; just_nodes preserves insertion order
    node_gens = [[p['_id']]]
    links = []

    for i in range(1, depth):
        next_gen = []
        new_nodes = []

        for par in gens[i - 1]:
            children = collection.find(
                {'_id': {"$in": par['citedby']}},
                {'_id': 1, 'citedby': 1, trait: 1})

            # Iterate the cursor directly; no need to materialize a list.
            for child in children:
                if 'citedby' not in child:
                    missing_citedby += 1
                elif len(child['citedby']) >= threshold:
                    # NOTE: the edge is recorded even when the child was
                    # already seen — only node expansion is deduplicated.
                    links.append((par['_id'], child['_id']))
                    if child['_id'] not in seen:
                        seen.add(child['_id'])
                        next_gen.append(child)
                        new_nodes.append(child['_id'])
                        just_nodes.append(child['_id'])

        gens.append(next_gen)
        node_gens.append(new_nodes)
    print(missing_citedby, 'without citedby')

    # Flatten the generations into a single list of records.
    recs = [rec for gen in gens for rec in gen]

    missing_traits = 0
    for rec in recs:
        rec['pno'] = rec.pop('_id')          # rename '_id' -> 'pno'
        if trait in rec:
            rec['traits'] = rec.pop(trait)   # rename trait field -> 'traits'
        else:
            missing_traits += 1
            rec['traits'] = []

    pct = round(float(missing_traits) / float(len(just_nodes)) * 100)
    print('%d%% without traits' % pct)
    recs = et.recs_by_pno(recs)

    sparse = tm.sparse_matrix(just_nodes, recs, 'traits')

    # Row i of the sparse matrix corresponds to just_nodes[i].
    trait_dict = {node: sparse[i] for i, node in enumerate(just_nodes)}

    return (just_nodes, node_gens, links, trait_dict)
# Example #2 (score: 0)
def walk_down_graph(pno, depth, threshold, trait=None):
    """Breadth-first walk down the citation graph starting at patent `pno`.

    Expands up to `depth` generations of citing patents (a record's children
    are the patents listed in its 'citedby' field), keeping only children
    that are themselves cited at least `threshold` times.

    Args:
        pno: 'pno' key of the root patent in `collection`.
        depth: number of generations to expand (generation 0 is the root).
        threshold: minimum len(child['citedby']) for a child to be kept.
        trait: accepted for interface compatibility; unused in this variant
            (traits come from the trimmed 'sorted_text' pipeline below).

    Returns:
        Tuple (just_nodes, node_gens, links, trait_dict) where just_nodes is
        the ordered list of patent numbers seen, node_gens groups those pnos
        by generation, links is a list of (parent_pno, child_pno) edges, and
        trait_dict maps each pno to its row of the sparse trait matrix.

    Raises:
        ValueError: if `pno` is not found in the collection.
    """
    p = collection.find_one({'pno': pno},
                            {'pno': 1, 'citedby': 1, 'sorted_text': 1})
    if p is None:
        raise ValueError('patent %r not found in collection' % (pno,))
    gens = [[p]]
    just_nodes = [p['pno']]
    seen = {p['pno']}  # O(1) membership; just_nodes preserves insertion order
    node_gens = [[p['pno']]]
    links = []

    for i in range(1, depth):
        next_gen = []
        new_nodes = []

        for par in parents if False else gens[i - 1]:
            children = collection.find(
                {'pno': {"$in": par['citedby']}},
                {'pno': 1, 'citedby': 1, 'sorted_text': 1, 'text': 1})

            # Iterate the cursor directly; no need to materialize a list.
            for child in children:
                # Guard against records with no 'citedby' field — they do
                # occur in this collection and previously raised KeyError.
                if 'citedby' not in child:
                    continue
                if len(child['citedby']) >= threshold:
                    # Edge is recorded even for already-seen children; only
                    # node expansion is deduplicated.
                    links.append((par['pno'], child['pno']))
                    if child['pno'] not in seen:
                        seen.add(child['pno'])
                        next_gen.append(child)
                        new_nodes.append(child['pno'])
                        just_nodes.append(child['pno'])

        gens.append(next_gen)
        node_gens.append(new_nodes)

    # Flatten the generations into a single list of records.
    recs = [rec for gen in gens for rec in gen]

    # Build the trait dict from the 10 most significant sorted-text terms.
    recs = et.recs_by_pno(recs)
    recs = et.trim_sorted_text(recs, 10)

    sparse = tm.sparse_matrix(just_nodes, recs, 'traits')

    # Row i of the sparse matrix corresponds to just_nodes[i].
    trait_dict = {node: sparse[i] for i, node in enumerate(just_nodes)}

    return (just_nodes, node_gens, links, trait_dict)
import pickle

import trait_matrix as tm
from .. import dots  # fixed: `import ..dots` is a SyntaxError; relative
                     # imports must use the `from .. import name` form.
                     # NOTE(review): relative imports only work when this
                     # file runs as part of a package — confirm.

# NOTE(review): `et` is used below but never imported in this fragment;
# presumably an extraction-utilities module imported elsewhere — confirm.

# Load a previously pickled network: (just_nodes, node_gens, links, recs)
# as produced by walk_down_graph.
# SECURITY: pickle.load executes arbitrary code — only load trusted files.
with open('zeolites_network_5_60.p', 'rb') as f:
    network = pickle.load(f)
n = 10  # number of sorted-text terms to keep per record

just_nodes, node_gens, links, recs = network
primo = just_nodes[0]  # the root ("primogenitor") patent


recs = et.recs_by_pno(recs)
recs = et.trim_sorted_text(recs, n)

p_by_c = et.parents_by_child(links)
# The first patent doesn't have any parents — add it manually.
p_by_c[primo] = []

# Get the primogenitor's traits.
primo_traits = recs[primo]['words']

# The matrix rows follow the order of pnos in just_nodes.
s_matrix = tm.sparse_matrix(just_nodes, recs, 'words')
traits_by_pno = tm.traits_by_pno(just_nodes, s_matrix)


real_network = p_by_c