def read_edges_file(self):
    """Read the edges file and attach friend lists to the core nodes.

    Edges are streamed as ``((sender_id, receiver_id), _)`` pairs from
    ``utils.fast_iter``.  An edge is kept only when its sender is a core
    node, i.e. its index in ``self.node_id_map`` is no larger than the last
    index handed out before this method was called.  A receiver that is not
    yet in ``self.node_id_map`` gets a new network node (created without
    friends) and the next ``self.node_counter`` index.

    Edges of one core sender are collected in a list and passed to that
    node's ``store_friends`` as soon as an edge of a different core sender
    arrives, and once more after the last edge.  Edges of non-core senders
    are skipped entirely: they never trigger a flush and are never used to
    index ``self.nodes``.

    NOTE(review): a sender whose edges are not contiguous in the file gets
    ``store_friends`` called once per run of edges -- confirm whether that
    method replaces or extends the stored list.

    Side effects: extends ``self.node_id_map`` and ``self.nodes``, advances
    ``self.node_counter`` and prints the number of edges read.
    """
    context = etree.iterparse(self.edges_filename, events=('end',), tag="edge")
    edges_iter = utils.fast_iter(context, self.handle_edges)

    # Every index assigned so far is treated as a core node; this assumes
    # only core nodes were fetched before this call.
    max_core_nodes_index = self.node_counter - 1
    # One shared data object for all nodes created here for receivers.
    node_data = GoodreadsDataPreparser.NodeData(
        None, GoodreadsDataPreparser.interaction_types)

    new_friends = []
    flist = []
    pending_sender = None  # core sender that owns the edges in ``flist``
    pending_index = None   # index of ``pending_sender`` in ``self.nodes``
    counter = 0
    for (sender_id, receiver_id), _ in edges_iter:
        counter += 1
        sender_index = self.node_id_map.get(sender_id)
        # Explicit None test: on Python 2 ``None <= some_int`` is True.
        if sender_index is None or sender_index > max_core_nodes_index:
            continue

        if receiver_id not in self.node_id_map:
            nnode = self.create_network_node(
                self.node_counter,
                should_have_friends=False,
                should_have_interactions=True,
                node_data=node_data)
            new_friends.append((self.node_counter, nnode))
            self.node_id_map[receiver_id] = self.node_counter
            self.node_counter += 1

        # A different core sender starts: hand over the finished list.
        if pending_sender is not None and sender_id != pending_sender:
            self.nodes[pending_index].store_friends(flist)
            flist = []

        flist.append(
            GoodreadsDataPreparser.EdgeData(self.node_id_map[receiver_id]))
        pending_sender = sender_id
        pending_index = sender_index

    # Flush the list of the last core sender seen, if there was one.
    if pending_sender is not None:
        self.nodes[pending_index].store_friends(flist)

    # The new nodes are added only now, each at the index reserved for it
    # while reading the edges.
    for uid, node_obj in new_friends:
        self.nodes.insert(uid, node_obj)

    # Single-argument form prints the same text on Python 2 and Python 3.
    print("All edges stored %s" % counter)
def read_items_file(self):
    """Load the items file into ``self.items`` and return that list.

    ``None`` is inserted at index 0 first, then every ``(itemid, itemdata)``
    pair yielded by ``utils.fast_iter`` is placed with
    ``self.items.insert(itemid, itemdata)``.
    NOTE(review): presumably item ids are 1-based and arrive in increasing
    order, so that each item ends up at the index equal to its id -- confirm
    against ``handle_items``.

    Side effects: sets ``self.total_num_items`` to ``len(self.items)`` (the
    index-0 placeholder is counted) and prints that number.

    Returns:
        ``self.items``.
    """
    context = etree.iterparse(self.items_filename, events=('end',), tag="item")
    items_iter = utils.fast_iter(context, self.handle_items)

    self.items.insert(0, None)
    for itemid, itemdata in items_iter:
        self.items.insert(itemid, itemdata)

    num_items = len(self.items)
    # Single-argument form prints the same text on Python 2 and Python 3.
    print("all items stored %s" % num_items)
    self.total_num_items = num_items
    return self.items
def read_items_file(self):
    """Load the items file into ``self.items`` and return that list.

    ``None`` is inserted at index 0 first, then every ``(itemid, itemdata)``
    pair yielded by ``utils.fast_iter`` is placed with
    ``self.items.insert(itemid, itemdata)``.
    NOTE(review): presumably item ids are 1-based and arrive in increasing
    order, so that each item ends up at the index equal to its id -- confirm
    against ``handle_items``.

    Side effects: sets ``self.total_num_items`` to ``len(self.items)`` (the
    index-0 placeholder is counted) and prints that number.

    Returns:
        ``self.items``.
    """
    context = etree.iterparse(self.items_filename, events=('end',), tag="item")
    items_iter = utils.fast_iter(context, self.handle_items)

    self.items.insert(0, None)
    for itemid, itemdata in items_iter:
        self.items.insert(itemid, itemdata)

    num_items = len(self.items)
    # Single-argument form prints the same text on Python 2 and Python 3.
    print("all items stored %s" % num_items)
    self.total_num_items = num_items
    return self.items
def read_nodes_file(self):
    """Load the core nodes from the nodes file into ``self.nodes``.

    A ``None`` placeholder is inserted at the current ``self.node_counter``
    position first, so that node indices are offset for a 1-based node id on
    a zero-based list.  Each ``(uid, node_data)`` pair yielded by
    ``utils.fast_iter`` then becomes a network node (created with friends and
    interactions enabled) stored at the next ``self.node_counter`` index, and
    ``self.node_id_map[uid]`` records that index.

    Side effects: extends ``self.nodes`` and ``self.node_id_map``, advances
    ``self.node_counter`` and prints ``len(self.nodes) - 1``.

    Returns:
        ``self.nodes``.
    """
    context = etree.iterparse(self.nodes_filename, events=('end',), tag="user")
    nodes_iter = utils.fast_iter(context, self.handle_nodes)

    # Placeholder entry: offsets for a 1-based node id and zero-based list index.
    self.nodes.insert(self.node_counter, None)
    self.node_counter += 1

    for uid, node_data in nodes_iter:
        # ``uid`` is not required to equal the running counter; the mapping
        # between the two is kept in ``self.node_id_map``.
        index = self.node_counter
        nnode = self.create_network_node(
            index,
            should_have_friends=True,
            should_have_interactions=True,
            node_data=node_data)
        self.nodes.insert(index, nnode)
        self.node_id_map[uid] = index
        self.node_counter += 1

    # Single-argument form prints the same text on Python 2 and Python 3.
    print("All core nodes stored %s" % (len(self.nodes) - 1))
    return self.nodes
def read_edges_file(self):
    """Read the edges file and attach friend lists to the core nodes.

    Edges are streamed as ``((sender_id, receiver_id), _)`` pairs from
    ``utils.fast_iter``.  An edge is kept only when its sender is a core
    node, i.e. its index in ``self.node_id_map`` is no larger than the last
    index handed out before this method was called.  A receiver that is not
    yet in ``self.node_id_map`` gets a new network node (created without
    friends) and the next ``self.node_counter`` index.

    Edges of one core sender are collected in a list and passed to that
    node's ``store_friends`` as soon as an edge of a different core sender
    arrives, and once more after the last edge.  Edges of non-core senders
    are skipped entirely: they never trigger a flush and are never used to
    index ``self.nodes``.

    NOTE(review): a sender whose edges are not contiguous in the file gets
    ``store_friends`` called once per run of edges -- confirm whether that
    method replaces or extends the stored list.

    Side effects: extends ``self.node_id_map`` and ``self.nodes``, advances
    ``self.node_counter`` and prints the number of edges read.
    """
    context = etree.iterparse(self.edges_filename, events=('end',), tag="edge")
    edges_iter = utils.fast_iter(context, self.handle_edges)

    # Every index assigned so far is treated as a core node; this assumes
    # only core nodes were fetched before this call.
    max_core_nodes_index = self.node_counter - 1
    # One shared data object for all nodes created here for receivers.
    node_data = GoodreadsDataPreparser.NodeData(
        None, GoodreadsDataPreparser.interaction_types)

    new_friends = []
    flist = []
    pending_sender = None  # core sender that owns the edges in ``flist``
    pending_index = None   # index of ``pending_sender`` in ``self.nodes``
    counter = 0
    for (sender_id, receiver_id), _ in edges_iter:
        counter += 1
        sender_index = self.node_id_map.get(sender_id)
        # Explicit None test: on Python 2 ``None <= some_int`` is True.
        if sender_index is None or sender_index > max_core_nodes_index:
            continue

        if receiver_id not in self.node_id_map:
            nnode = self.create_network_node(
                self.node_counter,
                should_have_friends=False,
                should_have_interactions=True,
                node_data=node_data)
            new_friends.append((self.node_counter, nnode))
            self.node_id_map[receiver_id] = self.node_counter
            self.node_counter += 1

        # A different core sender starts: hand over the finished list.
        if pending_sender is not None and sender_id != pending_sender:
            self.nodes[pending_index].store_friends(flist)
            flist = []

        flist.append(
            GoodreadsDataPreparser.EdgeData(self.node_id_map[receiver_id]))
        pending_sender = sender_id
        pending_index = sender_index

    # Flush the list of the last core sender seen, if there was one.
    if pending_sender is not None:
        self.nodes[pending_index].store_friends(flist)

    # The new nodes are added only now, each at the index reserved for it
    # while reading the edges.
    for uid, node_obj in new_friends:
        self.nodes.insert(uid, node_obj)

    # Single-argument form prints the same text on Python 2 and Python 3.
    print("All edges stored %s" % counter)
def read_nodes_file(self):
    """Load the core nodes from the nodes file into ``self.nodes``.

    A ``None`` placeholder is inserted at the current ``self.node_counter``
    position first, so that node indices are offset for a 1-based node id on
    a zero-based list.  Each ``(uid, node_data)`` pair yielded by
    ``utils.fast_iter`` then becomes a network node (created with friends and
    interactions enabled) stored at the next ``self.node_counter`` index, and
    ``self.node_id_map[uid]`` records that index.

    Side effects: extends ``self.nodes`` and ``self.node_id_map``, advances
    ``self.node_counter`` and prints ``len(self.nodes) - 1``.

    Returns:
        ``self.nodes``.
    """
    context = etree.iterparse(self.nodes_filename, events=('end',), tag="user")
    nodes_iter = utils.fast_iter(context, self.handle_nodes)

    # Placeholder entry: offsets for a 1-based node id and zero-based list index.
    self.nodes.insert(self.node_counter, None)
    self.node_counter += 1

    for uid, node_data in nodes_iter:
        # ``uid`` is not required to equal the running counter; the mapping
        # between the two is kept in ``self.node_id_map``.
        index = self.node_counter
        nnode = self.create_network_node(
            index,
            should_have_friends=True,
            should_have_interactions=True,
            node_data=node_data)
        self.nodes.insert(index, nnode)
        self.node_id_map[uid] = index
        self.node_counter += 1

    # Single-argument form prints the same text on Python 2 and Python 3.
    print("All core nodes stored %s" % (len(self.nodes) - 1))
    return self.nodes