def main(): """ See usage message in module header block """ get_subgraph = False # if True discard nodes without attribute data try: opts, args = getopt.getopt(sys.argv[1:], "d") except: usage(sys.argv[0]) for opt, arg in opts: if opt == "-d": get_subgraph = True else: usage(sys.argv[0]) if len(args) != 1: usage(sys.argv[0]) data_dir = args[0] outputdir = '.' sys.stdout.write('loading data from ' + data_dir + '...') start = time.time() datazipfile = data_dir + os.path.sep + 'physician-shared-patient-patterns-2014-days30.zip' G = load_physician_referral_data(datazipfile) print time.time() - start, 's' snap.PrintInfo(G) # Remove loops (self-edges). # G is a PNGraph so multiple edges not allowed in this type anyway. snap.DelSelfEdges(G) snap.PrintInfo(G) # specify ordered nodelist to map sequential ids to original ids consistent nodelist = [node.GetId() for node in G.Nodes()] graph_filename = outputdir + os.path.sep + "physician_referall_arclist" + os.path.extsep + "txt" nodeid_filename = outputdir + os.path.sep + "nodeid" + os.path.extsep + "txt" write_graph_file(graph_filename, G, nodelist) write_subgraph_nodeids(nodeid_filename, nodelist)
def main(): """ See usage message in module header block """ directed = True try: opts, args = getopt.getopt(sys.argv[1:], "") except: usage(sys.argv[0]) for opt, arg in opts: usage(sys.argv[0]) if len(args) != 5: usage(sys.argv[0]) data_dir = args[0] num_samples = int(args[1]) num_seeds = int(args[2]) num_waves = int(args[3]) - 1 # -1 for consistency with SPNet outputdir = args[4] print "directed:", directed print "number of samples:", num_samples print "number of seeds:", num_seeds print "number of waves:", num_waves print "output directory:", outputdir if not os.path.exists(outputdir): os.mkdir(outputdir) sys.stdout.write('loading data from ' + data_dir + '...') start = time.time() datazipfile = data_dir + os.path.sep + 'physician-shared-patient-patterns-2014-days30.zip' G = load_physician_referral_data(datazipfile) print time.time() - start, 's' snap.PrintInfo(G) # get num_samples * num_seeds distinct random seed nodes (sample without replacement) # and convert to list of lists where each list is seed set for one sample allseeds = random.sample([node.GetId() for node in G.Nodes()], num_samples * num_seeds) seedsets = [ allseeds[i:i + num_seeds] for i in range(0, len(allseeds), num_seeds) ] sampledesc_filename = outputdir + os.path.sep + "sampledesc" + os.path.extsep + "txt" sampledesc_f = open(sampledesc_filename, 'w') for i in range(num_samples): sys.stdout.write('generating snowball sample ' + str(i + 1) + '... ') start = time.time() # have to convert seedset to TIntV for SNAP seedsVec = snap.TIntV() for nodeid in seedsets[i]: seedsVec.Add(nodeid) Gsample0 = snowball_sample(G, num_waves, seedsVec) #print 'XXX',Gsample0.GetIntAttrDatN(Gsample0.GetRndNId(), "zone")#XXX # renumber nodes so they are numbered 0..N-1 # Actually can't do this as it loses the node attributes (zone) # so instead build a dictionary mapping nodeid:zone # so that can be written to zone file in correct order. # Note that then the index in nodelist of a nodeid can be used # as sequential node number of each node. ##Gsample = snap.ConvertGraph(snap.PNEANet, Gsample0, True) #print 'YYY',Gsample.GetIntAttrDatN(Gsample.GetRndNId(), "zone")#XXX Gsample = Gsample0 nodelist = list( ) # keep this iteration in list so we always use same order in future zonedict = dict() # map nodeid : zone for node in Gsample.Nodes(): nodelist.append(node.GetId()) zonedict[node.GetId()] = Gsample.GetIntAttrDatN( node.GetId(), "zone") print time.time() - start, 's' snap.PrintInfo(Gsample) subgraph_filename = outputdir + os.path.sep + "subgraph" + str( i) + os.path.extsep + "txt" write_graph_file(subgraph_filename, Gsample, nodelist) subzone_filename = outputdir + os.path.sep + "subzone" + str( i) + os.path.extsep + "txt" write_zone_file(subzone_filename, Gsample, nodelist, zonedict) subactor_filename = outputdir + os.path.sep + "subactor" + str( i) + os.path.extsep + "txt" # TODO get actor attributes #write_subactors_file(subactor_filename, Gsample, nodelist) # format of sampledesc file is: # N subzone_filename subgraph_filename subactor_filename sampledesc_filename = outputdir + os.path.sep + "sampledesc" + os.path.extsep + "txt" sampledesc_f.write("%d %s %s %s\n" % (Gsample.GetNodes(), subzone_filename, subgraph_filename, subactor_filename)) sampledesc_f.close()
def main(): """ See usage message in module header block """ get_subgraph = False # if True discard nodes without attribute data try: opts, args = getopt.getopt(sys.argv[1:], "d") except: usage(sys.argv[0]) for opt, arg in opts: if opt == "-d": get_subgraph = True else: usage(sys.argv[0]) if len(args) != 1: usage(sys.argv[0]) data_dir = args[0] outputdir = '.' sys.stdout.write('loading data from ' + data_dir + '...') start = time.time() (G, patdata, colnames) = load_nber_patent_data(data_dir) print time.time() - start, 's' snap.PrintInfo(G) # Remove loops (self-edges). # There is actually for some reason one loop (patent id 5489070). # G is a PNGraph so multiple edges not allowed in this type anyway. snap.DelSelfEdges(G) snap.PrintInfo(G) # We do not add attributes to nodes as SNAP node attribute as # these seem to get lost by varoius operations including subgraph # that we need to use, so instead maintain them just in the # dictionary mapping the original node ids to the attributes - # fortunately the original node ids are maintained by # GetSubGraph() so we can used these to index the patdata # dictoinary in the subgraphs # Cannot do this: #patdata[:][colnames['COUNTRY']] = convert_to_int_cat(patdata[:][colnames['COUNTRY']]) # like factor in R # as get "TypeError: unhashable type" so have to do this instead: id_countries = [(k, p[colnames['COUNTRY']]) for (k, p) in patdata.iteritems()] id_countries_int = convert_to_int_cat([x[1] for x in id_countries]) for i in xrange(len(id_countries)): patdata[id_countries[i][0]][colnames['COUNTRY']] = id_countries_int[i] for attr in ['COUNTRY']: sys.stdout.write('There are %d NA for %s\n' % ([p[colnames[attr]] for p in patdata.itervalues()].count('NA'), attr)) id_states = [(k, p[colnames['POSTATE']]) for (k, p) in patdata.iteritems()] id_states_int = convert_to_int_cat([x[1] for x in id_states]) for i in xrange(len(id_states)): patdata[id_states[i][0]][colnames['POSTATE']] = id_states_int[i] for attr in ['POSTATE']: sys.stdout.write('There are %d NA for %s\n' % ([p[colnames[attr]] for p in patdata.itervalues()].count('NA'), attr)) # There are 3774768 unique patent identifiers in the citation data but # only 2923922 unique patent identifiers in the patent data (patdata). # The size of the set intersection of these patent ids is 2755865 # i.e. there is patent data for 73% of the patents in the citation network. # Presumably this is because the patdata (pat63_99.txt) contains all # utilit patents in the period 1963 to 1999 but the citation data # cit75_99.txt contains all US patent citations for utility patents # granted in the period 1975 to 1999, so there are patent ids in here # from earlier periods that are cited by patents in that period, # for which therefore we don't have the patent data (prior to 1963). # So we have to set the data for all patents in network that we have it # for, and the rest (27%) to NA. 
nodelist = list( ) # keep the iteration below in list so we always use same order in future if get_subgraph: # get subgraph induced by nodes that have patent data in the # pat63_99.txt file nodeVec = snap.TIntV() # nodelist in TIntV format for use in SNAP for node in G.Nodes(): patid = node.GetId() if patdata.has_key(patid): nodelist.append(patid) nodeVec.Add(patid) G = snap.GetSubGraph(G, nodeVec) print 'Subgraph with only nodes with patent attribute data:' snap.PrintInfo(G) else: # keep all the graph and just put NA for all data attributes citepatent_count = 0 patentdata_count = 0 for node in G.Nodes(): citepatent_count += 1 patid = node.GetId() nodelist.append(patid) #print citepatent_count, patentdata_count, patid #XXX if not patdata.has_key(patid): #print 'NA for ', patid #XXX patdata[patid] = len(colnames) * ["NA"] patdata[patid][ colnames['HASDATA']] = 0 # no data on this patent else: patentdata_count += 1 sys.stdout.write( "There are %d unique cited/citing patents of which %d (%f%%) have patent data\n" % (citepatent_count, patentdata_count, 100 * float(patentdata_count) / citepatent_count)) graph_filename = outputdir + os.path.sep + "patent_citations" + os.path.extsep + "txt" write_graph_file(graph_filename, G, nodelist) attributes_binary_filename = outputdir + os.path.sep + "patent_binattr" + os.path.extsep + "txt" attributes_categorical_filename = outputdir + os.path.sep + "patent_catattr" + os.path.extsep + "txt" attributes_continuous_filename = outputdir + os.path.sep + "patent_contattr" + os.path.extsep + "txt" write_attributes_file_binary(attributes_binary_filename, G, nodelist, patdata, colnames) write_attributes_file_categorical(attributes_categorical_filename, G, nodelist, patdata, colnames) write_attributes_file_continuous(attributes_continuous_filename, G, nodelist, patdata, colnames) nodeid_filename = outputdir + os.path.sep + "nodeid" + os.path.extsep + "txt" write_subgraph_nodeids(nodeid_filename, nodelist)
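
# convert_to_int_cat() is defined elsewhere in this codebase.  A minimal
# sketch of the assumed behaviour, analogous to factor() in R: map each
# distinct string value to a small integer code, leaving 'NA' values as the
# string 'NA' so that the NA counts printed above still work.  The 'NA'
# handling is an assumption about the real helper, not its actual code.

def convert_to_int_cat_sketch(values):
    codes = {}   # map category string -> integer code
    result = []
    for v in values:
        if v == 'NA':
            result.append('NA')
        else:
            if v not in codes:
                codes[v] = len(codes)
            result.append(codes[v])
    return result
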
def main(): """ See usage message in module header block """ get_subgraph = False # if True discard nodes without attribute data try: opts,args = getopt.getopt(sys.argv[1:], "d") except: usage(sys.argv[0]) for opt,arg in opts: if opt == "-d": get_subgraph = True else: usage(sys.argv[0]) if len(args) != 1: usage(sys.argv[0]) data_dir = args[0] outputdir = '.' sys.stdout.write('loading data from ' + data_dir + '...') start = time.time() (G, patdata, colnames) = load_epo_patent_data(data_dir) print time.time() - start, 's' snap.PrintInfo(G) # Remove loops (self-edges). # There is actually for some reason 92 nodes with self-loops # e.g. EP0021443 # G is a PNGraph so multiple edges not allowed in this type anyway. snap.DelSelfEdges(G) snap.PrintInfo(G) # We do not add attributes to nodes as SNAP node attribute as # these seem to get lost by varoius operations including subgraph # that we need to use, so instead maintain them just in the # dictionary mapping the original node ids to the attributes - # fortunately the original node ids are maintained by # GetSubGraph() so we can used these to index the patdata # dictoinary in the subgraphs # convert categorical attribute values to integers like factor in R for cat_colname in ['Language','Country']: catvalues = [(k, p[colnames[cat_colname]]) for (k,p) in patdata.iteritems()] catvalues_int = convert_to_int_cat([x[1] for x in catvalues]) for i in xrange(len(catvalues)): patdata[catvalues[i][0]][colnames[cat_colname]] = catvalues_int[i] sys.stdout.write('There are %d NA for %s\n' % ([p[colnames[cat_colname]] for p in patdata.itervalues()].count('NA'), cat_colname)) # convert categorical set attribute values to integers like factor in R for set_colname in ['Classes','Sections']: setvalues = [(k, p[colnames[set_colname]]) for (k,p) in patdata.iteritems()] setvalues_int = convert_to_int_set([x[1].split(',') for x in setvalues]) for i in xrange(len(setvalues)): patdata[setvalues[i][0]][colnames[set_colname]] = setvalues_int[i] sys.stdout.write('There are %d NA for %s\n' % ([p[colnames[set_colname]] for p in patdata.itervalues()].count('NA'), set_colname)) nodelist = list() # keep the iteration below in list so we always use same order in future if get_subgraph: # get subgraph induced by nodes that have patent data in the # pat63_99.txt file nodeVec = snap.TIntV() # nodelist in TIntV format for use in SNAP for node in G.Nodes(): patid = node.GetId() if patdata.has_key(patid): nodelist.append(patid) nodeVec.Add(patid) G = snap.GetSubGraph(G, nodeVec) print 'Subgraph with only nodes with patent attribute data:' snap.PrintInfo(G) else: # keep all the graph and just put NA for all data attributes citepatent_count = 0 patentdata_count = 0 for node in G.Nodes(): citepatent_count += 1 patid = node.GetId() nodelist.append(patid) #print citepatent_count, patentdata_count, patid #XXX if not patdata.has_key(patid): #print 'NA for ', patid #XXX patdata[patid] = len(colnames)*["NA"] else: patentdata_count += 1 sys.stdout.write("There are %d unique cited/citing patents of which %d (%f%%) have patent data\n" % (citepatent_count, patentdata_count, 100*float(patentdata_count)/citepatent_count)) graph_filename = outputdir + os.path.sep + "patent_citations" + os.path.extsep + "txt" write_graph_file(graph_filename, G, nodelist) attributes_binary_filename = outputdir + os.path.sep + "patent_binattr" + os.path.extsep + "txt" attributes_categorical_filename = outputdir + os.path.sep + "patent_catattr" + os.path.extsep + "txt" attributes_continuous_filename = outputdir + 
os.path.sep + "patent_contattr" + os.path.extsep + "txt" attributes_set_filename = outputdir + os.path.sep + "patent_setattr" + os.path.extsep + "txt" write_attributes_file_binary(attributes_binary_filename, G, nodelist, patdata, colnames) write_attributes_file_categorical(attributes_categorical_filename, G, nodelist, patdata, colnames) write_attributes_file_continuous(attributes_continuous_filename, G, nodelist, patdata, colnames) write_attributes_file_set(attributes_set_filename, G, nodelist, patdata, colnames) nodeid_filename = outputdir + os.path.sep + "nodeid" + os.path.extsep + "txt" write_subgraph_nodeids(nodeid_filename, nodelist) # write patent sections as original letters before converting to int # This cannot be used by EstimNetDirected but is useful to read in R # and factor there so that the original names are preserved sections_filename = outputdir + os.path.sep + "patent_string_categories" + os.path.extsep + "txt" attrnames = ['CPCsections','LanguageCode','CountryCode'] with open(sections_filename, 'w') as f: f.write(' '.join(attrnames) + '\n') for i in nodelist: for attrname in attrnames: val = patdata[i][colnames[attrname]] val = 'NA' if (val == 'NA' or val == 'XX') else val f.write(val) if attrname == attrnames[-1]: f.write('\n') else: f.write(' ' )
def main(): """ See usage message in module header block """ directed = True try: opts, args = getopt.getopt(sys.argv[1:], "") except: usage(sys.argv[0]) for opt, arg in opts: usage(sys.argv[0]) if len(args) != 5: usage(sys.argv[0]) data_dir = args[0] num_samples = int(args[1]) num_seeds = int(args[2]) num_waves = int(args[3]) - 1 # -1 for consistency with SPNet outputdir = args[4] print "directed:", directed print "number of samples:", num_samples print "number of seeds:", num_seeds print "number of waves:", num_waves print "output directory:", outputdir if not os.path.exists(outputdir): os.mkdir(outputdir) sys.stdout.write('loading data from ' + data_dir + '...') start = time.time() (G, profile, colnames) = load_pokec_data(data_dir) print time.time() - start, 's' snap.PrintInfo(G) # We do not add attributes to nodes as SNAP node attribute as # these seem to get lost by varoius operations including subgraph # that we need to use, so instead maintain them just in the # dictionary mapping the original node ids to the attributes - # fortunately the original node ids are maintained by # GetSubGraph() so we can used these to index the profile # dictoinary in the subgraphs ## https://snap.stanford.edu/data/soc-pokec-readme.txt ## region: ## string, mostly regions in Slovakia (example: "zilinsky kraj, ## kysucke nove mesto" means county Zilina, town Kysucke Nove Mesto, ## Slovakia), some foreign countries (example: "zahranicie, ## zahranicie - nemecko" means foreign country Germany (nemecko)), ## some Czech regions (example: "ceska republika, cz - ostravsky ## kraj" means Czech Republic, county Ostrava (ostravsky kraj)) ## We just make this a factor, looking at the output written by print ## below, it looks reasonable, but is is only a categorical variable ## allowing us to tell if two users are in the same region or not. ## TODO we could recode this so that we can have different variables ## for being in a different country, major city, etc. # Cannot do this: #profile[:][colnames['region']] = convert_to_int_cat(profile[:][colnames['region']]) # like factor in R # as get "TypeError: unhashable type" so have to do this instead: id_regions = [(k, p[colnames['region']]) for (k, p) in profile.iteritems()] id_regions_int = convert_to_int_cat([x[1] for x in id_regions]) for i in xrange(len(id_regions)): profile[id_regions[i][0]][colnames['region']] = id_regions_int[i] for attr in ['region']: sys.stdout.write('There are %d NA for %s\n' % ([p[colnames[attr]] for p in profile.itervalues()].count('NA'), attr)) # get num_samples * num_seeds distinct random seed nodes (sample without replacement) # and convert to list of lists where each list is seed set for one sample allseeds = random.sample([node.GetId() for node in G.Nodes()], num_samples * num_seeds) seedsets = [ allseeds[i:i + num_seeds] for i in range(0, len(allseeds), num_seeds) ] sampledesc_filename = outputdir + os.path.sep + "sampledesc" + os.path.extsep + "txt" sampledesc_f = open(sampledesc_filename, 'w') for i in range(num_samples): sys.stdout.write('generating snowball sample ' + str(i + 1) + '... 
') start = time.time() # have to convert seedset to TIntV for SNAP seedsVec = snap.TIntV() for nodeid in seedsets[i]: seedsVec.Add(nodeid) Gsample = snowball_sample(G, num_waves, seedsVec) nodelist = list( ) # keep this iteration in list so we always use same order in future zonedict = dict() # map nodeid : zone for node in Gsample.Nodes(): nodelist.append(node.GetId()) zonedict[node.GetId()] = Gsample.GetIntAttrDatN( node.GetId(), "zone") print time.time() - start, 's' snap.PrintInfo(Gsample) subgraph_filename = outputdir + os.path.sep + "subgraph" + str( i) + os.path.extsep + "txt" write_graph_file(subgraph_filename, Gsample, nodelist) subzone_filename = outputdir + os.path.sep + "subzone" + str( i) + os.path.extsep + "txt" write_zone_file(subzone_filename, Gsample, nodelist, zonedict) subactor_binary_filename = outputdir + os.path.sep + "subactorbin" + str( i) + os.path.extsep + "txt" subactor_categorical_filename = outputdir + os.path.sep + "subactorcat" + str( i) + os.path.extsep + "txt" subactor_continuous_filename = outputdir + os.path.sep + "subactorcont" + str( i) + os.path.extsep + "txt" write_subactors_file_binary(subactor_binary_filename, Gsample, nodelist, profile, colnames) write_subactors_file_categorical(subactor_categorical_filename, Gsample, nodelist, profile, colnames) write_subactors_file_continuous(subactor_continuous_filename, Gsample, nodelist, profile, colnames) nodeid_filename = outputdir + os.path.sep + "subnodeid" + str( i) + os.path.extsep + "txt" write_subgraph_nodeids(nodeid_filename, nodelist) # format of sampledesc file is: # N subzone_filename subgraph_filename binary_Filename cat_filename cont_filename sampledesc_filename = outputdir + os.path.sep + "sampledesc" + os.path.extsep + "txt" sampledesc_f.write( "%d %s %s %s %s %s\n" % (Gsample.GetNodes(), subzone_filename, subgraph_filename, subactor_binary_filename, subactor_categorical_filename, subactor_continuous_filename)) sampledesc_f.close()
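
# write_zone_file() is defined elsewhere in this codebase.  As an illustration
# only, a sketch of the assumed behaviour: one value per node, in nodelist
# order (the same order used by write_graph_file()), giving the snowball zone
# (wave number) of that node.  The header line and its name 'zone' are
# assumptions, not taken from this file.

def write_zone_file_sketch(filename, G, nodelist, zonedict):
    # G is unused in this sketch; the parameter is kept to match the call above
    with open(filename, 'w') as f:
        f.write('zone\n')
        for nodeid in nodelist:
            f.write('%d\n' % zonedict[nodeid])
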