# Journal abbreviations the user can optionally restrict the references to.
_CC_REF_JOURNAL_LIST = [
    'J OPER MANA IN PRESS', 'J OPER MANAG', 'J OPER MANAG FORTHCO',
    'J OPERATIONS MANAGE', 'J OPERATIONS MANAGEM', 'J. Oper. Manag.',
    'Journal of Operations Management', 'M&SOM-MANUF SERV OP',
    'MANUF SERV IN PRESS', 'MANUF SERV OPER MANA', 'MANUF SERV OPERAT MA',
    'MANUF SERVICE OPERAT', 'Manufacturing & Service Operations Management',
    'MANUFACTURING SERVIC', 'PROD OPER M IN PRESS', 'PROD OPER MANAG',
    'PROD OPERAT MANAGEM', 'Production and Operations Management',
    'PRODUCTION OPER MANA', 'Production Oper. Management',
    'PRODUCTION OPERATION', 'PRODUCTIONS OPERATIO'
]

# Header lines shared by every .gdf file written by CC_network.
_CC_GDF_NODE_HEADER = (
    "nodedef>name VARCHAR,label VARCHAR,CCcom VARCHAR, Sub CCcom VARCHAR, Modularity VARCHAR, firstAU VARCHAR,journal VARCHAR,year VARCHAR,nb_arts DOUBLE,doi VARCHAR, volume VARCHAR, page VARCHAR\n"
)
_CC_GDF_EDGE_HEADER = (
    "edgedef>node1 VARCHAR,node2 VARCHAR,weight DOUBLE,nb_comm_refs DOUBLE"
)


def _cc_ask_yes_no(prompt):
    """Ask `prompt` and keep re-asking until the answer is exactly 'y' or 'n'."""
    answer = raw_input(prompt)
    while answer not in ('n', 'y'):
        answer = raw_input("...typing error!\n Confirm (y/n): ")
    return answer


def _cc_ask_int(prompt):
    """Ask `prompt` until the user types something int() accepts; return it.

    Replaces Python 2 `input()`, which evaluates the typed text as code.
    """
    while True:
        try:
            return int(raw_input(prompt))
        except ValueError:
            print("...typing error! an integer is expected")


def _cc_group_by_community(partition):
    """Invert a {node: community} mapping into {community: [nodes]}."""
    groups = dict()
    for node, com in partition.items():
        groups.setdefault(com, []).append(node)
    return groups


def _cc_label(entry):
    """Return the 'firstAU, journal, year' label of a reference/document dict."""
    return entry['firstAU'] + ', ' + entry['journal'] + ', ' + str(
        entry['year'])


def _cc_weight(CC_table, nA, i, j):
    """Return the co-citation count of (i, j) divided by sqrt(nA[i] * nA[j])."""
    return (1.0 * CC_table[i][j]) / math.sqrt(nA[i] * nA[j])


def _cc_write_gdf_node(f_gephi, refid, com, ref, nb_arts):
    """Write one node row of a .gdf file for reference `refid`."""
    f_gephi.write(
        "%d,'%s',%s,%s,%1.6f,%s,%s,%d,%d,%s,%s,%s\n" %
        (refid, _cc_label(ref), str(com), str(ref['SubCommID']),
         ref['modularity'], ref['firstAU'], ref['journal'], ref['year'],
         nb_arts, ref['doi'], ref['volume'], ref['page']))


def _cc_ensure_dir(path):
    """Create directory `path` (and its parents) when it does not exist yet."""
    if not os.path.isdir(path):
        os.makedirs(path)


def CC_network(in_dir, out_dir, verbose):
    """Build, cluster and export the co-citation (CC) network of references.

    Reads "articles.dat" and "references.dat" from `in_dir`, counts for every
    pair of references how many articles cite both, and links two references
    when that count reaches a threshold chosen interactively. The graph is
    clustered with the `community` module (generate_dendogram /
    partition_at_level), each retained community is clustered again with
    best_partition, and the results are written to `out_dir`:

    * "CC-Network(...).png" and "SubGraph/Plot/SubGraph-<com>.png" plots,
    * "SubGraph/Gephi/SubCCnetwork<com>(...).gdf" for every community,
    * "ResearchBase.dat" (one row per reference) and "ResearchFront.dat"
      (one row per reference/citing-article pair), tab separated,
    * optionally "CCnetwork(...).gdf" and the reports produced by the
      `report` module.

    The function is interactive: it prompts on stdin for thresholds, the
    dendrogram level and several y/n confirmations.

    Parameters:
        in_dir: directory containing the two input .dat files.
        out_dir: directory receiving every output file.
        verbose: when true, print progress messages.

    Returns:
        None.
    """
    ## INPUT DATA
    if verbose: print("..Initialize")
    src1 = os.path.join(in_dir, "articles.dat")
    src5 = os.path.join(in_dir, "references.dat")

    pl = Utils.Article()
    pl.read_file(src1)
    nb_art = len(pl.articles)

    # art_table[article id] -> list of reference ids cited by that article.
    # NOTE(review): this assumes article ids found in references.dat lie in
    # range(nb_art) -- confirm against Utils.Ref.
    art_table = dict((i, []) for i in range(nb_art))

    # doc_table[position of the article in articles.dat] -> article metadata.
    doc_table = dict()
    for doc_id, art in enumerate(pl.articles):
        doc_table[doc_id] = {
            'firstAU': art.firstAU,
            'year': art.year,
            'journal': art.journal,
            'citation': art.times_cited,
            'title': art.title,
            'de_keywords': art.de_keywords,
            'id_keywords': art.id_keywords,
            'abstract': art.abstract,
        }

    if verbose: print("..Create Co-citation Network weight table")
    if verbose: print("....loading refs table")

    pl = Utils.Ref()
    pl.read_file(src5)
    nb_total_refs = len(pl.refs)
    CC_table = dict()   # CC_table[i][j], i < j: number of articles citing both
    nA = dict()         # nA[refid]: number of records citing refid
    ref_index = dict()  # ref_index[refid]: metadata of the reference

    for l in pl.refs:
        if l.refid not in ref_index:
            ref_index[l.refid] = {
                'firstAU': l.firstAU,
                'year': l.year,
                'journal': l.journal,
                'volume': l.volume,
                'page': l.page,
                'doi': l.doi,
                'article': [],
                # -1 marks references that never get a sub-community
                'SubCommID': -1,
                'modularity': -1,
            }
        ref_index[l.refid]['article'].append(l.id)
        art_table[l.id].append(l.refid)
        nA[l.refid] = nA.get(l.refid, 0) + 1

    nb_refs = len(nA)

    if verbose: print("....detect common articles")
    for cited in art_table.values():
        if len(cited) > 1:
            for i in cited:
                for j in cited:
                    if i < j:
                        row = CC_table.setdefault(i, dict())
                        row[j] = row.get(j, 0) + 1

    # choose threshold
    thr = 5
    while True:
        if thr == 1:
            print(
                "Keep BC links between articles sharing at least %d reference"
                % (thr))
        else:
            print(
                "Keep BC links between articles sharing at least %d references"
                % (thr))
        if _cc_ask_yes_no("Confirm (y/n): ") == 'y':
            break
        thr = _cc_ask_int(
            "threshold for BC links -- articles should be share at least ? references:"
        )
    # `thr` is reused below for the community-size threshold; keep the link
    # threshold under its own name.
    ccthr = thr

    ref_journal_list = _CC_REF_JOURNAL_LIST
    print("Do you want the journal of references belong to the list below?")
    for journal in ref_journal_list:
        print(journal)
    ref_journal_flag = (_cc_ask_yes_no("Confirm (y/n): ") == 'y')

    ##############################
    ## BC COMMUNITIES
    if verbose: print("..CC communities")
    #... define BC network
    if verbose: print("....define graph in networkx format")
    allowed_journals = set(ref_journal_list)
    G = nx.Graph()
    for i in CC_table:
        for j in CC_table[i]:
            if CC_table[i][j] < thr:
                continue
            if ref_journal_flag and not (
                    ref_index[i]['journal'] in allowed_journals
                    and ref_index[j]['journal'] in allowed_journals):
                continue
            G.add_edge(i, j, weight=_cc_weight(CC_table, nA, i, j))

    nx.draw_spring(G)
    dst = os.path.join(
        out_dir, 'CC-Network(ccthr=%d, thr=%d, ref_journal_flag=%s).png' %
        (ccthr, thr, ref_journal_flag))
    plt.savefig(dst)
    plt.close('all')

    #...
    if verbose: print("....computing communities with Louvain algo")
    dendogram = community.generate_dendogram(G, part_init=None)

    #... output infos
    print("....There are %d references in the database (contain duplicates)" %
          (nb_total_refs))
    print(
        "....There are %d references in the database (contain no duplicate)" %
        (nb_refs))
    print(
        "....There are %d references in the CC network\n......(ie sharing at least %d article(s) with another reference)"
        % (len(G.nodes()), ccthr))
    for level in range(len(dendogram)):
        part = community.partition_at_level(dendogram, level)
        mod = community.modularity(part, G)
        sizes = [len(nodes) for nodes in _cc_group_by_community(part).values()]
        nb_comm = len(sizes)
        size_sup10 = sum(1 for size in sizes if size > 10)
        size_sup100 = sum(1 for size in sizes if size > 100)
        print(
            "....level %d: %d communities [%d with size > 10, %d with size > 100], modularity Q=%1.6f"
            % (level, nb_comm, size_sup10, size_sup100, mod))

    ##############################
    ## WHICH EXTRACTION ?
    print("..CC communities extraction")
    #
    confirm = 'n'
    level = len(dendogram) - 1
    thr = 0
    while confirm != 'y':
        part = community.partition_at_level(dendogram, level)
        kept = [
            len(nodes) for nodes in _cc_group_by_community(part).values()
            if len(nodes) > thr
        ]
        n_sup_thr = sum(kept)
        size_sup_thr = len(kept)
        print(
            "....Extraction of level %d CC communities with size > %d\n......(%d articles gathered in %d communities):"
            % (level, thr, n_sup_thr, size_sup_thr))
        confirm = raw_input("....do you confirm? (y/n): ")
        if confirm == 'n':
            level = _cc_ask_int("......level you want to extract:")
            # an out-of-range level would make partition_at_level fail
            while not 0 <= level < len(dendogram):
                print("......level must be between 0 and %d" %
                      (len(dendogram) - 1))
                level = _cc_ask_int("......level you want to extract:")
            thr = _cc_ask_int("......keep communities of size > to:")

    #... partition
    partition = community.partition_at_level(dendogram, level)
    list_nodes = _cc_group_by_community(partition)

    #############################
    # sub-community partition
    _cc_ensure_dir(os.path.join(out_dir, 'SubGraph/Plot'))
    _cc_ensure_dir(os.path.join(out_dir, 'SubGraph/Gephi'))
    subcomm = dict()
    for com in list_nodes:
        # plot SubGraph for each community
        if verbose: print("....plot SubGraph for community %d" % (com))
        subG = nx.subgraph(G, list_nodes[com])
        nx.draw_spring(subG)
        dst = os.path.join(out_dir, 'SubGraph/Plot/SubGraph-%d.png' % (com))
        plt.savefig(dst)
        plt.close('all')

        # partition
        if verbose: print("....sub clustering for community %d" % (com))
        part = community.best_partition(subG)

        # basic descriptive statistics
        sub_size = len(subG.nodes())
        nb_comm = len(set(part.values()))
        subcomm[com] = {'nb_comm': nb_comm, 'size': sub_size}
        mod = community.modularity(part, subG)

        # record each node's sub community id
        for refid in part:
            ref_index[refid]['SubCommID'] = part[refid]
            ref_index[refid]['modularity'] = mod

        if verbose:
            print("......comm_size:%d, nb_comm:%d, modularity:%1.6f" %
                  (sub_size, nb_comm, mod))

        # output gephi files
        if verbose:
            print("......generate gephi files for sub-community %d" % (com))
        name = "SubGraph/Gephi/SubCCnetwork%d(ccthr=%d, thr=%d, ref_journal_flag=%s).gdf" % (
            com, ccthr, thr, ref_journal_flag)
        dst = os.path.join(out_dir, name)
        with open(dst, 'w') as f_gephi:
            # nodes
            f_gephi.write(_CC_GDF_NODE_HEADER)
            for refid in part:
                _cc_write_gdf_node(f_gephi, refid, com, ref_index[refid],
                                   nA[refid])
            # edges: every co-cited pair inside the community, whatever its
            # count. Walk CC_table rows instead of testing all node pairs;
            # the set of rows written is unchanged, only their order.
            f_gephi.write(_CC_GDF_EDGE_HEADER)
            members = set(part)
            for i in part:
                for j in CC_table.get(i, ()):
                    if i < j and j in members:
                        f_gephi.write(
                            "\n%d,%d,%f,%d" %
                            (i, j, _cc_weight(CC_table, nA, i, j),
                             CC_table[i][j]))

    #.. comm_size, sorted by decreasing size (stable for equal sizes)
    comm_size = dict((com, len(nodes)) for com, nodes in list_nodes.items())
    Lcomm_size = sorted(
        comm_size.items(), key=lambda item: item[1], reverse=True)

    ##############################
    # Research Base CSV files
    if verbose: print("..Research Base CSV files generating")
    filename = os.path.join(out_dir, "ResearchBase.dat")
    with open(filename, "w") as f_out:
        # header line
        f_out.write(
            "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
            % ('CommunityID', 'SubCommunityID', 'Modularity', 'Topic',
               'SubTopic', 'RefID', 'Volume', 'Page', 'Lable', 'Title',
               'Keywords', 'firstAU', 'Journal', 'Year', 'Citation', 'DOI'))
        for com, _size in Lcomm_size:
            for ref in list_nodes[com]:
                entry = ref_index[ref]
                # Topic, SubTopic, Title and Keywords are left empty here.
                f_out.write(
                    "%s\t%s\t%1.6f\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
                    % (str(com), str(entry['SubCommID']), entry['modularity'],
                       '', '', str(ref), str(entry['volume']),
                       str(entry['page']), _cc_label(entry), '', '',
                       entry['firstAU'], entry['journal'], str(entry['year']),
                       str(nA[ref]), entry['doi']))
    if verbose: print("..Done!\n")

    ##############################
    # Research Front CSV files
    if verbose: print("..Research Front CSV files generating")
    filename = os.path.join(out_dir, "ResearchFront.dat")
    with open(filename, "w") as f_out:
        # header line
        f_out.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %
                    ('CommunityID', 'SubCommunityID', 'RefID', 'DocID',
                     'DocLable', 'Title', 'Year', 'Citation', 'DE-Keywords',
                     'ID-Keywords', 'Abstract'))
        for com, _size in Lcomm_size:
            for ref in list_nodes[com]:
                for doc in ref_index[ref]['article']:
                    article = doc_table[doc]
                    f_out.write(
                        "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %
                        (str(com), str(ref_index[ref]['SubCommID']), str(ref),
                         str(doc), _cc_label(article), article['title'],
                         str(article['year']), str(article['citation']),
                         article['de_keywords'], article['id_keywords'],
                         article['abstract']))
    if verbose: print("..Done!\n")

    #############################
    # OUTPUT GEPHI FILES
    ##... output the CC networks?
    confirm = raw_input(
        "..There are %d articles in the CC network.\n....do you want to create a gephi file with the CC networks at the articles level? (y/n): "
        % (len(G.nodes())))
    if confirm == 'y':
        ## ... ini
        name = "CCnetwork(ccthr=%d, thr=%d, ref_journal_flag=%s).gdf" % (
            ccthr, thr, ref_journal_flag)
        dst = os.path.join(out_dir, name)
        with open(dst, 'w') as f_gephi:
            ## ... prep nodes
            if verbose: print("....nodes")
            f_gephi.write(_CC_GDF_NODE_HEADER)
            for refid in ref_index:
                if refid in partition:
                    CCcom = partition[refid]
                    if comm_size[CCcom] > thr:
                        _cc_write_gdf_node(f_gephi, refid, CCcom,
                                           ref_index[refid], nA[refid])
            ## ... prep edges
            if verbose: print("....edges")
            f_gephi.write(_CC_GDF_EDGE_HEADER)
            for i in CC_table:
                for j in CC_table[i]:
                    if (i < j) and (i in partition) and (j in partition):
                        commi_size = comm_size[partition[i]]
                        commj_size = comm_size[partition[j]]
                        if (commi_size > thr) and (commj_size > thr):
                            f_gephi.write(
                                "\n%d,%d,%f,%d" %
                                (i, j, _cc_weight(CC_table, nA, i, j),
                                 CC_table[i][j]))
        ## ... end
        if verbose: print("..Done!\n")

    ##############################
    # Main Community Characteristics file
    comm_type = "main"
    # NOTE(review): `label` used to be unbound (NameError) when the user
    # declined this step but accepted the sub-community step below. Confirm
    # that report.community_characteristics accepts label=None.
    label = None
    confirm = raw_input(
        "..Do you want to extract the characteristics for main communitise? \n Confirm (y/n):"
    )
    if confirm == 'y':
        label = report.community_characteristics(
            in_dir, out_dir, comm_type, ccthr, thr, ref_journal_flag, G,
            level, partition, list_nodes, art_table, verbose)

    ##############################
    # Sub Community Characteristics files
    if verbose: print("..Sub Computing communities caracteristics")
    confirm = raw_input(
        "..Do you want to extract the characteristics for sub communitise? \n Confirm (y/n):"
    )
    if confirm == 'y':
        sub_label = dict()
        for com in list_nodes:
            comm_type = str(com)
            subG = nx.subgraph(G, list_nodes[com])
            level = len(community.generate_dendogram(subG)) - 1
            sub_partition = community.best_partition(subG)
            # Group every reference under its sub-community. The previous
            # if/else created the list without appending, so the first
            # reference of each sub-community was lost.
            sub_list_nodes = _cc_group_by_community(sub_partition)
            sub_label[com] = report.community_characteristics(
                in_dir, out_dir, comm_type, ccthr, thr, ref_journal_flag,
                subG, level, sub_partition, sub_list_nodes, art_table,
                verbose, label)

    ##############################
    # Community Characteristics PDF generation
    confirm = raw_input(
        "..Do you want to generate the pdf files of characteristics for communitise? \n Confirm (y/n):"
    )
    if confirm == 'y':
        report.latex(os.path.join(out_dir, "Report"), verbose)

    ## ###################################
    ## END
    return
type = str(com) subG = nx.subgraph(G, list_nodes[com]) level = len(community.generate_dendogram(subG)) - 1 sub_partition = community.best_partition(subG) sub_list_nodes = dict() for ref in sub_partition: sub_comm = sub_partition[ref] if sub_comm not in sub_list_nodes: sub_list_nodes[sub_comm] = [] sub_list_nodes[sub_comm].append(ref) sub_label[com] = report.community_characteristics(in_dir,out_dir,type,ccthr,thr,ref_journal_flag,subG,level,sub_partition,sub_list_nodes,art_table,doc_table,ref_index,verbose,label) ############################## # Community Characteristics PDF generation confirm = raw_input("..Do you want to generate the pdf files of characteristics for communitise? \n Confirm (y/n):") if confirm == 'y': report.latex(os.path.join(out_dir, "Report"), verbose) ## ################################### ## END return ## ################################################## def cmpval(x,y): if x[1]>y[1]: return -1 elif x[1]==y[1]: return 0 else: