import json
import sys

import numpy as np

# NOTE: Graph is assumed to be provided elsewhere in the project; an
# illustrative sketch of the interface these snippets expect follows
# after this block.

# Number of companies to process; fall back to 504 when no count is
# passed on the command line.
try:
    num = int(sys.argv[2])
except IndexError:
    num = 504

# links.json (the first command-line argument) maps each company to the
# Wikipedia links going out of that company's page.
with open(sys.argv[1]) as data_file:
    data = json.load(data_file)
print(len(data))

g = Graph()
companies = list(data.keys())[:num]
for company in companies:
    g.add_nodes(company, data[company])
g.link_all_nodes()

# Raw linkage counts: entry (i, j) is the number of links shared by
# companies i and j.
matrix = []
for i in range(num):
    matrix.append([])
    print(str(i) + "\t" + companies[i])
    for j in range(num):
        a = g.return_links(companies[i], companies[j])
        if a is None:
            a = []
        matrix[i].append(len(a))
mat = np.array(matrix, dtype=float)
print(mat)
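# --- Illustrative sketch only. The Graph class is not defined in this
# file; the minimal stand-in below shows the interface the code above
# assumes (add_nodes, link_all_nodes, return_links). Its internals are
# an assumption, not the project's actual implementation. Note that
# return_links(x, x) yields every link on x's own page, which is what
# the mat[i][i] self-count relies on.
class Graph(object):
    def __init__(self):
        self.nodes = {}  # company name -> set of outgoing links
        self.links = {}  # (company_a, company_b) -> shared links

    def add_nodes(self, company, links):
        # Record a company's outgoing Wikipedia links.
        self.nodes[company] = set(links)

    def link_all_nodes(self):
        # Connect every pair of companies through the links their pages
        # share (direct string equality, no NLP).
        names = list(self.nodes)
        for a in names:
            for b in names:
                common = self.nodes[a] & self.nodes[b]
                if common:
                    self.links[(a, b)] = list(common)

    def return_links(self, a, b):
        # Shared links for a pair, or None when the pages share nothing.
        return self.links.get((a, b))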
def get_linkage_matrix(nameOfFile, size=504):
    """Build a normalized company-linkage matrix from a links JSON file."""
    # size defaults to 504 companies when the caller passes nothing.
    num = size
    matrix = []

    # Collect the data from links.json. This file essentially holds all
    # the Wikipedia links coming out of a particular company's page.
    with open(nameOfFile) as data_file:
        data = json.load(data_file)

    g = Graph()
    companies = list(data.keys())[:num]
    # Create one node per company.
    for company in companies:
        g.add_nodes(company, data[company])
    # Link the nodes through their common outgoing links. Here we check
    # for direct equivalence, unlike the industry graph, where semantics
    # are taken into account using NLP techniques.
    g.link_all_nodes()

    # Raw counts: entry (i, j) is the number of links shared by
    # companies i and j.
    for i in range(num):
        matrix.append([])
        for j in range(num):
            a = g.return_links(companies[i], companies[j])
            if a is None:
                a = []
            matrix[i].append(len(a))
    mat = np.array(matrix, dtype=float)

    # Allot the scores in proportion to each company's own link count
    # (the diagonal entry mat[i][i]).
    for i in range(num):
        for j in range(num):
            if i != j and mat[i][i] != 0:
                mat[i][j] /= mat[i][i]

    # Normalize a row in place so its off-diagonal entries sum to 1;
    # the diagonal entry is skipped.
    def norm(array, identity_index, arr_len):
        length = 0
        for i in range(arr_len):
            if i == identity_index:
                continue
            length += array[i]
        if length == 0:
            return array
        for i in range(arr_len):
            if i == identity_index:
                continue
            array[i] = array[i] / length
        return array

    # Insert 1's on the diagonal, then normalize each row.
    for i in range(num):
        mat[i][i] = 1
        mat[i] = norm(mat[i], i, num)

    # Scale the matrix down by a constant factor so scores cap at 0.6.
    mat = np.multiply(mat, 0.6)
    return mat
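# --- Usage sketch, under the assumption that links.json maps company
# names to lists of outgoing Wikipedia links; the file name and the
# size of 10 are placeholders.
if __name__ == "__main__":
    linkage = get_linkage_matrix("links.json", 10)
    print(linkage.shape)   # (10, 10)
    print(linkage[0][0])   # 0.6: the diagonal 1's scaled by the 0.6 factor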