def experiment1(variables): exec "" locals().update(variables) from code.RelationalModels.node2vecLR import node2vecLR startTime = time.time() G = readData.readDataset(dataFolder, fName, no0Deg=True) GFirst = G retDict = {} rest, validationNodes = readData.readTrial(fName, i, percentValidation) #prune out nodes that don't exist in GFirst rest = graph_helper.prune0s(GFirst, rest) validationNodes = graph_helper.prune0s(GFirst, validationNodes) #split into folds folds = readData.splitNodeFolds(rest, numFolds) #vary training set foldsStart = time.time() #m is index into array, j is fold index. they are the same if we are looping over all folds #add up trainNodes trainNodes = [] for k in range(0, j + 1): trainNodes += folds[k] #add up rest of nodes testNodes = [] for k in range(j + 1, numFolds): testNodes += folds[k] actual_save_path = save_path + "_trial_" + str(i) + "_fold_" + str(j) lr = node2vecLR(dataFolder + fName, trainNodes, validationNodes, testNodes) lr.train() accuracyTrain = lr.predictBAE(testSet="train") accuracyValid = lr.predictBAE(testSet="valid") accuracyTest = lr.predictBAE(testSet="test") retDict['accuracyTrain'] = accuracyTrain retDict['accuracyValid'] = accuracyValid retDict['accuracyTest'] = accuracyTest np.save(save_path + "_BAE_Test", np.array(accuracyTest)) np.save(save_path + "_BAE_Tra", np.array(accuracyTrain)) np.save(save_path + "_BAE_Val", np.array(accuracyValid)) elapsed = time.time() - startTime print("trial: " + str(i) + ", fold: " + str(j) + ", time: " + str(elapsed)) return retDict
def experiment1(variables):
    """Run one trial/fold of the relational RNN experiment and collect scores.

    Python 2 only (uses the ``exec`` statement).

    Every configuration value used below (``gpu``, ``dataFolder``, ``fName``,
    ``i``, ``j``, ``numFolds``, ``netType``, ``randInit``, ``lastH``, ...) is
    read as a bare name that this function never assigns; they are expected to
    arrive through ``variables`` or to exist as module globals.

    Flow, as visible in the body:
      1. read the dataset and the trial's node split, prune nodes, build folds;
      2. folds 0..j become the training nodes, the remaining folds the test nodes;
      3. optionally reload the graph with personalised-PageRank neighbour data;
      4. unless ``randInit`` is set, build the model selected by ``netType``,
         train it, predict on train / valid / valid2 / test and save the
         predictions and scores under ``save_path + "_trial_<i>_fold_<j>"``;
         with ``randInit`` only ``graph_helper.setLabels`` is called;
      5. call ``trainRnnCollective`` with every global and local as keyword
         arguments and record its test score.

    Args:
        variables: mapping of name -> value merged into the local namespace.

    Returns:
        dict containing ``accuracyTest_C`` plus ``accuracyTrain``,
        ``accuracyValid`` and ``accuracyTest`` (and ``accuracyValid2`` when
        ``randInit`` is false). With ``randInit`` the train/valid/test entries
        are taken from the ``best`` dict returned by ``trainRnnCollective``.
    """
    #this is so that we can pass in all vars and update all locals to these vars
    # (Python 2 idiom: the bare exec statement disables the fast-locals
    # optimisation so that locals().update() really creates local names.)
    exec ""
    locals().update(variables)
    if gpu != "cpu":
        import theano.sandbox.cuda
        theano.sandbox.cuda.use(gpu)
    from code.RelationalModels.RelationalLSTM import RelationalLSTM
    from code.RelationalModels.RelationalLSTM_2 import RelationalLSTM_2
    from code.RelationalModels.RelationalRNNwMini import RelationalRNNwMini
    from code.RelationalModels.RelationalLSTMwMini import RelationalLSTMwMini
    from code.RelationalModels.RelationalLRAVG import RelationalLRAVG
    from blocks.filter import VariableFilter
    from blocks.roles import PARAMETER
    startTime = time.time()
    G = readData.readDataset(dataFolder,
                             fName,
                             sampleAttrs=sampleAttrs,
                             averageNeighborAttr=avgNeighb,
                             degree=degree,
                             neighbDegree=neighbDegree,
                             localClustering=localClustering,
                             no0Deg=no0Deg)
    # Keep a handle on the first graph; G may be replaced further down.
    GFirst = G
    retDict = {}
    lastHs = None
    PPRs = None
    #read trial from file
    nodeData = readData.readTrial(dataFolder, fName, i, percentValidation, changeTrainValid)
    validationNodes2 = []
    # Only for changeTrainValid > 3 is a third element (second validation set)
    # taken from nodeData; otherwise validationNodes2 stays empty.
    if changeTrainValid > 3:
        rest = nodeData[0]
        validationNodes = nodeData[1]
        validationNodes2 = nodeData[2]
    else:
        rest = nodeData[0]
        validationNodes = nodeData[1]
    #prune out nodes that don't exist in GFirst
    rest = graph_helper.prune0s(GFirst, rest)
    validationNodes = graph_helper.prune0s(GFirst, validationNodes)
    validationNodes2 = graph_helper.prune0s(GFirst, validationNodes2)
    #split into folds
    folds = readData.splitNodeFolds(rest, numFolds)
    #vary training set
    foldsStart = time.time()
    #m is index into array, j is fold index. they are the same if we are looping over all folds
    #add up trainNodes
    trainNodes = []
    for k in range(0, j + 1):
        trainNodes += folds[k]
    #add up rest of nodes
    testNodes = []
    for k in range(j + 1, numFolds):
        testNodes += folds[k]
    #if we don't want to partition into traditional validation set
    #we simply set train set to validation set
    if noValid:
        # Pool train + validation, shuffle, then re-split: the first 40% of
        # the pool becomes the training set, the remainder the validation set.
        trainAll = trainNodes + validationNodes
        shuffle(trainAll)
        totalTrain = int(len(trainAll) * 0.4)
        trainNodes = trainAll[:totalTrain]
        validationNodes = trainAll[totalTrain:]
    # if we are doing PPR, then change G to the one associated with individual folds/trial
    # otherwise G is always the same
    if pageRankOrder == "for" or pageRankOrder == "back":
        # The pickle file name substitutes "amazon_Music_7500" for
        # "amazon_Music_64500" in fName and uses the trial index modulo 10.
        PPRs = pickle.load(
            open(
                dataFolder + fName.replace("amazon_Music_64500", "amazon_Music_7500") + "_10pr_" +
                PRType + "_trial_" + str(i % 10) + "_fold_" + str(j) + ".p", 'rb'))
        G = readData.readDataset(dataFolder,
                                 fName,
                                 sampleAttrs=sampleAttrs,
                                 averageNeighborAttr=avgNeighb,
                                 degree=False,
                                 neighbDegree=neighbDegree,
                                 localClustering=localClustering,
                                 pageRankOrder=pageRankOrder,
                                 PPRs=PPRs,
                                 maxNeighbors=maxNeighbors,
                                 bias_self=bias_self,
                                 trainNodes=trainNodes + validationNodes,
                                 testNodes=testNodes,
                                 testLimit=testLimit,
                                 no0Deg=no0Deg)
        # The reload above passes degree=False, so the 'degree' attribute is
        # copied over from the first graph when it was requested.
        # NOTE(review): nesting under the PPR branch is reconstructed from
        # context (outside it GFirst and G are the same object) - confirm.
        if degree:
            graph_helper.transferAttr(GFirst, G, 'degree')
    actual_save_path = save_path + "_trial_" + str(i) + "_fold_" + str(j)
    if not randInit:
        # Build the model selected by netType.
        if netType == "LSTM":
            rnn = RelationalLSTM(G,
                                 trainNodes,
                                 validationNodes,
                                 dim=memory,
                                 batch_size=batch_size,
                                 num_epochs=num_epochs,
                                 save_path=actual_save_path,
                                 max_epochs=max_epochs,
                                 maxNeighbors=maxNeighbors,
                                 attrKey=attr1,
                                 debug=debug,
                                 usePrevWeights=usePrevWeights,
                                 epsilon=epsilon,
                                 pageRankOrder=pageRankOrder,
                                 batchesInferences=batchesInferences,
                                 usePro=usePro)
        elif netType == "LSTM2":
            rnn = RelationalLSTM_2(G,
                                   trainNodes,
                                   validationNodes,
                                   dim=memory,
                                   summary_dim=memory,
                                   batch_size=batch_size,
                                   num_epochs=num_epochs,
                                   save_path=actual_save_path,
                                   max_epochs=max_epochs,
                                   maxNeighbors=maxNeighbors,
                                   attrKey=attr1,
                                   debug=debug,
                                   usePrevWeights=usePrevWeights,
                                   epsilon=epsilon,
                                   pageRankOrder=pageRankOrder,
                                   batchesInferences=batchesInferences,
                                   usePro=usePro)
        elif netType == "RNNwMini":
            rnn = RelationalRNNwMini(G,
                                     trainNodes,
                                     validationNodes,
                                     perturb=perturb,
                                     dim=memory,
                                     mini_dim=mini_dim,
                                     summary_dim=memory + mini_dim,
                                     batch_size=batch_size,
                                     num_epochs=num_epochs,
                                     save_path=actual_save_path,
                                     max_epochs=max_epochs,
                                     maxNeighbors=maxNeighbors,
                                     attrKey=attr1,
                                     debug=debug,
                                     usePrevWeights=usePrevWeights,
                                     epsilon=epsilon,
                                     pageRankOrder=pageRankOrder,
                                     batchesInferences=batchesInferences)
        elif netType == "LSTMwMini":
            rnn = RelationalLSTMwMini(G,
                                      trainNodes,
                                      validationNodes,
                                      perturb=perturb,
                                      dim=memory,
                                      mini_dim=mini_dim,
                                      summary_dim=memory + mini_dim,
                                      batch_size=batch_size,
                                      num_epochs=num_epochs,
                                      save_path=actual_save_path,
                                      max_epochs=max_epochs,
                                      maxNeighbors=maxNeighbors,
                                      maxNeighbors2=maxNeighbors2,
                                      attrKey=attr1,
                                      debug=debug,
                                      usePrevWeights=usePrevWeights,
                                      epsilon=epsilon,
                                      pageRankOrder=pageRankOrder,
                                      batchesInferences=batchesInferences)
        elif "LRAVG" in netType:
            # The text left after removing "LRAVG" from netType is forwarded
            # as the model's own netType argument.
            rnn = RelationalLRAVG(G,
                                  netType=netType.replace("LRAVG", ""),
                                  trainNodes=trainNodes,
                                  validationNodes=validationNodes,
                                  testNodes=testNodes)
        # NOTE(review): if netType matches none of the branches above, rnn is
        # never bound and the next line raises NameError.
        rnn.train()
        if lastH:
            lastHs = rnn.generateHidden("train")
            lastHs.update(rnn.generateHidden("valid"))
        #DON'T dynamically change test nodes labels
        # changeLabel is True only when changeTrainValid <= -1.
        accuracyTrain, curPredsTrain = rnn.makePredictions(
            trainNodes,
            maxNeighbors,
            changeLabel=False if changeTrainValid > -1 else True,
            lastH=False)
        retDict['accuracyTrain'] = accuracyTrain
        #DON'T dynamically change test nodes labels
        accuracyValid, curPredsValid = rnn.makePredictions(
            validationNodes,
            maxNeighbors,
            changeLabel=False if changeTrainValid > -1 else True,
            lastH=False)
        retDict['accuracyValid'] = accuracyValid
        accuracyValid2, curPredsValid2 = rnn.makePredictions(validationNodes2,
                                                             maxNeighbors,
                                                             lastH=False)
        retDict['accuracyValid2'] = accuracyValid2
        #dynamically change test nodes labels
        if lastH:
            # With lastH=True makePredictions is unpacked into three values,
            # the third being merged into lastHs.
            if "swap" in dataAug:
                #iterate through all nodes to get hidden states
                tempT, tempPred, hiddenRepT = rnn.makePredictions(
                    trainNodes, maxNeighbors, changeLabel=False, lastH=True)
                tempV, tempPred, hiddenRepV = rnn.makePredictions(
                    validationNodes, maxNeighbors, changeLabel=False, lastH=True)
                lastHs.update(hiddenRepT)
                lastHs.update(hiddenRepV)
            accuracyTest, curPredsTest, hiddenRep = rnn.makePredictions(
                testNodes, maxNeighbors, lastH=True)
            lastHs.update(hiddenRep)
        else:
            accuracyTest, curPredsTest = rnn.makePredictions(testNodes, maxNeighbors, lastH=False)
        retDict['accuracyTest'] = accuracyTest
        #save the actual predictions
        # (.items() is a list under Python 2, which np.array turns into an
        # array of pairs.)
        np.save(actual_save_path + "_pre_Tra", np.array(curPredsTrain.items()))
        np.save(actual_save_path + "_pre_Val", np.array(curPredsValid.items()))
        np.save(actual_save_path + "_pre_Val2", np.array(curPredsValid2.items()))
        np.save(actual_save_path + "_pre_Test", np.array(curPredsTest.items()))
        np.save(actual_save_path + "_BAE_Test", np.array(accuracyTest))
        np.save(actual_save_path + "_BAE_Tra", np.array(accuracyTrain))
        np.save(actual_save_path + "_BAE_Val", np.array(accuracyValid))
        np.save(actual_save_path + "_BAE_Val2", np.array(accuracyValid2))
        print("BAE_Tra: " + str(accuracyTrain))
        print("BAE_Val: " + str(accuracyValid))
        print("BAE_Val2: " + str(accuracyValid2))
        print("BAE_Test: " + str(accuracyTest))
        computeAccuracies(G, curPredsTrain, actual_save_path, "Tra")
        computeAccuracies(G, curPredsValid, actual_save_path, "Val")
        computeAccuracies(G, curPredsValid2, actual_save_path, "Val2")
        computeAccuracies(G, curPredsTest, actual_save_path, "Test")
    else:
        # randInit: no model is trained here; only the labels are set on G.
        graph_helper.setLabels(G, trainNodes, validationNodes + validationNodes2, testNodes,
                               changeTrainValid)
    #also dynamically change validation nodes if we desire
    #rnn.makePredictions(validationNodes, maxNeighbors)
    # Every module global and every local bound above is handed to
    # trainRnnCollective as a keyword argument (locals win on a name clash),
    # so renaming, adding or removing a local in this function changes what
    # trainRnnCollective receives.
    localsCopy = globals().copy()
    localsCopy.update(locals())
    #localsCopy = locals().copy()
    test_bae, rnn2, best = trainRnnCollective(**localsCopy)
    #if randInit, replace with actual collective performance
    if randInit:
        retDict['accuracyTrain'] = best['Train_acc']
        retDict['accuracyValid'] = best['Valid_acc']
        retDict['accuracyTest'] = best['Test_acc']
    # NOTE(review): the two lines below are placed outside the randInit branch
    # (reconstructed from context) so accuracyTest_C is always recorded.
    print("test_bae: " + str(test_bae))
    retDict['accuracyTest_C'] = test_bae
    elapsed = time.time() - startTime
    print("trial: " + str(i) + ", fold: " + str(j) + ", time: " + str(elapsed))
    return retDict
import code.readData.readData as readData


def printData(data, fileName):
    """Write every item of ``data`` to ``fileName + ".txt"``, one per line.

    Args:
        data: iterable of items; each is written as ``str(item)``.
        fileName: output path without the ``.txt`` extension. An existing
            file is overwritten.
    """
    # The with-block closes (and therefore flushes) the file even when a
    # write raises, instead of leaving the handle to the garbage collector.
    with open(fileName + ".txt", 'w') as out:
        for node in data:
            out.write(str(node) + "\n")


# Export the validation set and each fold of every trial as plain text files
# next to the data set.
trials = 10
#percentValidation=0.15
percentValidation = 0.1
numFolds = 9
dataSets = ["facebook_oneday_filtered"]
for fName in dataSets:
    for i in range(0, trials):
        #read trial from file
        rest, validationNodes = readData.readTrial("../experiments/data/", fName, i,
                                                   percentValidation)
        printData(validationNodes,
                  "../experiments/data/" + fName + "_trial_" + str(i) + "_val")
        #split into folds
        folds = readData.splitNodeFolds(rest, numFolds)
        for j, fold in enumerate(folds):
            printData(fold,
                      "../experiments/data/" + fName + "_trial_" + str(i) + "_fold_" + str(j))
def _dumpPickle(obj, path):
    """Pickle ``obj`` to ``path`` and close the file before returning."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)


def savePPRtype(fName, dataFolder, trial, fold, prType, debug=True):
    """Compute personalised PageRank for one trial/fold and pickle the result.

    The trial is split into 17 folds; folds 0..``fold`` plus the validation
    nodes are passed to ``computePersonalizedPR`` as training nodes and the
    remaining folds as test nodes. The computation runs on a deep copy of the
    graph.

    Args:
        fName: data set name; also used as the output file prefix.
        dataFolder: folder prefix for reading the data set and writing pickles.
        trial: trial index.
        fold: index of the last fold that belongs to the training set.
        prType: ``'pos'`` or ``'neg'`` (passed on as ``label``; both the full
            and the top-10 result are saved), or ``'neutral'`` (no ``label``;
            only the top-10 result is saved). Any other value computes and
            saves nothing.
        debug: forwarded to ``computePersonalizedPR``.

    Side effects:
        Writes ``<dataFolder><fName>_fullpr_<prType>_trial_<t>_fold_<f>.p``
        (pos/neg only) and ``<dataFolder><fName>_10pr_<prType>_trial_<t>_fold_<f>.p``,
        and prints the time spent on the fold.
    """
    numFolds = 17
    Gorig = readData.readDataset(dataFolder, fName)
    rest, validationNodes = readData.readTrial(fName, trial, 0.15)
    folds = readData.splitNodeFolds(rest, numFolds)
    startFold = time.time()

    trainNodes = []
    for k in range(0, fold + 1):
        trainNodes += folds[k]
    trainNodes += validationNodes

    testNodes = []
    for k in range(fold + 1, numFolds):
        testNodes += folds[k]

    #copy and then run page rank
    G = copy.deepcopy(Gorig)
    suffix = "_trial_" + str(trial) + "_fold_" + str(fold) + ".p"
    if prType == 'pos' or prType == 'neg':
        pr, top10pr = computePersonalizedPR(G, trainNodes, testNodes, label=prType, debug=debug)
        _dumpPickle(pr, dataFolder + fName + "_fullpr_" + prType + suffix)
        _dumpPickle(top10pr, dataFolder + fName + "_10pr_" + prType + suffix)
    elif prType == 'neutral':
        pr, top10pr = computePersonalizedPR(G, trainNodes, testNodes, debug=debug)
        # The full neutral PageRank result is deliberately not saved.
        _dumpPickle(top10pr, dataFolder + fName + "_10pr_neutral" + suffix)

    endFold = time.time()
    print("Trial " + str(trial) + " Fold " + str(fold) + ": " + str(endFold - startFold))
def _l2Distance(keys, first, second):
    """Return the Euclidean distance between ``first`` and ``second`` over ``keys``."""
    total = 0.0
    for key in keys:
        total += math.pow(first[key] - second[key], 2)
    return math.sqrt(total)


def unitTest1():
    """Ad-hoc check of ``computePersonalizedPR`` on the "facebook" data set.

    Uses trial 0 split into 10 folds: the first 8 folds plus the validation
    nodes are the training nodes, and the first 10 nodes of the remaining
    folds are the test nodes. Personalised PageRank is computed with
    ``label='pos'``, ``label='neg'``, no label and ``label='similar'``, each
    on its own deep copy of the graph. For every node in the 'pos' result the
    L2 distances pos-neutral, neg-neutral, pos-neg and pos-similar are
    printed. Finally the 'pos' result is pickled to ``data/`` and loaded back.

    Each ``computePersonalizedPR`` result is used here as a mapping
    ``node -> {key: score}``.
    """
    fName = "facebook"
    numFolds = 10
    maxFolds = 8
    trial = 0
    Gorig = readData.readDataset("data/" + fName + ".edges", "data/" + fName + ".attr",
                                 "data/" + fName + ".lab")
    rest, validationNodes = readData.readTrial(fName, trial, 0.15)
    folds = readData.splitNodeFolds(rest, numFolds)

    trainNodes = []
    for k in range(0, maxFolds):
        trainNodes += folds[k]
    trainNodes += validationNodes

    testNodes = []
    for k in range(maxFolds, numFolds):
        testNodes += folds[k]
    # Only a handful of test nodes are needed for this check.
    testNodes = testNodes[0:10]

    # Each variant gets its own deep copy of the graph.
    Gpos = copy.deepcopy(Gorig)
    testPRpos = computePersonalizedPR(Gpos, trainNodes, testNodes, label='pos')
    Gneg = copy.deepcopy(Gorig)
    testPRneg = computePersonalizedPR(Gneg, trainNodes, testNodes, label='neg')
    Gnn = copy.deepcopy(Gorig)
    testPRneutral = computePersonalizedPR(Gnn, trainNodes, testNodes)
    Gsimilar = copy.deepcopy(Gorig)
    testPRsimilar = computePersonalizedPR(Gsimilar, trainNodes, testNodes, label='similar')

    for node in testPRpos.keys():
        # All four distances are taken over the keys of the 'pos' result.
        keys = testPRpos[node].keys()
        norm2pneut = _l2Distance(keys, testPRpos[node], testPRneutral[node])
        norm2nn = _l2Distance(keys, testPRneg[node], testPRneutral[node])
        norm2pneg = _l2Distance(keys, testPRpos[node], testPRneg[node])
        norm2psimilar = _l2Distance(keys, testPRpos[node], testPRsimilar[node])
        print("norm2pneut: " + str(norm2pneut))
        print("norm2nn: " + str(norm2nn))
        print("norm2pneg: " + str(norm2pneg))
        print("norm2psimilar: " + str(norm2psimilar))

    # Round-trip the 'pos' result through pickle. The write handle is closed
    # before the file is reopened so the read always sees the complete data.
    picklePath = "data/" + fName + "_pos_trial_" + str(trial) + "_fold_" + str(maxFolds) + ".p"
    with open(picklePath, "wb") as handle:
        pickle.dump(testPRpos, handle)
    with open(picklePath, "rb") as handle:
        testPRpos2 = pickle.load(handle)
    print("here")