Ejemplo n.º 1
0
def experiment1(variables):
    exec ""
    locals().update(variables)
    from code.RelationalModels.node2vecLR import node2vecLR
    startTime = time.time()
    G = readData.readDataset(dataFolder, fName, no0Deg=True)
    GFirst = G
    retDict = {}

    rest, validationNodes = readData.readTrial(fName, i, percentValidation)

    #prune out nodes that don't exist in GFirst
    rest = graph_helper.prune0s(GFirst, rest)
    validationNodes = graph_helper.prune0s(GFirst, validationNodes)

    #split into folds
    folds = readData.splitNodeFolds(rest, numFolds)

    #vary training set
    foldsStart = time.time()
    #m is index into array, j is fold index. they are the same if we are looping over all folds

    #add up trainNodes
    trainNodes = []
    for k in range(0, j + 1):
        trainNodes += folds[k]

    #add up rest of nodes
    testNodes = []
    for k in range(j + 1, numFolds):
        testNodes += folds[k]

    actual_save_path = save_path + "_trial_" + str(i) + "_fold_" + str(j)

    lr = node2vecLR(dataFolder + fName, trainNodes, validationNodes, testNodes)
    lr.train()
    accuracyTrain = lr.predictBAE(testSet="train")
    accuracyValid = lr.predictBAE(testSet="valid")
    accuracyTest = lr.predictBAE(testSet="test")
    retDict['accuracyTrain'] = accuracyTrain
    retDict['accuracyValid'] = accuracyValid
    retDict['accuracyTest'] = accuracyTest

    np.save(save_path + "_BAE_Test", np.array(accuracyTest))
    np.save(save_path + "_BAE_Tra", np.array(accuracyTrain))
    np.save(save_path + "_BAE_Val", np.array(accuracyValid))

    elapsed = time.time() - startTime
    print("trial: " + str(i) + ", fold: " + str(j) + ", time: " + str(elapsed))
    return retDict
Ejemplo n.º 2
0
def experiment1(variables):

    #this is so that we can pass in all vars and update all locals to these vars
    exec ""
    locals().update(variables)
    if gpu != "cpu":
        import theano.sandbox.cuda
        theano.sandbox.cuda.use(gpu)

    from code.RelationalModels.RelationalLSTM import RelationalLSTM
    from code.RelationalModels.RelationalLSTM_2 import RelationalLSTM_2
    from code.RelationalModels.RelationalRNNwMini import RelationalRNNwMini
    from code.RelationalModels.RelationalLSTMwMini import RelationalLSTMwMini
    from code.RelationalModels.RelationalLRAVG import RelationalLRAVG
    from blocks.filter import VariableFilter
    from blocks.roles import PARAMETER

    startTime = time.time()
    G = readData.readDataset(dataFolder,
                             fName,
                             sampleAttrs=sampleAttrs,
                             averageNeighborAttr=avgNeighb,
                             degree=degree,
                             neighbDegree=neighbDegree,
                             localClustering=localClustering,
                             no0Deg=no0Deg)
    GFirst = G

    retDict = {}

    lastHs = None
    PPRs = None
    #read trial from file

    nodeData = readData.readTrial(dataFolder, fName, i, percentValidation,
                                  changeTrainValid)
    validationNodes2 = []
    if changeTrainValid > 3:
        rest = nodeData[0]
        validationNodes = nodeData[1]
        validationNodes2 = nodeData[2]
    else:
        rest = nodeData[0]
        validationNodes = nodeData[1]

    #prune out nodes that don't exist in GFirst
    rest = graph_helper.prune0s(GFirst, rest)
    validationNodes = graph_helper.prune0s(GFirst, validationNodes)
    validationNodes2 = graph_helper.prune0s(GFirst, validationNodes2)

    #split into folds
    folds = readData.splitNodeFolds(rest, numFolds)

    #vary training set
    foldsStart = time.time()
    #m is index into array, j is fold index. they are the same if we are looping over all folds

    #add up trainNodes
    trainNodes = []
    for k in range(0, j + 1):
        trainNodes += folds[k]

    #add up rest of nodes
    testNodes = []
    for k in range(j + 1, numFolds):
        testNodes += folds[k]

    #if we don't want to partition into traditional validation set
    #we simply set train set to validation set
    if noValid:
        trainAll = trainNodes + validationNodes
        shuffle(trainAll)
        totalTrain = int(len(trainAll) * 0.4)
        trainNodes = trainAll[:totalTrain]
        validationNodes = trainAll[totalTrain:]

    # if we are doing PPR, then change G to the one associated with individual folds/trial
    # otherwise G is always the same
    if pageRankOrder == "for" or pageRankOrder == "back":
        PPRs = pickle.load(
            open(
                dataFolder +
                fName.replace("amazon_Music_64500", "amazon_Music_7500") +
                "_10pr_" + PRType + "_trial_" + str(i % 10) + "_fold_" +
                str(j) + ".p", 'rb'))
        G = readData.readDataset(dataFolder,
                                 fName,
                                 sampleAttrs=sampleAttrs,
                                 averageNeighborAttr=avgNeighb,
                                 degree=False,
                                 neighbDegree=neighbDegree,
                                 localClustering=localClustering,
                                 pageRankOrder=pageRankOrder,
                                 PPRs=PPRs,
                                 maxNeighbors=maxNeighbors,
                                 bias_self=bias_self,
                                 trainNodes=trainNodes + validationNodes,
                                 testNodes=testNodes,
                                 testLimit=testLimit,
                                 no0Deg=no0Deg)
        if degree:
            graph_helper.transferAttr(GFirst, G, 'degree')

    actual_save_path = save_path + "_trial_" + str(i) + "_fold_" + str(j)
    if not randInit:
        if netType == "LSTM":
            rnn = RelationalLSTM(G,
                                 trainNodes,
                                 validationNodes,
                                 dim=memory,
                                 batch_size=batch_size,
                                 num_epochs=num_epochs,
                                 save_path=actual_save_path,
                                 max_epochs=max_epochs,
                                 maxNeighbors=maxNeighbors,
                                 attrKey=attr1,
                                 debug=debug,
                                 usePrevWeights=usePrevWeights,
                                 epsilon=epsilon,
                                 pageRankOrder=pageRankOrder,
                                 batchesInferences=batchesInferences,
                                 usePro=usePro)
        elif netType == "LSTM2":
            rnn = RelationalLSTM_2(G,
                                   trainNodes,
                                   validationNodes,
                                   dim=memory,
                                   summary_dim=memory,
                                   batch_size=batch_size,
                                   num_epochs=num_epochs,
                                   save_path=actual_save_path,
                                   max_epochs=max_epochs,
                                   maxNeighbors=maxNeighbors,
                                   attrKey=attr1,
                                   debug=debug,
                                   usePrevWeights=usePrevWeights,
                                   epsilon=epsilon,
                                   pageRankOrder=pageRankOrder,
                                   batchesInferences=batchesInferences,
                                   usePro=usePro)
        elif netType == "RNNwMini":
            rnn = RelationalRNNwMini(G,
                                     trainNodes,
                                     validationNodes,
                                     perturb=perturb,
                                     dim=memory,
                                     mini_dim=mini_dim,
                                     summary_dim=memory + mini_dim,
                                     batch_size=batch_size,
                                     num_epochs=num_epochs,
                                     save_path=actual_save_path,
                                     max_epochs=max_epochs,
                                     maxNeighbors=maxNeighbors,
                                     attrKey=attr1,
                                     debug=debug,
                                     usePrevWeights=usePrevWeights,
                                     epsilon=epsilon,
                                     pageRankOrder=pageRankOrder,
                                     batchesInferences=batchesInferences)
        elif netType == "LSTMwMini":
            rnn = RelationalLSTMwMini(G,
                                      trainNodes,
                                      validationNodes,
                                      perturb=perturb,
                                      dim=memory,
                                      mini_dim=mini_dim,
                                      summary_dim=memory + mini_dim,
                                      batch_size=batch_size,
                                      num_epochs=num_epochs,
                                      save_path=actual_save_path,
                                      max_epochs=max_epochs,
                                      maxNeighbors=maxNeighbors,
                                      maxNeighbors2=maxNeighbors2,
                                      attrKey=attr1,
                                      debug=debug,
                                      usePrevWeights=usePrevWeights,
                                      epsilon=epsilon,
                                      pageRankOrder=pageRankOrder,
                                      batchesInferences=batchesInferences)
        elif "LRAVG" in netType:
            rnn = RelationalLRAVG(G,
                                  netType=netType.replace("LRAVG", ""),
                                  trainNodes=trainNodes,
                                  validationNodes=validationNodes,
                                  testNodes=testNodes)
        rnn.train()
        if lastH:
            lastHs = rnn.generateHidden("train")
            lastHs.update(rnn.generateHidden("valid"))

        #DON'T dynamically change test nodes labels
        accuracyTrain, curPredsTrain = rnn.makePredictions(
            trainNodes,
            maxNeighbors,
            changeLabel=False if changeTrainValid > -1 else True,
            lastH=False)
        retDict['accuracyTrain'] = accuracyTrain

        #DON'T dynamically change test nodes labels
        accuracyValid, curPredsValid = rnn.makePredictions(
            validationNodes,
            maxNeighbors,
            changeLabel=False if changeTrainValid > -1 else True,
            lastH=False)
        retDict['accuracyValid'] = accuracyValid

        accuracyValid2, curPredsValid2 = rnn.makePredictions(validationNodes2,
                                                             maxNeighbors,
                                                             lastH=False)
        retDict['accuracyValid2'] = accuracyValid2

        #dynamically change test nodes labels
        if lastH:
            if "swap" in dataAug:
                #iterate through all nodes to get hidden states
                tempT, tempPred, hiddenRepT = rnn.makePredictions(
                    trainNodes, maxNeighbors, changeLabel=False, lastH=True)
                tempV, tempPred, hiddenRepV = rnn.makePredictions(
                    validationNodes,
                    maxNeighbors,
                    changeLabel=False,
                    lastH=True)
                lastHs.update(hiddenRepT)
                lastHs.update(hiddenRepV)

            accuracyTest, curPredsTest, hiddenRep = rnn.makePredictions(
                testNodes, maxNeighbors, lastH=True)
            lastHs.update(hiddenRep)
        else:
            accuracyTest, curPredsTest = rnn.makePredictions(testNodes,
                                                             maxNeighbors,
                                                             lastH=False)
        retDict['accuracyTest'] = accuracyTest

        #save the actual predictions
        np.save(actual_save_path + "_pre_Tra", np.array(curPredsTrain.items()))
        np.save(actual_save_path + "_pre_Val", np.array(curPredsValid.items()))
        np.save(actual_save_path + "_pre_Val2",
                np.array(curPredsValid2.items()))
        np.save(actual_save_path + "_pre_Test", np.array(curPredsTest.items()))

        np.save(actual_save_path + "_BAE_Test", np.array(accuracyTest))
        np.save(actual_save_path + "_BAE_Tra", np.array(accuracyTrain))
        np.save(actual_save_path + "_BAE_Val", np.array(accuracyValid))
        np.save(actual_save_path + "_BAE_Val2", np.array(accuracyValid2))
        print("BAE_Tra: " + str(accuracyTrain))
        print("BAE_Val: " + str(accuracyValid))
        print("BAE_Val2: " + str(accuracyValid2))
        print("BAE_Test: " + str(accuracyTest))

        computeAccuracies(G, curPredsTrain, actual_save_path, "Tra")
        computeAccuracies(G, curPredsValid, actual_save_path, "Val")
        computeAccuracies(G, curPredsValid2, actual_save_path, "Val2")
        computeAccuracies(G, curPredsTest, actual_save_path, "Test")
    else:
        graph_helper.setLabels(G, trainNodes,
                               validationNodes + validationNodes2, testNodes,
                               changeTrainValid)

    #also dynamically change validation nodes if we desire
    #rnn.makePredictions(validationNodes, maxNeighbors)
    localsCopy = globals().copy()
    localsCopy.update(locals())
    #localsCopy = locals().copy()
    test_bae, rnn2, best = trainRnnCollective(**localsCopy)

    #if randInit, replace with actual collective performance
    if randInit:
        retDict['accuracyTrain'] = best['Train_acc']
        retDict['accuracyValid'] = best['Valid_acc']
        retDict['accuracyTest'] = best['Test_acc']

    print("test_bae: " + str(test_bae))
    retDict['accuracyTest_C'] = test_bae
    elapsed = time.time() - startTime

    print("trial: " + str(i) + ", fold: " + str(j) + ", time: " + str(elapsed))
    return retDict
Ejemplo n.º 3
0
import code.readData.readData as readData


def printData(data, fileName):
    """Write each item of ``data`` on its own line to ``fileName + ".txt"``.

    Items are converted with ``str``. An existing file is overwritten; an
    empty ``data`` produces an empty file. The file is closed before
    returning so the contents are guaranteed to be flushed to disk.
    """
    with open(fileName + ".txt", 'w') as f:
        f.writelines(str(node) + "\n" for node in data)


# Experiment layout: number of trials, validation share, fold count and the
# datasets to export.
trials = 10
# alternative setting that was tried: percentValidation = 0.15
percentValidation = 0.1
numFolds = 9
dataSets = ["facebook_oneday_filtered"]

# For every dataset and trial, dump the validation nodes and each fold's
# nodes into their own text files via printData.
for fName in dataSets:
    for i in range(trials):
        # common file prefix for everything written for this trial
        trialPrefix = "../experiments/data/" + fName + "_trial_" + str(i)

        # read trial from file
        rest, validationNodes = readData.readTrial("../experiments/data/",
                                                   fName, i, percentValidation)
        printData(validationNodes, trialPrefix + "_val")

        # split the non-validation nodes into folds, one file per fold
        folds = readData.splitNodeFolds(rest, numFolds)
        for j, fold in enumerate(folds):
            printData(fold, trialPrefix + "_fold_" + str(j))
Ejemplo n.º 4
0
def savePPRtype(fName, dataFolder, trial, fold, prType, debug=True):
    """Compute personalized PageRank for one trial/fold and pickle the result.

    The trial is split into 17 folds; folds ``0..fold`` plus the validation
    nodes are the training nodes, the remaining folds are the test nodes.
    ``computePersonalizedPR`` is run on a deep copy of the dataset graph.

    Args:
        fName: dataset name, also used as the output file-name stem.
        dataFolder: folder prefix for the dataset and the pickle files.
        trial: trial index passed to ``readData.readTrial``.
        fold: index of the last fold included in the training nodes.
        prType: 'pos', 'neg' or 'neutral'. For 'pos'/'neg' the value is
            passed as ``label`` and both the full and the "10pr" results are
            pickled; for 'neutral' no label is passed and only the "10pr"
            result is pickled. Any other value computes and writes nothing.
        debug: forwarded to ``computePersonalizedPR``.

    Side effects:
        Writes ``<dataFolder><fName>_{fullpr,10pr}_<prType>_trial_<trial>_
        fold_<fold>.p`` and prints the elapsed time.
    """
    numFolds = 17
    Gorig = readData.readDataset(dataFolder, fName)

    rest, validationNodes = readData.readTrial(fName, trial, 0.15)
    folds = readData.splitNodeFolds(rest, numFolds)

    startFold = time.time()
    trainNodes = []
    testNodes = []
    for i in range(0, fold + 1):
        trainNodes += folds[i]
    trainNodes += validationNodes
    for i in range(fold + 1, numFolds):
        testNodes += folds[i]

    #copy and then run page rank
    G = copy.deepcopy(Gorig)
    # shared tail of every output file name
    suffix = "_trial_" + str(trial) + "_fold_" + str(fold) + ".p"
    if prType == 'pos' or prType == 'neg':
        pr, top10pr = computePersonalizedPR(G,
                                            trainNodes,
                                            testNodes,
                                            label=prType,
                                            debug=debug)
        # `with` closes (and therefore flushes) each pickle file explicitly.
        with open(dataFolder + fName + "_fullpr_" + prType + suffix,
                  "wb") as out:
            pickle.dump(pr, out)
        with open(dataFolder + fName + "_10pr_" + prType + suffix,
                  "wb") as out:
            pickle.dump(top10pr, out)
    elif prType == 'neutral':
        pr, top10pr = computePersonalizedPR(G,
                                            trainNodes,
                                            testNodes,
                                            debug=debug)
        # the full result is intentionally not saved for the neutral case
        with open(dataFolder + fName + "_10pr_neutral" + suffix,
                  "wb") as out:
            pickle.dump(top10pr, out)

    endFold = time.time()
    print("Trial " + str(trial) + " Fold " + str(fold) + ": " +
          str(endFold - startFold))
Ejemplo n.º 5
0
def _l2Distance(keys, first, second):
    """Return the Euclidean distance between two score dicts over ``keys``."""
    return math.sqrt(
        sum(math.pow(first[key] - second[key], 2) for key in keys))


def unitTest1():
    """Smoke-test ``computePersonalizedPR`` on the "facebook" dataset.

    Builds train nodes from the first 8 of 10 folds plus the validation
    nodes, keeps the first 10 nodes of the remaining folds as test nodes, and
    runs ``computePersonalizedPR`` with the labels 'pos', 'neg', 'similar'
    and with no label, each on its own deep copy of the graph. For every node
    in the 'pos' result it prints the Euclidean distances between the result
    vectors, then pickles the 'pos' result and reads it back.
    """
    fName = "facebook"
    numFolds = 10
    maxFolds = 8
    trial = 0
    Gorig = readData.readDataset("data/" + fName + ".edges",
                                 "data/" + fName + ".attr",
                                 "data/" + fName + ".lab")
    rest, validationNodes = readData.readTrial(fName, trial, 0.15)
    folds = readData.splitNodeFolds(rest, numFolds)

    trainNodes = []
    testNodes = []
    for i in range(0, maxFolds):
        trainNodes += folds[i]
    trainNodes += validationNodes
    for i in range(maxFolds, numFolds):
        testNodes += folds[i]

    # keep the test small
    testNodes = testNodes[0:10]
    Gpos = copy.deepcopy(Gorig)
    testPRpos = computePersonalizedPR(Gpos, trainNodes, testNodes, label='pos')
    Gneg = copy.deepcopy(Gorig)
    testPRneg = computePersonalizedPR(Gneg, trainNodes, testNodes, label='neg')
    Gnn = copy.deepcopy(Gorig)
    testPRneutral = computePersonalizedPR(Gnn, trainNodes, testNodes)
    Gsimilar = copy.deepcopy(Gorig)
    testPRsimilar = computePersonalizedPR(Gsimilar,
                                          trainNodes,
                                          testNodes,
                                          label='similar')

    for node in testPRpos.keys():
        # all distances are taken over the keys of the 'pos' result
        keys = testPRpos[node].keys()
        norm2pneut = _l2Distance(keys, testPRpos[node], testPRneutral[node])
        norm2nn = _l2Distance(keys, testPRneg[node], testPRneutral[node])
        norm2pneg = _l2Distance(keys, testPRpos[node], testPRneg[node])
        norm2psimilar = _l2Distance(keys, testPRpos[node],
                                    testPRsimilar[node])
        print("norm2pneut: " + str(norm2pneut))
        print("norm2nn: " + str(norm2nn))
        print("norm2pneg: " + str(norm2pneg))
        print("norm2psimilar: " + str(norm2psimilar))

    # Round-trip the result through pickle; the write handle is closed before
    # the file is reopened so the read sees the complete data.
    picklePath = ("data/" + fName + "_pos_trial_" + str(trial) + "_fold_" +
                  str(maxFolds) + ".p")
    with open(picklePath, "wb") as out:
        pickle.dump(testPRpos, out)
    with open(picklePath, "rb") as src:
        testPRpos2 = pickle.load(src)
    print("here")