Example #1
 def flixster(minNnzRows=10, minNnzCols=2, quantile=90): 
     matrixFileName = PathDefaults.getDataDir() + "flixster/Ratings.timed.txt" 
     matrixFile = open(matrixFileName)
     matrixFile.readline()
     userIndexer = IdIndexer("i")
     movieIndexer = IdIndexer("i")
     
     ratings = array.array("f")
     logging.debug("Loading ratings from " + matrixFileName)
     
     for i, line in enumerate(matrixFile):
         if i % 1000000 == 0: 
             logging.debug("Iteration: " + str(i))
         vals = line.split()
         
         userIndexer.append(vals[0])
         movieIndexer.append(vals[1])
         ratings.append(float(vals[2]))
     
     rowInds = userIndexer.getArray()
     colInds = movieIndexer.getArray()
     ratings = numpy.array(ratings)
     
     X = sppy.csarray((len(userIndexer.getIdDict()), len(movieIndexer.getIdDict())), storagetype="row", dtype=numpy.int)
     X.put(numpy.array(ratings>3, numpy.int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
     X.prune()
     
     X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)
     
     logging.debug("Read file: " + matrixFileName)
     logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))
     
     #X = Sampling.sampleUsers(X, 1000)
     
     return X 
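The loader above shows the pattern that recurs throughout these examples: re-index raw user and item ids to contiguous integers with IdIndexer, binarize the ratings (here ratings above 3 count as positive), build a sparse matrix and prune it. Below is a minimal sketch of the same pipeline using only numpy and scipy.sparse in place of sppy; the function name ratings_to_implicit and the inline dictionary indexing are illustrative assumptions, not part of the original code.

import numpy
import scipy.sparse

def ratings_to_implicit(userIds, itemIds, ratings, threshold=3):
    # Map arbitrary user/item ids to contiguous row/column indices,
    # mimicking what IdIndexer does in the example above.
    userMap, itemMap = {}, {}
    rowInds = numpy.array([userMap.setdefault(u, len(userMap)) for u in userIds])
    colInds = numpy.array([itemMap.setdefault(v, len(itemMap)) for v in itemIds])
    # Binarize: ratings above the threshold become 1, the rest 0.
    vals = (numpy.asarray(ratings, dtype=float) > threshold).astype(numpy.int64)
    X = scipy.sparse.csr_matrix((vals, (rowInds, colInds)),
                                shape=(len(userMap), len(itemMap)))
    X.eliminate_zeros()   # analogous to X.prune() above
    return X

X = ratings_to_implicit(["u1", "u1", "u2"], ["a", "b", "a"], [5, 2, 5])
print(X.toarray())   # [[1 0], [1 0]]: only the two 5-star ratings survive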
Example #2
    def epinions(minNnzRows=10, minNnzCols=3, quantile=90): 
        matrixFileName = PathDefaults.getDataDir() + "epinions/rating.mat" 
        A = scipy.io.loadmat(matrixFileName)["rating"]
        
        userIndexer = IdIndexer("i")
        itemIndexer = IdIndexer("i")        
        
        for i in range(A.shape[0]): 
            userIndexer.append(A[i, 0])
            itemIndexer.append(A[i, 1])


        rowInds = userIndexer.getArray()
        colInds = itemIndexer.getArray()
        ratings = A[:, 3]        
        
        X = sppy.csarray((len(userIndexer.getIdDict()), len(itemIndexer.getIdDict())), storagetype="row", dtype=numpy.int)
        X.put(numpy.array(ratings>3, numpy.int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
        X.prune()
        
        X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)
        
        logging.debug("Read file: " + matrixFileName)
        logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

        return X 
Example #3
    def testAppend(self):
        indexer = IdIndexer()

        indexer.append("john")
        indexer.append("james")
        indexer.append("mark")
        indexer.append("james")

        nptst.assert_array_equal(indexer.getArray(), numpy.array([0, 1, 2, 1]))
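testAppend pins down the core contract of IdIndexer: append assigns each distinct id the next free integer index and returns it, while getArray returns the index of every appended id in order. The sandbox.util.IdIndexer source is not shown on this page, so the class below is only a minimal sketch of a compatible implementation inferred from the tests, not the real code.

import numpy

class MinimalIdIndexer(object):
    """Sketch of an IdIndexer-like mapper, inferred from the tests on this page."""
    def __init__(self):
        self.idDict = {}   # id -> contiguous integer index
        self.inds = []     # index of every appended id, in append order

    def append(self, itemId):
        # Assign the next free index to unseen ids, reuse the existing one otherwise.
        ind = self.idDict.setdefault(itemId, len(self.idDict))
        self.inds.append(ind)
        return ind

    def getArray(self):
        return numpy.array(self.inds)

    def getIdDict(self):
        return self.idDict

indexer = MinimalIdIndexer()
for name in ["john", "james", "mark", "james"]:
    indexer.append(name)
print(indexer.getArray())   # [0 1 2 1], matching the assertion above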
Example #4
    def setUp(self):
        numpy.set_printoptions(suppress=True, precision=3, linewidth=150)
        numpy.random.seed(21)

        self.indexer = IdIndexer()

        self.indexer.append("john")
        self.indexer.append("james")
        self.indexer.append("mark")
        self.indexer.append("james")
Example #5
 def __init__(self, dataFilename, split=0.8):
     """
     Read datasets from the specified files.
     """
     printStep = 1000000        
     
     authorIndexer = IdIndexer() 
     itemIndexer = IdIndexer() 
     ratings = array.array("i")
     
     #Read train files 
     dataFile = open(dataFilename)
     for i, line in enumerate(dataFile): 
         if i % printStep == 0: 
             logging.debug("Iteration: " + str(i))
         vals = line.split() 
         
         authorIndexer.append(vals[0])
         itemIndexer.append(vals[1])
         ratings.append(int(vals[2]))
         
     dataFile.close()
     logging.debug("Read file with " + str(i+1) + " lines")
         
     authorInds = numpy.array(authorIndexer.getArray())
     itemInds = numpy.array(itemIndexer.getArray())
     ratings = numpy.array(ratings)
     
     logging.debug("Number of authors: " + str(len(authorIndexer.getIdDict())))
     logging.debug("Number of items: " + str(len(itemIndexer.getIdDict())))
     logging.debug("Number of ratings: " + str(ratings.shape[0]))
     
     del authorIndexer 
     del itemIndexer
     gc.collect()
     
     shape = (numpy.max(authorInds)+1, numpy.max(itemInds)+1)
     inds = numpy.random.permutation(ratings.shape[0])
     trainInds = inds[0:int(inds.shape[0]*split)]
     trainX = scipy.sparse.csc_matrix((ratings[trainInds], (authorInds[trainInds], itemInds[trainInds])), shape=shape)
     
     testInds = inds[int(inds.shape[0]*split):]
     testX = scipy.sparse.csc_matrix((ratings[testInds], (authorInds[testInds], itemInds[testInds])), shape=shape)
     
     del authorInds, itemInds, ratings 
     gc.collect()
     
     self.trainXList = [trainX]
     self.testXList = [testX]
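The constructor above splits the ratings by permuting their positions once and cutting the permutation at split, so the train and test sets are disjoint and together cover every rating. A toy sketch of just that step, with no file I/O and illustrative numbers:

import numpy

numpy.random.seed(21)
numRatings = 10
split = 0.8

# Shuffle the rating positions once, then cut at the split point.
inds = numpy.random.permutation(numRatings)
trainInds = inds[0:int(inds.shape[0]*split)]
testInds = inds[int(inds.shape[0]*split):]

assert trainInds.shape[0] == 8 and testInds.shape[0] == 2
assert set(trainInds).isdisjoint(set(testInds))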
Example #6
    def setUp(self):
        numpy.set_printoptions(suppress=True, precision=3, linewidth=150)
        numpy.random.seed(21)

        self.indexer = IdIndexer()

        self.indexer.append("john")
        self.indexer.append("james")
        self.indexer.append("mark")
        self.indexer.append("james")
Example #7
 def coauthorsGraphFromAuthors2(self, relevantExperts, field):
     """
     Take a set of relevant authors and return the graph. 
     """
     dataFileName = self.dataDir + "__" + field.replace(' ', '') + ".csv" 
     dataFile = open(dataFileName)
     authorIndexer = IdIndexer()
     author1Inds = array.array("i")
     author2Inds = array.array("i")
     
     articleDict = {}
     
     for relevantExpert in relevantExperts: 
         authorIndexer.append(relevantExpert)
     
     for i, line in enumerate(dataFile):
         try: 
             fields = [x.strip() for x in line.split(";")] 
             author = fields[1] + " " + fields[2]
             articleId = fields[4]
             
             if articleId not in articleDict.keys(): 
                 articleDict[articleId] = [author]
             else: 
                 articleDict[articleId].append(author)
         except IndexError:
             #Ignore bad lines 
             pass 
             
     dataFile.close()
                         
     for articleId in articleDict.keys(): 
         authors = articleDict[articleId]            
         
         if len(authors) != 0: 
             iterator = itertools.combinations(authors, 2)
         
             for author1, author2 in iterator: 
                 if author1 in relevantExperts and author2 in relevantExperts: 
                     author1Ind = authorIndexer.append(author1) 
                     author2Ind = authorIndexer.append(author2)
                         
                     author1Inds.append(author1Ind)
                     author2Inds.append(author2Ind)
 
     logging.debug("Found " + str(len(authorIndexer.getIdDict())) + " coauthors")
                            
     #Coauthor graph is undirected 
     author1Inds = numpy.array(author1Inds, numpy.int)
     author2Inds = numpy.array(author2Inds, numpy.int)
     edges = numpy.c_[author1Inds, author2Inds]            
     
     graph = igraph.Graph()
     graph.add_vertices(len(authorIndexer.getIdDict()))
     graph.add_edges(edges)
     graph.es["weight"] = numpy.ones(graph.ecount())
     graph.simplify(combine_edges=sum)   
     graph.es["invWeight"] = 1.0/(numpy.array(graph.es["weight"])) 
     
     return graph, authorIndexer
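The graph construction above relies on igraph's simplify(combine_edges=sum) to merge the parallel edges produced by repeated coauthorships, so each merged edge's weight counts the number of joint papers and invWeight is its reciprocal. A toy sketch of just that weighting step (the three vertices and their edges are made up for illustration):

import numpy
import igraph

# Authors 0 and 1 co-wrote two papers, authors 1 and 2 one paper.
graph = igraph.Graph()
graph.add_vertices(3)
graph.add_edges([(0, 1), (0, 1), (1, 2)])
graph.es["weight"] = numpy.ones(graph.ecount())

# Merging the parallel edges sums their weights: edge (0, 1) ends up with weight 2.0.
graph.simplify(combine_edges=sum)
graph.es["invWeight"] = 1.0/numpy.array(graph.es["weight"])
print(graph.es["weight"], graph.es["invWeight"])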
Example #8
    def bookCrossing(minNnzRows=10, minNnzCols=3, quantile=90): 
        matrixFileName = PathDefaults.getDataDir() + "book-crossing/BX-Book-Ratings.csv" 
        matrixFile = open(matrixFileName)
        matrixFile.readline()
        userIndexer = IdIndexer("i")
        itemIndexer = IdIndexer("i")
        
        ratings = array.array("f")
        logging.debug("Loading ratings from " + matrixFileName)
        
        for i, line in enumerate(matrixFile):
            if i % 1000000 == 0: 
                logging.debug("Iteration: " + str(i))
            vals = line.split(";")
            
            field1 = vals[0].strip("\"")
            field2 = vals[1].strip("\"")
            field3 = int(vals[2].strip("\"\n\r"))            
            
            userIndexer.append(field1)
            itemIndexer.append(field2)
            ratings.append(field3)
                    
        rowInds = userIndexer.getArray()
        colInds = itemIndexer.getArray()
        ratings = numpy.array(ratings)
                
        X = sppy.csarray((len(userIndexer.getIdDict()), len(itemIndexer.getIdDict())), storagetype="row", dtype=numpy.int)
        X.put(numpy.array(numpy.logical_or(ratings>4, ratings==0), numpy.int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
        X.prune()
        
        X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)
        
        logging.debug("Read file: " + matrixFileName)
        logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

        return X 
Example #9
 def coauthorsGraphFromAuthors(self, relevantExperts):
     """
     Take a set of relevant authors and return the graph. 
     """
     dataFile = open(self.dataFilename)  
     authorIndexer = IdIndexer()
     author1Inds = array.array("i")
     author2Inds = array.array("i")
     
     for relevantExpert in relevantExperts: 
         authorIndexer.append(relevantExpert)
     
     for i, line in enumerate(dataFile):
         Util.printIteration(i, self.stepSize, self.numLines)
         authors = re.findall("#@(.*)", line)  
                         
         if len(authors) != 0: 
             authors = set([x.strip() for x in authors[0].split(",")]) 
             if len(authors.intersection(relevantExperts)) != 0: 
                 iterator = itertools.combinations(authors, 2)
             
                 for author1, author2 in iterator: 
                     if author1 in relevantExperts and author2 in relevantExperts: 
                         author1Ind = authorIndexer.append(author1) 
                         author2Ind = authorIndexer.append(author2)
                             
                         author1Inds.append(author1Ind)
                         author2Inds.append(author2Ind)
     
     logging.debug("Found " + str(len(authorIndexer.getIdDict())) + " coauthors")
                            
     #Coauthor graph is undirected 
     author1Inds = numpy.array(author1Inds, numpy.int)
     author2Inds = numpy.array(author2Inds, numpy.int)
     edges = numpy.c_[author1Inds, author2Inds]            
     
     graph = igraph.Graph()
     graph.add_vertices(len(authorIndexer.getIdDict()))
     graph.add_edges(edges)
     graph.es["weight"] = numpy.ones(graph.ecount())
     graph.simplify(combine_edges=sum)   
     graph.es["invWeight"] = 1.0/(numpy.array(graph.es["weight"])) 
     
     return graph, authorIndexer
Example #10
import matplotlib 
matplotlib.use("GTK3Agg")
import matplotlib.pyplot as plt
import numpy
import array
from sandbox.util.PathDefaults import PathDefaults 
from sandbox.util.IdIndexer import IdIndexer
from sandbox.util.Latex import Latex 
from apgl.graph.GraphUtils import GraphUtils 

"""
We try to figure out the change in L_i and L_{i+1}
"""

numpy.set_printoptions(suppress=True, precision=4)

dataDir = PathDefaults.getDataDir() + "kcore/"
indexer = IdIndexer()

node1Inds = array.array("i")
node2Inds = array.array("i")

Ls = []
us = []

boundFro = [] 
bound2 = []
ks = []
eyes = []
deltas = []

for i in range(1, 9): 
    print(i)
Example #11
def writeAuthorXMatrix(inputFileName, authorIndexerFilename, authorXFileName, reverse=False):
    
    if not os.path.isfile(authorXFileName): 
        fileObj = open(inputFileName)
        
        authorIndex = IdIndexer()
        docIndex = IdIndexer()
        scores = array.array("i")
        
        for i, line in enumerate(fileObj):
            if i % 500000 == 0: 
                logging.debug(i)
            vals = line.split()
            #logging.debug(vals[0], vals[1], vals[2])
            
            if reverse: 
                authorIndex.append(vals[1])
                docIndex.append(vals[0])
            else: 
                authorIndex.append(vals[0])
                docIndex.append(vals[1])
                
            scores.append(int(vals[2]))
        
        rowInds = numpy.array(authorIndex.getArray())
        colInds = numpy.array(docIndex.getArray())
        
        Y = scipy.sparse.csr_matrix((scores, (rowInds, colInds)))
            
        authorIndexerFile = open(authorIndexerFilename, "wb")
        pickle.dump(authorIndex, authorIndexerFile)
        authorIndexerFile.close()
        scipy.io.mmwrite(authorXFileName, Y)
        logging.debug("Saved matrix to " + authorXFileName)
    else: 
        logging.debug("File exists: " + authorXFileName)
Example #12
class IdIndexerTest(unittest.TestCase):
    def setUp(self):
        numpy.set_printoptions(suppress=True, precision=3, linewidth=150)
        numpy.random.seed(21)

        self.indexer = IdIndexer()

        self.indexer.append("john")
        self.indexer.append("james")
        self.indexer.append("mark")
        self.indexer.append("james")

    def testAppend(self):
        indexer = IdIndexer()

        indexer.append("john")
        indexer.append("james")
        indexer.append("mark")
        indexer.append("james")

        nptst.assert_array_equal(indexer.getArray(), numpy.array([0, 1, 2, 1]))

    def testTranslate(self):
        self.assertEquals(self.indexer.translate(["mark"]), [2])
        self.assertEquals(self.indexer.translate(["john"]), [0])

        self.assertEquals(self.indexer.translate(["john", "james"]), [0, 1])

    def testReverseTranslate(self):
        self.assertEquals(self.indexer.reverseTranslate(0), "john")
        self.assertEquals(self.indexer.reverseTranslate(1), "james")
        self.assertEquals(self.indexer.reverseTranslate(2), "mark")

        self.assertEquals(self.indexer.reverseTranslate([2, 1, 0]),
                          ["mark", "james", "john"])

    def testReverseTranslateDict(self):
        indDict = self.indexer.reverseTranslateDict()

        for i in range(len(self.indexer.getIdDict())):
            self.assertEquals(self.indexer.append(indDict[i]), i)
Example #13
    def testAppend(self):
        indexer = IdIndexer()

        indexer.append("john")
        indexer.append("james")
        indexer.append("mark")
        indexer.append("james")

        nptst.assert_array_equal(indexer.getArray(), numpy.array([0, 1, 2, 1]))
Example #14
class IdIndexerTest(unittest.TestCase):
    def setUp(self):
        numpy.set_printoptions(suppress=True, precision=3, linewidth=150)
        numpy.random.seed(21)

        self.indexer = IdIndexer()

        self.indexer.append("john")
        self.indexer.append("james")
        self.indexer.append("mark")
        self.indexer.append("james")

    def testAppend(self):
        indexer = IdIndexer()

        indexer.append("john")
        indexer.append("james")
        indexer.append("mark")
        indexer.append("james")

        nptst.assert_array_equal(indexer.getArray(), numpy.array([0, 1, 2, 1]))

    def testTranslate(self):
        self.assertEquals(self.indexer.translate(["mark"]), [2])
        self.assertEquals(self.indexer.translate(["john"]), [0])

        self.assertEquals(self.indexer.translate(["john", "james"]), [0, 1])

    def testReverseTranslate(self):
        self.assertEquals(self.indexer.reverseTranslate(0), "john")
        self.assertEquals(self.indexer.reverseTranslate(1), "james")
        self.assertEquals(self.indexer.reverseTranslate(2), "mark")

        self.assertEquals(self.indexer.reverseTranslate([2, 1, 0]), ["mark", "james", "john"])

    def testReverseTranslateDict(self):
        indDict = self.indexer.reverseTranslateDict()

        for i in range(len(self.indexer.getIdDict())):
            self.assertEquals(self.indexer.append(indDict[i]), i)
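The test classes on this page follow the standard unittest layout, so they can be run standalone with the usual entry point; the sketch below assumes the surrounding module already imports unittest, numpy, numpy.testing as nptst and IdIndexer as in the examples above.

import unittest

if __name__ == "__main__":
    unittest.main()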