Example #1
 def flixster(minNnzRows=10, minNnzCols=2, quantile=90): 
     matrixFileName = PathDefaults.getDataDir() + "flixster/Ratings.timed.txt" 
     matrixFile = open(matrixFileName)
     matrixFile.readline()
     userIndexer = IdIndexer("i")
     movieIndexer = IdIndexer("i")
     
     ratings = array.array("f")
     logging.debug("Loading ratings from " + matrixFileName)
     
     for i, line in enumerate(matrixFile):
         if i % 1000000 == 0: 
             logging.debug("Iteration: " + str(i))
         vals = line.split()
         
         userIndexer.append(vals[0])
         movieIndexer.append(vals[1])
         ratings.append(float(vals[2]))
     
     rowInds = userIndexer.getArray()
     colInds = movieIndexer.getArray()
     ratings = numpy.array(ratings)
     
     X = sppy.csarray((len(userIndexer.getIdDict()), len(movieIndexer.getIdDict())), storagetype="row", dtype=numpy.int)
     X.put(numpy.array(ratings>3, numpy.int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
     X.prune()
     
     X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)
     
     logging.debug("Read file: " + matrixFileName)
     logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))
     
     #X = Sampling.sampleUsers(X, 1000)
     
     return X 
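The loader above shows the pattern that recurs throughout these examples: re-index raw user and item ids to contiguous integers with IdIndexer, binarize the ratings (here ratings above 3 count as positive), build a sparse matrix and prune it. Below is a minimal sketch of the same pipeline using only numpy and scipy.sparse in place of sppy; the function name ratings_to_implicit and the inline dictionary indexing are illustrative assumptions, not part of the original code.

import numpy
import scipy.sparse

def ratings_to_implicit(userIds, itemIds, ratings, threshold=3):
    # Map arbitrary user/item ids to contiguous row/column indices,
    # mimicking what IdIndexer does in the example above.
    userMap, itemMap = {}, {}
    rowInds = numpy.array([userMap.setdefault(u, len(userMap)) for u in userIds])
    colInds = numpy.array([itemMap.setdefault(v, len(itemMap)) for v in itemIds])
    # Binarize: ratings above the threshold become 1, the rest 0.
    vals = (numpy.asarray(ratings, dtype=float) > threshold).astype(numpy.int64)
    X = scipy.sparse.csr_matrix((vals, (rowInds, colInds)),
                                shape=(len(userMap), len(itemMap)))
    X.eliminate_zeros()   # analogous to X.prune() above
    return X

X = ratings_to_implicit(["u1", "u1", "u2"], ["a", "b", "a"], [5, 2, 5])
print(X.toarray())   # [[1 0], [1 0]]: only the two 5-star ratings survive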
Example #2
    def epinions(minNnzRows=10, minNnzCols=3, quantile=90): 
        matrixFileName = PathDefaults.getDataDir() + "epinions/rating.mat" 
        A = scipy.io.loadmat(matrixFileName)["rating"]
        
        userIndexer = IdIndexer("i")
        itemIndexer = IdIndexer("i")        
        
        for i in range(A.shape[0]): 
            userIndexer.append(A[i, 0])
            itemIndexer.append(A[i, 1])


        rowInds = userIndexer.getArray()
        colInds = itemIndexer.getArray()
        ratings = A[:, 3]        
        
        X = sppy.csarray((len(userIndexer.getIdDict()), len(itemIndexer.getIdDict())), storagetype="row", dtype=numpy.int)
        X.put(numpy.array(ratings>3, numpy.int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
        X.prune()
        
        X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)
        
        logging.debug("Read file: " + matrixFileName)
        logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

        return X 
Example #3
    def testAppend(self):
        indexer = IdIndexer()

        indexer.append("john")
        indexer.append("james")
        indexer.append("mark")
        indexer.append("james")

        nptst.assert_array_equal(indexer.getArray(), numpy.array([0, 1, 2, 1]))
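testAppend pins down the core contract of IdIndexer: append assigns each distinct id the next free integer index and returns it, while getArray returns the index of every appended id in order. The sandbox.util.IdIndexer source is not shown on this page, so the class below is only a minimal sketch of a compatible implementation inferred from the tests, not the real code.

import numpy

class MinimalIdIndexer(object):
    """Sketch of an IdIndexer-like mapper, inferred from the tests on this page."""
    def __init__(self):
        self.idDict = {}   # id -> contiguous integer index
        self.inds = []     # index of every appended id, in append order

    def append(self, itemId):
        # Assign the next free index to unseen ids, reuse the existing one otherwise.
        ind = self.idDict.setdefault(itemId, len(self.idDict))
        self.inds.append(ind)
        return ind

    def getArray(self):
        return numpy.array(self.inds)

    def getIdDict(self):
        return self.idDict

indexer = MinimalIdIndexer()
for name in ["john", "james", "mark", "james"]:
    indexer.append(name)
print(indexer.getArray())   # [0 1 2 1], matching the assertion above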
Example #4
    def setUp(self):
        numpy.set_printoptions(suppress=True, precision=3, linewidth=150)
        numpy.random.seed(21)

        self.indexer = IdIndexer()

        self.indexer.append("john")
        self.indexer.append("james")
        self.indexer.append("mark")
        self.indexer.append("james")
Example #5
 def __init__(self, dataFilename, split=0.8):
     """
     Read datasets from the specified files.
     """
     printStep = 1000000        
     
     authorIndexer = IdIndexer() 
     itemIndexer = IdIndexer() 
     ratings = array.array("i")
     
     #Read train files 
     dataFile = open(dataFilename)
     for i, line in enumerate(dataFile): 
         if i % printStep == 0: 
             logging.debug("Iteration: " + str(i))
         vals = line.split() 
         
         authorIndexer.append(vals[0])
         itemIndexer.append(vals[1])
         ratings.append(int(vals[2]))
         
     dataFile.close()
     logging.debug("Read file with " + str(i+1) + " lines")
         
     authorInds = numpy.array(authorIndexer.getArray())
     itemInds = numpy.array(itemIndexer.getArray())
     ratings = numpy.array(ratings)
     
     logging.debug("Number of authors: " + str(len(authorIndexer.getIdDict())))
     logging.debug("Number of items: " + str(len(itemIndexer.getIdDict())))
     logging.debug("Number of ratings: " + str(ratings.shape[0]))
     
     del authorIndexer 
     del itemIndexer
     gc.collect()
     
     shape = (numpy.max(authorInds)+1, numpy.max(itemInds)+1)
     inds = numpy.random.permutation(ratings.shape[0])
     trainInds = inds[0:int(inds.shape[0]*split)]
     trainX = scipy.sparse.csc_matrix((ratings[trainInds], (authorInds[trainInds], itemInds[trainInds])), shape=shape)
     
     testInds = inds[int(inds.shape[0]*split):]
     testX = scipy.sparse.csc_matrix((ratings[testInds], (authorInds[testInds], itemInds[testInds])), shape=shape)
     
     del authorInds, itemInds, ratings 
     gc.collect()
     
     self.trainXList = [trainX]
     self.testXList = [testX]
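The constructor above splits the ratings by permuting their positions once and cutting the permutation at split, so the train and test sets are disjoint and together cover every rating. A toy sketch of just that step, with no file I/O and illustrative numbers:

import numpy

numpy.random.seed(21)
numRatings = 10
split = 0.8

# Shuffle the rating positions once, then cut at the split point.
inds = numpy.random.permutation(numRatings)
trainInds = inds[0:int(inds.shape[0]*split)]
testInds = inds[int(inds.shape[0]*split):]

assert trainInds.shape[0] == 8 and testInds.shape[0] == 2
assert set(trainInds).isdisjoint(set(testInds))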
Example #6
    def setUp(self):
        numpy.set_printoptions(suppress=True, precision=3, linewidth=150)
        numpy.random.seed(21)

        self.indexer = IdIndexer()

        self.indexer.append("john")
        self.indexer.append("james")
        self.indexer.append("mark")
        self.indexer.append("james")
Example #7
 def coauthorsGraphFromAuthors2(self, relevantExperts, field):
     """
     Take a set of relevant authors and return the graph. 
     """
     dataFileName = self.dataDir + "__" + field.replace(' ', '') + ".csv" 
     dataFile = open(dataFileName)
     authorIndexer = IdIndexer()
     author1Inds = array.array("i")
     author2Inds = array.array("i")
     
     articleDict = {}
     
     for relevantExpert in relevantExperts: 
         authorIndexer.append(relevantExpert)
     
     for i, line in enumerate(dataFile):
         try: 
             fields = [x.strip() for x in line.split(";")] 
             author = fields[1] + " " + fields[2]
             articleId = fields[4]
             
             if articleId not in articleDict.keys(): 
                 articleDict[articleId] = [author]
             else: 
                 articleDict[articleId].append(author)
         except IndexError:
             #Ignore bad lines 
             pass 
             
     dataFile.close()
                         
     for articleId in articleDict.keys(): 
         authors = articleDict[articleId]            
         
         if len(authors) != 0: 
             iterator = itertools.combinations(authors, 2)
         
             for author1, author2 in iterator: 
                 if author1 in relevantExperts and author2 in relevantExperts: 
                     author1Ind = authorIndexer.append(author1) 
                     author2Ind = authorIndexer.append(author2)
                         
                     author1Inds.append(author1Ind)
                     author2Inds.append(author2Ind)
 
     logging.debug("Found " + str(len(authorIndexer.getIdDict())) + " coauthors")
                            
     #Coauthor graph is undirected 
     author1Inds = numpy.array(author1Inds, numpy.int)
     author2Inds = numpy.array(author2Inds, numpy.int)
     edges = numpy.c_[author1Inds, author2Inds]            
     
     graph = igraph.Graph()
     graph.add_vertices(len(authorIndexer.getIdDict()))
     graph.add_edges(edges)
     graph.es["weight"] = numpy.ones(graph.ecount())
     graph.simplify(combine_edges=sum)   
     graph.es["invWeight"] = 1.0/(numpy.array(graph.es["weight"])) 
     
     return graph, authorIndexer
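The graph construction above relies on igraph's simplify(combine_edges=sum) to merge the parallel edges produced by repeated coauthorships, so each merged edge's weight counts the number of joint papers and invWeight is its reciprocal. A toy sketch of just that weighting step (the three vertices and their edges are made up for illustration):

import numpy
import igraph

# Authors 0 and 1 co-wrote two papers, authors 1 and 2 one paper.
graph = igraph.Graph()
graph.add_vertices(3)
graph.add_edges([(0, 1), (0, 1), (1, 2)])
graph.es["weight"] = numpy.ones(graph.ecount())

# Merging the parallel edges sums their weights: edge (0, 1) ends up with weight 2.0.
graph.simplify(combine_edges=sum)
graph.es["invWeight"] = 1.0/numpy.array(graph.es["weight"])
print(graph.es["weight"], graph.es["invWeight"])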
Example #8
    def bookCrossing(minNnzRows=10, minNnzCols=3, quantile=90): 
        matrixFileName = PathDefaults.getDataDir() + "book-crossing/BX-Book-Ratings.csv" 
        matrixFile = open(matrixFileName)
        matrixFile.readline()
        userIndexer = IdIndexer("i")
        itemIndexer = IdIndexer("i")
        
        ratings = array.array("f")
        logging.debug("Loading ratings from " + matrixFileName)
        
        for i, line in enumerate(matrixFile):
            if i % 1000000 == 0: 
                logging.debug("Iteration: " + str(i))
            vals = line.split(";")
            
            field1 = vals[0].strip("\"")
            field2 = vals[1].strip("\"")
            field3 = int(vals[2].strip("\"\n\r"))            
            
            userIndexer.append(field1)
            itemIndexer.append(field2)
            ratings.append(field3)
                    
        rowInds = userIndexer.getArray()
        colInds = itemIndexer.getArray()
        ratings = numpy.array(ratings)
                
        X = sppy.csarray((len(userIndexer.getIdDict()), len(itemIndexer.getIdDict())), storagetype="row", dtype=numpy.int)
        X.put(numpy.array(numpy.logical_or(ratings>4, ratings==0), numpy.int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
        X.prune()
        
        X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)
        
        logging.debug("Read file: " + matrixFileName)
        logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

        return X 
Example #9
 def coauthorsGraphFromAuthors(self, relevantExperts):
     """
     Take a set of relevant authors and return the graph. 
     """
     dataFile = open(self.dataFilename)  
     authorIndexer = IdIndexer()
     author1Inds = array.array("i")
     author2Inds = array.array("i")
     
     for relevantExpert in relevantExperts: 
         authorIndexer.append(relevantExpert)
     
     for i, line in enumerate(dataFile):
         Util.printIteration(i, self.stepSize, self.numLines)
         authors = re.findall("#@(.*)", line)  
                         
         if len(authors) != 0: 
             authors = set([x.strip() for x in authors[0].split(",")]) 
             if len(authors.intersection(relevantExperts)) != 0: 
                 iterator = itertools.combinations(authors, 2)
             
                 for author1, author2 in iterator: 
                     if author1 in relevantExperts and author2 in relevantExperts: 
                         author1Ind = authorIndexer.append(author1) 
                         author2Ind = authorIndexer.append(author2)
                             
                         author1Inds.append(author1Ind)
                         author2Inds.append(author2Ind)
     
     logging.debug("Found " + str(len(authorIndexer.getIdDict())) + " coauthors")
                            
     #Coauthor graph is undirected 
     author1Inds = numpy.array(author1Inds, numpy.int)
     author2Inds = numpy.array(author2Inds, numpy.int)
     edges = numpy.c_[author1Inds, author2Inds]            
     
     graph = igraph.Graph()
     graph.add_vertices(len(authorIndexer.getIdDict()))
     graph.add_edges(edges)
     graph.es["weight"] = numpy.ones(graph.ecount())
     graph.simplify(combine_edges=sum)   
     graph.es["invWeight"] = 1.0/(numpy.array(graph.es["weight"])) 
     
     return graph, authorIndexer
Example #10
import matplotlib 
matplotlib.use("GTK3Agg")
import matplotlib.pyplot as plt
import numpy
import array
from sandbox.util.PathDefaults import PathDefaults 
from sandbox.util.IdIndexer import IdIndexer
from sandbox.util.Latex import Latex 
from apgl.graph.GraphUtils import GraphUtils 

"""
We try to figure out the change in L_i and L_{i+1}
"""

numpy.set_printoptions(suppress=True, precision=4)

dataDir = PathDefaults.getDataDir() + "kcore/"
indexer = IdIndexer()

node1Inds = array.array("i")
node2Inds = array.array("i")

Ls = []
us = []

boundFro = [] 
bound2 = []
ks = []
eyes = []
deltas = []

for i in range(1, 9): 
    print(i)
Example #11
def writeAuthorXMatrix(inputFileName, authorIndexerFilename, authorXFileName, reverse=False):
    
    if not os.path.isfile(authorXFileName): 
        fileObj = open(inputFileName)
        
        authorIndex = IdIndexer()
        docIndex = IdIndexer()
        scores = array.array("i")
        
        for i, line in enumerate(fileObj):
            if i % 500000 == 0: 
                logging.debug(i)
            vals = line.split()
            #logging.debug(vals[0], vals[1], vals[2])
            
            if reverse: 
                authorIndex.append(vals[1])
                docIndex.append(vals[0])
            else: 
                authorIndex.append(vals[0])
                docIndex.append(vals[1])
                
            scores.append(int(vals[2]))
        
        rowInds = numpy.array(authorIndex.getArray())
        colInds = numpy.array(docIndex.getArray())
        
        Y = scipy.sparse.csr_matrix((scores, (rowInds, colInds)))
            
        authorIndexerFile = open(authorIndexerFilename, "wb")
        pickle.dump(authorIndex, authorIndexerFile)
        authorIndexerFile.close()
        scipy.io.mmwrite(authorXFileName, Y)
        logging.debug("Saved matrix to " + authorXFileName)
    else: 
        logging.debug("File exists: " + authorXFileName)
Example #12
class IdIndexerTest(unittest.TestCase):
    def setUp(self):
        numpy.set_printoptions(suppress=True, precision=3, linewidth=150)
        numpy.random.seed(21)

        self.indexer = IdIndexer()

        self.indexer.append("john")
        self.indexer.append("james")
        self.indexer.append("mark")
        self.indexer.append("james")

    def testAppend(self):
        indexer = IdIndexer()

        indexer.append("john")
        indexer.append("james")
        indexer.append("mark")
        indexer.append("james")

        nptst.assert_array_equal(indexer.getArray(), numpy.array([0, 1, 2, 1]))

    def testTranslate(self):
        self.assertEquals(self.indexer.translate(["mark"]), [2])
        self.assertEquals(self.indexer.translate(["john"]), [0])

        self.assertEquals(self.indexer.translate(["john", "james"]), [0, 1])

    def testReverseTranslate(self):
        self.assertEquals(self.indexer.reverseTranslate(0), "john")
        self.assertEquals(self.indexer.reverseTranslate(1), "james")
        self.assertEquals(self.indexer.reverseTranslate(2), "mark")

        self.assertEquals(self.indexer.reverseTranslate([2, 1, 0]),
                          ["mark", "james", "john"])

    def testReverseTranslateDict(self):
        indDict = self.indexer.reverseTranslateDict()

        for i in range(len(self.indexer.getIdDict())):
            self.assertEquals(self.indexer.append(indDict[i]), i)
Example #13
    def testAppend(self):
        indexer = IdIndexer()

        indexer.append("john")
        indexer.append("james")
        indexer.append("mark")
        indexer.append("james")

        nptst.assert_array_equal(indexer.getArray(), numpy.array([0, 1, 2, 1]))
Example #14
class IdIndexerTest(unittest.TestCase):
    def setUp(self):
        numpy.set_printoptions(suppress=True, precision=3, linewidth=150)
        numpy.random.seed(21)

        self.indexer = IdIndexer()

        self.indexer.append("john")
        self.indexer.append("james")
        self.indexer.append("mark")
        self.indexer.append("james")

    def testAppend(self):
        indexer = IdIndexer()

        indexer.append("john")
        indexer.append("james")
        indexer.append("mark")
        indexer.append("james")

        nptst.assert_array_equal(indexer.getArray(), numpy.array([0, 1, 2, 1]))

    def testTranslate(self):
        self.assertEquals(self.indexer.translate(["mark"]), [2])
        self.assertEquals(self.indexer.translate(["john"]), [0])

        self.assertEquals(self.indexer.translate(["john", "james"]), [0, 1])

    def testReverseTranslate(self):
        self.assertEquals(self.indexer.reverseTranslate(0), "john")
        self.assertEquals(self.indexer.reverseTranslate(1), "james")
        self.assertEquals(self.indexer.reverseTranslate(2), "mark")

        self.assertEquals(self.indexer.reverseTranslate([2, 1, 0]), ["mark", "james", "john"])

    def testReverseTranslateDict(self):
        indDict = self.indexer.reverseTranslateDict()

        for i in range(len(self.indexer.getIdDict())):
            self.assertEquals(self.indexer.append(indDict[i]), i)
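The test classes on this page follow the standard unittest layout, so they can be run standalone with the usual entry point; the sketch below assumes the surrounding module already imports unittest, numpy, numpy.testing as nptst and IdIndexer as in the examples above.

import unittest

if __name__ == "__main__":
    unittest.main()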