Python IdIndexerの例、sandbox.util.IdIndexer.IdIndexer Pythonの例

コード例 #1

0

ファイルを表示

ファイル: DatasetUtils.py プロジェクト: charanpald/wallhack

 def flixster(minNnzRows=10, minNnzCols=2, quantile=90):
     """
     Load the Flixster ratings file into a binary sparse matrix.

     The first line of the file is discarded. Every following line is split on
     whitespace: field 0 is indexed as the user, field 1 as the movie and
     field 2 is parsed as a float rating. An entry of the matrix is 1 when the
     rating is greater than 3 and 0 otherwise; X.prune() is then called
     (presumably to drop the stored zeros -- confirm against sppy) and the
     result is passed through SparseUtils.pruneMatrixRowAndCols.

     :param minNnzRows: forwarded to SparseUtils.pruneMatrixRowAndCols
     :param minNnzCols: forwarded to SparseUtils.pruneMatrixRowAndCols
     :param quantile: not used by this function; kept so callers do not break
     :return: the matrix returned by SparseUtils.pruneMatrixRowAndCols
     """
     matrixFileName = PathDefaults.getDataDir() + "flixster/Ratings.timed.txt"
     userIndexer = IdIndexer("i")
     movieIndexer = IdIndexer("i")

     ratings = array.array("f")
     logging.debug("Loading ratings from " + matrixFileName)

     # The context manager closes the file even if a line fails to parse.
     with open(matrixFileName) as matrixFile:
         matrixFile.readline()  # discard the first line before parsing
         for i, line in enumerate(matrixFile):
             if i % 1000000 == 0:
                 logging.debug("Iteration: " + str(i))
             vals = line.split()

             userIndexer.append(vals[0])
             movieIndexer.append(vals[1])
             ratings.append(float(vals[2]))

     rowInds = userIndexer.getArray()
     colInds = movieIndexer.getArray()
     ratings = numpy.array(ratings)

     # numpy.int was only an alias of the builtin int and has been removed from
     # current NumPy releases, so the builtin is used directly.
     shape = (len(userIndexer.getIdDict()), len(movieIndexer.getIdDict()))
     X = sppy.csarray(shape, storagetype="row", dtype=int)
     X.put(numpy.array(ratings > 3, int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
     X.prune()

     X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

     logging.debug("Read file: " + matrixFileName)
     logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

     return X

コード例 #2

0

ファイルを表示

ファイル: DatasetUtils.py プロジェクト: charanpald/wallhack

    def epinions(minNnzRows=10, minNnzCols=3, quantile=90):
        """
        Load the Epinions ratings from a MATLAB file into a binary sparse matrix.

        The "rating" variable of the .mat file is read; column 0 of each row is
        indexed as the user, column 1 as the item and column 3 is taken as the
        rating. An entry of the matrix is 1 when the rating is greater than 3
        and 0 otherwise; X.prune() is then called and the result is passed
        through SparseUtils.pruneMatrixRowAndCols.

        :param minNnzRows: forwarded to SparseUtils.pruneMatrixRowAndCols
        :param minNnzCols: forwarded to SparseUtils.pruneMatrixRowAndCols
        :param quantile: not used by this function; kept so callers do not break
        :return: the matrix returned by SparseUtils.pruneMatrixRowAndCols
        """
        matrixFileName = PathDefaults.getDataDir() + "epinions/rating.mat"
        A = scipy.io.loadmat(matrixFileName)["rating"]

        userIndexer = IdIndexer("i")
        itemIndexer = IdIndexer("i")

        for i in range(A.shape[0]):
            userIndexer.append(A[i, 0])
            itemIndexer.append(A[i, 1])

        rowInds = userIndexer.getArray()
        colInds = itemIndexer.getArray()
        ratings = A[:, 3]

        # numpy.int was only an alias of the builtin int and has been removed
        # from current NumPy releases, so the builtin is used directly.
        shape = (len(userIndexer.getIdDict()), len(itemIndexer.getIdDict()))
        X = sppy.csarray(shape, storagetype="row", dtype=int)
        X.put(numpy.array(ratings > 3, int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
        X.prune()

        X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

        logging.debug("Read file: " + matrixFileName)
        logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

        return X

コード例 #3

0

ファイルを表示

ファイル: IdIndexerTest.py プロジェクト: kentwang/sandbox

    def testAppend(self):
        # Append four names, one of them twice, to a fresh indexer and check
        # that the repeated name is given the same index both times.
        names = ("john", "james", "mark", "james")
        freshIndexer = IdIndexer()
        for name in names:
            freshIndexer.append(name)

        expected = numpy.array([0, 1, 2, 1])
        nptst.assert_array_equal(freshIndexer.getArray(), expected)

コード例 #4

0

ファイルを表示

    def setUp(self):
        # Fix numpy's print formatting and random seed before each test.
        numpy.set_printoptions(suppress=True, precision=3, linewidth=150)
        numpy.random.seed(21)

        # Shared fixture: an indexer fed three distinct names, "james" twice.
        fixture = IdIndexer()
        self.indexer = fixture
        for name in ("john", "james", "mark", "james"):
            fixture.append(name)

コード例 #5

0

ファイルを表示

ファイル: Static2IdValDataset.py プロジェクト: charanpald/wallhack

 def __init__(self, dataFilename, split=0.8):
     """
     Read a ratings file and split it at random into train and test matrices.

     Each line is split on whitespace: field 0 is indexed as the author,
     field 1 as the item and field 2 is parsed as an integer rating. The
     ratings are randomly permuted; the first ``split`` fraction goes into the
     training matrix and the remainder into the test matrix. Both are
     scipy.sparse.csc_matrix objects of the same shape and are stored as
     single-element lists in self.trainXList and self.testXList.

     :param dataFilename: path of the file to read
     :param split: fraction of the ratings assigned to the training matrix
     """
     printStep = 1000000

     authorIndexer = IdIndexer()
     itemIndexer = IdIndexer()
     ratings = array.array("i")

     # Start at -1 so that the line count logged below is 0 for an empty file
     # instead of raising NameError on an unbound loop variable.
     i = -1

     #Read train files
     # The context manager closes the file even if a line fails to parse.
     with open(dataFilename) as dataFile:
         for i, line in enumerate(dataFile):
             if i % printStep == 0:
                 logging.debug("Iteration: " + str(i))
             vals = line.split()

             authorIndexer.append(vals[0])
             itemIndexer.append(vals[1])
             ratings.append(int(vals[2]))

     logging.debug("Read file with " + str(i+1) + " lines")

     authorInds = numpy.array(authorIndexer.getArray())
     itemInds = numpy.array(itemIndexer.getArray())
     ratings = numpy.array(ratings)

     logging.debug("Number of authors: " + str(len(authorIndexer.getIdDict())))
     logging.debug("Number of items: " + str(len(itemIndexer.getIdDict())))
     logging.debug("Number of ratings: " + str(ratings.shape[0]))

     # Release the indexers before the sparse matrices are built.
     del authorIndexer
     del itemIndexer
     gc.collect()

     # NOTE(review): numpy.max raises on empty input, so a file with no
     # ratings still fails here.
     shape = (numpy.max(authorInds)+1, numpy.max(itemInds)+1)
     inds = numpy.random.permutation(ratings.shape[0])
     numTrain = int(inds.shape[0]*split)

     trainInds = inds[0:numTrain]
     trainX = scipy.sparse.csc_matrix((ratings[trainInds], (authorInds[trainInds], itemInds[trainInds])), shape=shape)

     testInds = inds[numTrain:]
     testX = scipy.sparse.csc_matrix((ratings[testInds], (authorInds[testInds], itemInds[testInds])), shape=shape)

     del authorInds, itemInds, ratings
     gc.collect()

     self.trainXList = [trainX]
     self.testXList = [testX]

コード例 #6

0

ファイルを表示

ファイル: IdIndexerTest.py プロジェクト: kentwang/sandbox

    def setUp(self):
        # Fix numpy's print formatting and random seed before each test.
        numpy.set_printoptions(suppress=True, precision=3, linewidth=150)
        numpy.random.seed(21)

        # Shared fixture: an indexer fed three distinct names, "james" twice.
        fixture = IdIndexer()
        self.indexer = fixture
        for name in ("john", "james", "mark", "james"):
            fixture.append(name)

コード例 #7

0

ファイルを表示

ファイル: ArnetMinerDataset.py プロジェクト: charanpald/wallhack

 def coauthorsGraphFromAuthors2(self, relevantExperts, field):
     """
     Take a set of relevant authors and return their coauthorship graph.

     The data file is self.dataDir + "__" + field (spaces removed) + ".csv".
     Each line is split on ";" and stripped; the author name is fields 1 and 2
     joined by a space and the article id is field 4. Lines with too few
     fields are ignored. For every article, each pair of its authors in which
     both names are in relevantExperts contributes one edge.

     :param relevantExperts: collection of author names; all of them are
         added to the indexer first, so each one gets a vertex
     :param field: field name used to build the data file name
     :return: (graph, authorIndexer) where graph is an igraph.Graph whose
         edges carry "weight" (summed over duplicate edges by simplify) and
         "invWeight" (1.0 / weight)
     """
     dataFileName = self.dataDir + "__" + field.replace(' ', '') + ".csv"
     authorIndexer = IdIndexer()
     author1Inds = array.array("i")
     author2Inds = array.array("i")

     # Maps an article id to the list of its author names.
     articleDict = {}

     for relevantExpert in relevantExperts:
         authorIndexer.append(relevantExpert)

     # The context manager closes the file even if reading fails part-way.
     with open(dataFileName) as dataFile:
         for line in dataFile:
             fields = [x.strip() for x in line.split(";")]
             try:
                 author = fields[1] + " " + fields[2]
                 articleId = fields[4]
             except IndexError:
                 #Ignore bad lines
                 continue

             articleDict.setdefault(articleId, []).append(author)

     # An article with a single author yields no pairs, so no length check
     # is needed before generating the combinations.
     for authors in articleDict.values():
         for author1, author2 in itertools.combinations(authors, 2):
             if author1 in relevantExperts and author2 in relevantExperts:
                 author1Ind = authorIndexer.append(author1)
                 author2Ind = authorIndexer.append(author2)

                 author1Inds.append(author1Ind)
                 author2Inds.append(author2Ind)

     logging.debug("Found " + str(len(authorIndexer.getIdDict())) + " coauthors")

     #Coauthor graph is undirected
     # numpy.int was only an alias of the builtin int and has been removed from
     # current NumPy releases, so the builtin is used directly.
     author1Inds = numpy.array(author1Inds, int)
     author2Inds = numpy.array(author2Inds, int)
     edges = numpy.c_[author1Inds, author2Inds]

     graph = igraph.Graph()
     graph.add_vertices(len(authorIndexer.getIdDict()))
     graph.add_edges(edges)
     graph.es["weight"] = numpy.ones(graph.ecount())
     # Duplicate edges are merged and their weights summed.
     graph.simplify(combine_edges=sum)
     graph.es["invWeight"] = 1.0/(numpy.array(graph.es["weight"]))

     return graph, authorIndexer

コード例 #8

0

ファイルを表示

ファイル: DatasetUtils.py プロジェクト: charanpald/wallhack

    def bookCrossing(minNnzRows=10, minNnzCols=3, quantile=90):
        """
        Load the Book-Crossing ratings CSV into a binary sparse matrix.

        The first line of the file is discarded. Every following line is split
        on ";" and surrounding double quotes are stripped: field 0 is indexed
        as the user, field 1 as the item and field 2 is parsed as an integer
        rating. An entry of the matrix is 1 when the rating is greater than 4
        or equal to 0, and 0 otherwise; X.prune() is then called and the
        result is passed through SparseUtils.pruneMatrixRowAndCols.

        :param minNnzRows: forwarded to SparseUtils.pruneMatrixRowAndCols
        :param minNnzCols: forwarded to SparseUtils.pruneMatrixRowAndCols
        :param quantile: not used by this function; kept so callers do not break
        :return: the matrix returned by SparseUtils.pruneMatrixRowAndCols
        """
        matrixFileName = PathDefaults.getDataDir() + "book-crossing/BX-Book-Ratings.csv"
        userIndexer = IdIndexer("i")
        itemIndexer = IdIndexer("i")

        ratings = array.array("f")
        logging.debug("Loading ratings from " + matrixFileName)

        # The context manager closes the file even if a line fails to parse.
        with open(matrixFileName) as matrixFile:
            matrixFile.readline()  # discard the first line before parsing
            for i, line in enumerate(matrixFile):
                if i % 1000000 == 0:
                    logging.debug("Iteration: " + str(i))
                vals = line.split(";")

                field1 = vals[0].strip("\"")
                field2 = vals[1].strip("\"")
                field3 = int(vals[2].strip("\"\n\r"))

                userIndexer.append(field1)
                itemIndexer.append(field2)
                ratings.append(field3)

        rowInds = userIndexer.getArray()
        colInds = itemIndexer.getArray()
        ratings = numpy.array(ratings)

        # numpy.int was only an alias of the builtin int and has been removed
        # from current NumPy releases, so the builtin is used directly.
        shape = (len(userIndexer.getIdDict()), len(itemIndexer.getIdDict()))
        X = sppy.csarray(shape, storagetype="row", dtype=int)
        X.put(numpy.array(numpy.logical_or(ratings > 4, ratings == 0), int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
        X.prune()

        X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

        logging.debug("Read file: " + matrixFileName)
        logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

        return X

コード例 #9

0

ファイルを表示

ファイル: ArnetMinerDataset.py プロジェクト: charanpald/wallhack

 def coauthorsGraphFromAuthors(self, relevantExperts):
     """
     Take a set of relevant authors and return their coauthorship graph.

     Every line of self.dataFilename is searched for the pattern "#@(.*)";
     the first match is split on "," into a set of stripped author names.
     Each pair of names from that set in which both are in relevantExperts
     contributes one edge.

     :param relevantExperts: collection of author names; all of them are
         added to the indexer first, so each one gets a vertex
     :return: (graph, authorIndexer) where graph is an igraph.Graph whose
         edges carry "weight" (summed over duplicate edges by simplify) and
         "invWeight" (1.0 / weight)
     """
     authorIndexer = IdIndexer()
     author1Inds = array.array("i")
     author2Inds = array.array("i")

     for relevantExpert in relevantExperts:
         authorIndexer.append(relevantExpert)

     # The context manager closes the file even if reading fails part-way.
     with open(self.dataFilename) as dataFile:
         for i, line in enumerate(dataFile):
             Util.printIteration(i, self.stepSize, self.numLines)
             matches = re.findall("#@(.*)", line)
             if not matches:
                 continue

             authors = {x.strip() for x in matches[0].split(",")}
             # Skip lines that mention none of the relevant experts.
             if len(authors.intersection(relevantExperts)) == 0:
                 continue

             for author1, author2 in itertools.combinations(authors, 2):
                 if author1 in relevantExperts and author2 in relevantExperts:
                     author1Ind = authorIndexer.append(author1)
                     author2Ind = authorIndexer.append(author2)

                     author1Inds.append(author1Ind)
                     author2Inds.append(author2Ind)

     logging.debug("Found " + str(len(authorIndexer.getIdDict())) + " coauthors")

     #Coauthor graph is undirected
     # numpy.int was only an alias of the builtin int and has been removed from
     # current NumPy releases, so the builtin is used directly.
     author1Inds = numpy.array(author1Inds, int)
     author2Inds = numpy.array(author2Inds, int)
     edges = numpy.c_[author1Inds, author2Inds]

     graph = igraph.Graph()
     graph.add_vertices(len(authorIndexer.getIdDict()))
     graph.add_edges(edges)
     graph.es["weight"] = numpy.ones(graph.ecount())
     # Duplicate edges are merged and their weights summed.
     graph.simplify(combine_edges=sum)
     graph.es["invWeight"] = 1.0/(numpy.array(graph.es["weight"]))

     return graph, authorIndexer

コード例 #10

0

ファイルを表示

ファイル: BoundExp.py プロジェクト: charanpald/wallhack

import matplotlib 
matplotlib.use("GTK3Agg")
import matplotlib.pyplot as plt  
from sandbox.util.PathDefaults import PathDefaults 
from sandbox.util.IdIndexer import IdIndexer
from sandbox.util.Latex import Latex 
from apgl.graph.GraphUtils import GraphUtils 

"""
We try to figure out the change in L_i and L_{i+1}
"""

numpy.set_printoptions(precision=4, suppress=True)

# Data directory for this experiment and the indexer used to number ids.
dataDir = PathDefaults.getDataDir() + "kcore/"
indexer = IdIndexer()

# Integer buffers; presumably the two endpoint indices of each edge.
node1Inds, node2Inds = array.array("i"), array.array("i")

# Empty result lists, one per quantity tracked by the experiment.
Ls, us = [], []
boundFro, bound2 = [], []
ks, eyes, deltas = [], [], []

for i in range(1, 9):
    print(i)

コード例 #11

0

ファイルを表示

ファイル: GenerateMendeleyCoauthors.py プロジェクト: charanpald/wallhack

def writeAuthorXMatrix(inputFileName, authorIndexerFilename, authorXFileName, reverse=False):
    """
    Build an author-by-document score matrix from a text file and save it.

    Nothing is done (apart from a log message) when authorXFileName already
    exists. Otherwise each line of inputFileName is split on whitespace:
    fields 0 and 1 are the author and document ids (swapped when reverse is
    true) and field 2 is parsed as an integer score. The resulting
    scipy.sparse.csr_matrix is written with scipy.io.mmwrite and the author
    indexer is pickled to authorIndexerFilename.

    :param inputFileName: path of the whitespace-separated input file
    :param authorIndexerFilename: path the author IdIndexer is pickled to
    :param authorXFileName: path the matrix is written to
    :param reverse: if true, field 1 is the author and field 0 the document
    """
    if not os.path.isfile(authorXFileName):
        authorIndex = IdIndexer()
        docIndex = IdIndexer()
        scores = array.array("i")

        # The context manager closes the input file even if a line fails to parse.
        with open(inputFileName) as fileObj:
            for i, line in enumerate(fileObj):
                if i % 500000 == 0:
                    logging.debug(i)
                vals = line.split()

                if reverse:
                    authorIndex.append(vals[1])
                    docIndex.append(vals[0])
                else:
                    authorIndex.append(vals[0])
                    docIndex.append(vals[1])

                scores.append(int(vals[2]))

        rowInds = numpy.array(authorIndex.getArray())
        colInds = numpy.array(docIndex.getArray())

        Y = scipy.sparse.csr_matrix((scores, (rowInds, colInds)))

        with open(authorIndexerFilename, "wb") as authorIndexerFile:
            pickle.dump(authorIndex, authorIndexerFile)
        scipy.io.mmwrite(authorXFileName, Y)
        logging.debug("Saved matrix to " + authorXFileName)
    else:
        logging.debug("File exists: " + authorXFileName)

コード例 #12

0

ファイルを表示

class IdIndexerTest(unittest.TestCase):
    """Tests for IdIndexer: appending ids and translating between ids and indices."""

    def setUp(self):
        # Fix numpy's print formatting and random seed before each test.
        numpy.set_printoptions(suppress=True, precision=3, linewidth=150)
        numpy.random.seed(21)

        # Shared fixture: three distinct names, with "james" appended twice.
        self.indexer = IdIndexer()

        self.indexer.append("john")
        self.indexer.append("james")
        self.indexer.append("mark")
        self.indexer.append("james")

    def testAppend(self):
        # A repeated id must be given the same index both times.
        indexer = IdIndexer()

        indexer.append("john")
        indexer.append("james")
        indexer.append("mark")
        indexer.append("james")

        nptst.assert_array_equal(indexer.getArray(), numpy.array([0, 1, 2, 1]))

    # assertEqual is used throughout: the assertEquals alias is deprecated and
    # was removed from unittest in Python 3.12.
    def testTranslate(self):
        self.assertEqual(self.indexer.translate(["mark"]), [2])
        self.assertEqual(self.indexer.translate(["john"]), [0])

        self.assertEqual(self.indexer.translate(["john", "james"]), [0, 1])

    def testReverseTranslate(self):
        # Both a single index and a list of indices are accepted.
        self.assertEqual(self.indexer.reverseTranslate(0), "john")
        self.assertEqual(self.indexer.reverseTranslate(1), "james")
        self.assertEqual(self.indexer.reverseTranslate(2), "mark")

        self.assertEqual(self.indexer.reverseTranslate([2, 1, 0]),
                         ["mark", "james", "john"])

    def testReverseTranslateDict(self):
        indDict = self.indexer.reverseTranslateDict()

        # Re-appending the id stored under index i must return i again.
        for i in range(len(self.indexer.getIdDict())):
            self.assertEqual(self.indexer.append(indDict[i]), i)

コード例 #13

0

ファイルを表示

    def testAppend(self):
        # Append four names, one of them twice, to a fresh indexer and check
        # that the repeated name is given the same index both times.
        names = ("john", "james", "mark", "james")
        freshIndexer = IdIndexer()
        for name in names:
            freshIndexer.append(name)

        expected = numpy.array([0, 1, 2, 1])
        nptst.assert_array_equal(freshIndexer.getArray(), expected)

コード例 #14

0

ファイルを表示

ファイル: IdIndexerTest.py プロジェクト: kentwang/sandbox

class IdIndexerTest(unittest.TestCase):
    """Tests for IdIndexer: appending ids and translating between ids and indices."""

    def setUp(self):
        # Fix numpy's print formatting and random seed before each test.
        numpy.set_printoptions(suppress=True, precision=3, linewidth=150)
        numpy.random.seed(21)

        # Shared fixture: three distinct names, with "james" appended twice.
        self.indexer = IdIndexer()

        self.indexer.append("john")
        self.indexer.append("james")
        self.indexer.append("mark")
        self.indexer.append("james")

    def testAppend(self):
        # A repeated id must be given the same index both times.
        indexer = IdIndexer()

        indexer.append("john")
        indexer.append("james")
        indexer.append("mark")
        indexer.append("james")

        nptst.assert_array_equal(indexer.getArray(), numpy.array([0, 1, 2, 1]))

    # assertEqual is used throughout: the assertEquals alias is deprecated and
    # was removed from unittest in Python 3.12.
    def testTranslate(self):
        self.assertEqual(self.indexer.translate(["mark"]), [2])
        self.assertEqual(self.indexer.translate(["john"]), [0])

        self.assertEqual(self.indexer.translate(["john", "james"]), [0, 1])

    def testReverseTranslate(self):
        # Both a single index and a list of indices are accepted.
        self.assertEqual(self.indexer.reverseTranslate(0), "john")
        self.assertEqual(self.indexer.reverseTranslate(1), "james")
        self.assertEqual(self.indexer.reverseTranslate(2), "mark")

        self.assertEqual(self.indexer.reverseTranslate([2, 1, 0]), ["mark", "james", "john"])

    def testReverseTranslateDict(self):
        indDict = self.indexer.reverseTranslateDict()

        # Re-appending the id stored under index i must return i again.
        for i in range(len(self.indexer.getIdDict())):
            self.assertEqual(self.indexer.append(indDict[i]), i)