Code example #1
0
 def profileModelSelection(self):
     """Profile a full model-selection run on the 100000-article DBLP dump."""
     dataset = ArnetMinerDataset(runLSI=False)
     dataset.dataFilename = dataset.dataDir + "DBLP-citation-100000.txt"

     # Invalidate every cached artefact so the profile measures real work
     # rather than cache hits.
     for flag in ("overwrite", "overwriteVectoriser", "overwriteModel"):
         setattr(dataset, flag, True)

     # The profiled expression is evaluated against locals(), so it must
     # reference the local name `dataset`.
     ProfileUtils.profile('dataset.modelSelection()', globals(), locals())
Code example #2
0
 def setUp(self):
     """Seed the RNG and build a small test dataset with caches forced to regenerate."""
     numpy.random.seed(22)
     numpy.set_printoptions(precision=3, suppress=True)
     # logging.basicConfig(stream=sys.stdout, level=logging.ERROR)

     self.field = "Database"
     dataset = ArnetMinerDataset(additionalFields=[self.field])
     dataset.dataFilename = dataset.dataDir + "DBLP-citation-test.txt"
     # Force recomputation of all cached artefacts between test runs.
     for flag in ("overwrite", "overwriteModel", "overwriteVectoriser"):
         setattr(dataset, flag, True)
     self.dataset = dataset
Code example #3
0
 def profileComputeLDA(self):
     """Profile LDA computation for the "Boosting" field on the 100000-article dump."""
     dataset = ArnetMinerDataset("Boosting")
     dataset.dataFilename = dataset.dataDir + "DBLP-citation-100000.txt"
     dataset.maxRelevantAuthors = 100
     dataset.k = 200

     # Invalidate cached artefacts so the LDA computation really runs.
     for flag in ("overwrite", "overwriteVectoriser", "overwriteModel"):
         setattr(dataset, flag, True)

     # locals() must contain `dataset` for the profiled expression to resolve.
     ProfileUtils.profile('dataset.computeLDA()', globals(), locals())
Code example #4
0
    def testFindSimilarDocumentsLDA(self):
        """Smoke-test LDA-based similar-document search on two small dumps."""
        dataset = ArnetMinerDataset()
        dataset.dataFilename = dataset.dataDir + "DBLP-citation-1000.txt"
        dataset.k = 20
        # Regenerate all cached artefacts for a clean run.
        for flag in ("overwrite", "overwriteModel", "overwriteVectoriser"):
            setattr(dataset, flag, True)
        self.dataset = dataset

        # Check document is correct as well as authors
        self.dataset.findSimilarDocumentsLDA(self.field)

        # Test the ordering of the ranking on a larger dataset.
        print("Running on 10000 dataset using LDA")
        largeDataset = ArnetMinerDataset()
        largeDataset.minDf = 10**-5
        largeDataset.dataFilename = largeDataset.dataDir + "DBLP-citation-10000.txt"
        largeDataset.vectoriseDocuments()
        relevantExperts = largeDataset.findSimilarDocumentsLDA("Neural Networks")
Code example #5
0
 def testFindSimilarDocuments(self):
     """LSI similar-document search returns the expected ranked experts.

     Checks the tiny test dump first, then verifies the ranking order on
     the 10000-article dump.
     """
     field = "Object"
     self.dataset = ArnetMinerDataset()
     self.dataset.dataFilename = self.dataset.dataDir + "DBLP-citation-test.txt"

     #Check document is correct as well as authors
     self.dataset.vectoriseDocuments()
     relevantExperts = self.dataset.findSimilarDocumentsLSI(field)

     # assertEqual: assertEquals is a deprecated alias, removed in Python 3.12.
     self.assertEqual(['Jos\xc3\xa9 A. Blakeley'], relevantExperts)

     #Let's test order of ranking on larger dataset
     print("Running on 10000 dataset")
     dataset = ArnetMinerDataset()
     dataset.minDf = 10**-6
     dataset.dataFilename = dataset.dataDir + "DBLP-citation-10000.txt"
     dataset.vectoriseDocuments()
     relevantExperts = dataset.findSimilarDocumentsLSI("Neural Networks")

     self.assertEqual(['Christopher M. Bishop', 'Michael I. Jordan', 'Fred L. Kitchens', 'Ai Cheo', 'Cesare Alippi', 'Giovanni Vanini', 'C. C. Taylor', 'David J. Spiegelhalter', 'Donald Michie'], relevantExperts)
Code example #6
0
File: ReputationExp.py  Project: charanpald/wallhack
"""
Find out which experts exist in the DBLP dataset and how many abstracts 
they have. 
"""
import logging 
import sys 
import itertools 
import numpy 
from exp.influence2.ArnetMinerDataset import ArnetMinerDataset

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

# Load every author in the dump, then compute for each field the fraction of
# that field's known experts who actually appear as authors in the data.
dataset = ArnetMinerDataset()
#dataset.dataFilename = dataset.dataDir + "DBLP-citation-100000.txt"

authorList, documentList, citationList = dataset.readAuthorsAndDocuments()
authorSet = set(itertools.chain.from_iterable(authorList))

print("Found all authors")

expertMatchesDict = {}
for field in dataset.fields:
    experts = dataset.expertsDict[field]
    matched = sum(1 for expert in experts if expert in authorSet)
    # Fraction of this field's experts present in the dump.
    expertMatchesDict[field] = matched / float(len(experts))
Code example #7
0
File: ReputationExp2.py  Project: pierrebo/wallhack
from apgl.util.Evaluator import Evaluator

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
numpy.set_printoptions(precision=3, suppress=True, linewidth=160)
numpy.random.seed(21)

# Command line: -r/--runLDA selects LDA; the default is LSI.
parser = argparse.ArgumentParser(description='Run reputation evaluation experiments')
parser.add_argument("-r", "--runLDA", action="store_true", help="Run Latent Dirchlet Allocation")
args = parser.parse_args()
runLSI = not args.runLDA

# Evaluation parameters.
averagePrecisionN = 50
similarityCutoff = 0.30
ns = numpy.arange(5, 105, 5)

dataset = ArnetMinerDataset(runLSI=runLSI)
# Other available dumps: DBLP-citation-100000.txt, DBLP-citation-7000000.txt,
# DBLP-citation-Feb21.txt
dataset.dataFilename = dataset.dataDir + "DBLP-citation-1000000.txt"
# Regenerate the graph, model and vectoriser caches.
for flag in ("overwriteGraph", "overwriteModel", "overwriteVectoriser"):
    setattr(dataset, flag, True)

dataset.vectoriseDocuments()
dataset.loadVectoriser()


X = scipy.io.mmread(dataset.docTermMatrixFilename + ".mtx")
Code example #8
0
class ArnetMinerDatasetTest(unittest.TestCase):
    """Unit tests for ArnetMinerDataset: vectorisation, similarity search,
    coauthor graphs and expert matching.

    Uses small DBLP citation dumps; setUp forces all caches to regenerate.
    The deprecated ``assertEquals`` alias (removed in Python 3.12) has been
    replaced with ``assertEqual`` throughout.
    """

    def setUp(self):
        """Seed the RNG and build a fresh test dataset with caches invalidated."""
        numpy.random.seed(22)
        numpy.set_printoptions(suppress=True, precision=3)
        #logging.basicConfig(stream=sys.stdout, level=logging.ERROR)

        self.field = "Database"
        self.dataset = ArnetMinerDataset(additionalFields=[self.field])
        self.dataset.dataFilename = self.dataset.dataDir + "DBLP-citation-test.txt"
        self.dataset.overwrite = True
        self.dataset.overwriteModel = True
        self.dataset.overwriteVectoriser = True

    def testVectoriseDocuments(self):
        """Vectorisation of the test dump completes without error."""
        #Check document is correct as well as authors
        self.dataset.vectoriseDocuments()

    def testFindSimilarDocuments(self):
        """LSI similar-document search returns the expected ranked experts."""
        field = "Object"
        self.dataset = ArnetMinerDataset()
        self.dataset.dataFilename = self.dataset.dataDir + "DBLP-citation-test.txt"

        #Check document is correct as well as authors
        self.dataset.vectoriseDocuments()
        relevantExperts = self.dataset.findSimilarDocumentsLSI(field)

        self.assertEqual(['Jos\xc3\xa9 A. Blakeley'], relevantExperts)

        #Let's test order of ranking on larger dataset
        print("Running on 10000 dataset")
        dataset = ArnetMinerDataset()
        dataset.minDf = 10**-6
        dataset.dataFilename = dataset.dataDir + "DBLP-citation-10000.txt"
        dataset.vectoriseDocuments()
        relevantExperts = dataset.findSimilarDocumentsLSI("Neural Networks")

        self.assertEqual(['Christopher M. Bishop', 'Michael I. Jordan', 'Fred L. Kitchens', 'Ai Cheo', 'Cesare Alippi', 'Giovanni Vanini', 'C. C. Taylor', 'David J. Spiegelhalter', 'Donald Michie'], relevantExperts)

    def testFindCoauthors(self):
        """Coauthor-graph construction from LSI-relevant experts runs cleanly."""
        #Check document is correct as well as authors
        self.dataset.vectoriseDocuments()
        relevantExperts = self.dataset.findSimilarDocumentsLSI(self.field)
        self.dataset.coauthorsGraph(self.field, relevantExperts)

    def testCoauthorsGraphFromAuthors(self):
        """Graph built from two seed experts has the expected edges and weights."""
        releventExperts = set(["Yuri Breitbart", "Hector Garcia-Molina"])

        graph, authorIndexer = self.dataset.coauthorsGraphFromAuthors(releventExperts)

        self.assertEqual(graph.get_edgelist(), [(0, 1), (0, 2), (0, 4), (1, 2), (1, 3)])

        self.assertEqual(graph.es["weight"], [1, 1, 1, 1, 1])
        self.assertEqual(graph.es["invWeight"], [1, 1, 1, 1, 1])

        self.assertEqual(len(authorIndexer.getIdDict()), 5)

    def testMatchExperts(self):
        """matchExperts intersects relevant experts with the known expert set."""
        #TODO:
        self.dataset.vectoriseDocuments()
        relevantExperts = self.dataset.findSimilarDocumentsLSI("DBMS")
        expertsSet = self.dataset.expertsDict[self.field]

        expertMatches = self.dataset.matchExperts(relevantExperts, expertsSet)

        self.assertEqual(expertMatches, ['Nathan Goodman'])
        self.assertEqual(expertsSet, set(['Hector Garcia-Molina', 'Yuri Breitbart', 'Nathan Goodman']))

    def testExpertsFromDocSimilarities(self):
        """Experts are ranked by aggregated document-similarity score."""
        self.dataset.authorList = [["Joe Bloggs", "Alfred Nobel"], ["Ian Hislop"], ["Alfred Nobel", "Ian Hislop"]]
        similarities = numpy.array([0.4, 0.5, 0.8])

        experts = self.dataset.expertsFromDocSimilarities(similarities)
        self.assertEqual(experts, ['Ian Hislop', 'Alfred Nobel', 'Joe Bloggs'])

    def testFindSimilarDocumentsLDA(self):
        """Smoke-test LDA similarity search on the 1000- and 10000-article dumps."""
        self.dataset = ArnetMinerDataset()
        self.dataset.dataFilename = self.dataset.dataDir + "DBLP-citation-1000.txt"
        self.dataset.overwrite = True
        self.dataset.overwriteModel = True
        self.dataset.overwriteVectoriser = True
        self.dataset.k = 20

        #Check document is correct as well as authors
        self.dataset.findSimilarDocumentsLDA(self.field)

        #Let's test order of ranking on larger dataset
        print("Running on 10000 dataset using LDA")
        dataset = ArnetMinerDataset()
        dataset.minDf = 10**-5
        dataset.dataFilename = dataset.dataDir + "DBLP-citation-10000.txt"
        dataset.vectoriseDocuments()
        relevantExperts = dataset.findSimilarDocumentsLDA("Neural Networks")

        #self.assertEqual(['Christopher M. Bishop', 'Michael I. Jordan', 'Fred L. Kitchens', 'Ai Cheo', 'Cesare Alippi', 'Giovanni Vanini', 'C. C. Taylor', 'David J. Spiegelhalter', 'Donald Michie'], relevantExperts)

    @unittest.skip("")
    def testModelSelectionLSI(self):
        """Model selection over LSI parameters (skipped: long-running)."""
        self.dataset.dataFilename = self.dataset.dataDir + "DBLP-citation-1000.txt"
        self.dataset.overwrite = True
        self.dataset.overwriteModel = True
        self.dataset.overwriteVectoriser = True

        self.dataset.vectoriseDocuments()

        self.dataset.modelSelectionLSI()
Code example #9
0
File: MatchingExp.py  Project: charanpald/wallhack
"""
Looking at all articles with an abstract, restrict and save the experts 
"""

import logging 
import sys 
import itertools 
import numpy 
from exp.influence2.ArnetMinerDataset import ArnetMinerDataset

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

# Collect, per field, the sorted list of known experts who actually appear
# as authors in the dump.
dataset = ArnetMinerDataset()
#dataset.dataFilename = dataset.dataDir + "DBLP-citation-100000.txt"

authorList, documentList, citationList = dataset.readAuthorsAndDocuments()
authorSet = set(itertools.chain.from_iterable(authorList))

print("Found all authors")

expertMatchesDict = {}
for field in dataset.fields:
    matched = {expert for expert in dataset.expertsDict[field] if expert in authorSet}
    # Store a deterministic (sorted) list of matched experts for this field.
    expertMatchesDict[field] = sorted(matched)