# Ejemplo n.º 1 (Example no. 1)
# 0
def getPartialMetaPath(graph, metaPathPart, nodeIndex, repetitions):
    """Build the list of adjacency tensors for a partial meta path.

    If the partial path is symmetric (same node type at both ends), the single
    forward tensor is repeated `repetitions` times; otherwise the tensor for
    the reversed path is computed and paired with the forward one.

    Returns a (tensorList, forwardTensor) tuple.
    """
    forwardTensor, _ = getMetaPathAdjacencyTensorData(graph, nodeIndex, metaPathPart)
    if metaPathPart[0] != metaPathPart[-1]:
        reversedPart = list(reversed(metaPathPart))
        backwardTensor, _ = getMetaPathAdjacencyTensorData(graph, nodeIndex, reversedPart)
        tensorList = [forwardTensor, backwardTensor]
    else:
        # NOTE(review): `repetitions` is only honored on the symmetric branch
        tensorList = [forwardTensor] * repetitions
    return tensorList, forwardTensor
def run():
    """Run the most-similar-authors CPPA ShapeSim experiment.

    Loads the citation graph, builds the conference-paper-paper-author
    adjacency tensor, tallies per-author and per-conference publication and
    citation counts, persists conference stats and sorted author citation
    counts to disk, then runs the similarity experiment for each test author.

    Returns:
        (citationCounts, publicationCounts, conferenceCitations,
         conferencePublications) — all keyed by graph node.
    """
    experiment = AuthorsShapeSimCPPAExperiment(
        None,
        'Most Similar CPPA ShapeSim Authors',
        outputFilePath=os.path.join('../results', 'authors', 'cppaShapeSim')
    )

    # Compute once, since these never change (with-block closes the handle,
    # which the original leaked)
    with open(os.path.join('../data', 'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)
    cppaAdjTensor, extraData = getMetaPathAdjacencyTensorData(
        graph, nodeIndex, ['conference', 'paper', 'paper', 'author']
    )
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']

    # Read paper citation counts; each line looks like "<count>: <title>"
    paperCitationCounts = defaultdict(int)
    with open(os.path.join('../data', 'paperCitationCounts')) as paperCitationsFile:
        for line in paperCitationsFile:
            splitIndex = line.find(': ')
            count, title = int(line[:splitIndex]), line[splitIndex + 2:].strip()
            paperCitationCounts[title] = count

    # Compute author publication counts
    allPapers = set(nodeIndex['paper'].values())
    allAuthors = set(nodeIndex['author'].values())
    publicationCounts, citationCounts = defaultdict(int), defaultdict(int)
    for author in allAuthors:
        for node in graph.successors(author):
            if node in allPapers:
                publicationCounts[author] += 1
                # .get avoids inserting zero entries into the defaultdict,
                # matching the original `x[k] if k in x else 0` guard
                citationCounts[author] += paperCitationCounts.get(node, 0)

    # Tally conference total publication and citation counts
    conferencePublications, conferenceCitations = defaultdict(int), defaultdict(int)
    allConferences = set(nodeIndex['conference'].values())
    for conference in allConferences:
        for node in graph.successors(conference):
            if node in allPapers:
                conferencePublications[conference] += 1
                conferenceCitations[conference] += paperCitationCounts[node]
    with open(os.path.join('..', 'data', 'conferenceStats'), 'w') as f:
        cPickle.dump((conferencePublications, conferenceCitations), f)

    # Output author citation counts in descending order
    citationCountsList = sorted(
        citationCounts.items(), key=operator.itemgetter(1), reverse=True
    )
    with open(os.path.join('../data', 'authorCitationCounts'), 'w') as outputFile:
        # Plain loop instead of map() — the original used map purely for its
        # side effect, with a Py2-only tuple-unpacking lambda
        for author, count in citationCountsList:
            outputFile.write('%d: %s\n' % (int(count), author))

    # Actually run the similarity experiments
    for testAuthor in testAuthors:
        experiment.runFor(testAuthor, cppaAdjTensor, extraData, citationCounts, publicationCounts)

    return citationCounts, publicationCounts, conferenceCitations, conferencePublications
# Ejemplo n.º 3 (Example no. 3)
# 0
def run():
    """Benchmark full vs. partial meta path adjacency tensor computation.

    For each configured meta path, averages three timings over `trials` runs:
    (1) building the full adjacency tensor directly, (2) building the
    partial-path tensors, and (3) only multiplying the partial tensors.
    Results are pickled to a file named 'results'.
    """

    # Experiments to run with meta path lengths (map of length to trial paths)
    p, a, t, c = 'paper', 'author', 'term', 'conference'
    metaPathLengthExperiments = {
        3: [
           [c, p, c],
        ],
        5: [
           [c, p, c, p, c],
        ],
        # 7: [
        #    [c, p, c, p, c, p, c],
        # ],
        # 9: [
        #    [c, p, c, p, c, p, c, p, c],
        # ],
    }

    # Close the graph file handle promptly instead of leaking it
    with open(os.path.join('..', 'data', 'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Map of experiment length to experiment, which contains a tuple of average times
    metaPathLengthExperimentResults = defaultdict(list)

    trials = 3

    for pathLength in sorted(metaPathLengthExperiments.keys()):
        for metaPath in metaPathLengthExperiments[pathLength]:

            # Time getting adjacency tensor directly
            fullTime = timeit.timeit(
                lambda: getMetaPathAdjacencyTensorData(graph, nodeIndex, metaPath), number=trials
            )
            fullTime /= float(trials)

            metaPathPart = [c, p, c] if metaPath[0] == c else [a, p, a]
            # Floor division stays an int under both Python 2 and Python 3
            # (plain / would yield a float repetition count on Python 3)
            repetitions = (len(metaPath) - 1) // 2

            # Find the partial meta path adjacency list
            adjTensors, adjTensor = getPartialMetaPath(graph, metaPathPart, nodeIndex, repetitions)
            partialTime = timeit.timeit(
                lambda: getPartialMetaPath(graph, metaPathPart, nodeIndex, repetitions), number=trials
            )
            partialTime /= float(trials)

            # Multiply for full adj tensor
            multiplyTime = timeit.timeit(lambda: multiplyFullAdjTensor(adjTensors, repetitions), number=trials)
            multiplyTime /= float(trials)

            # Output results (parenthesized print works identically on Py2/Py3)
            metaPathLengthExperimentResults[pathLength].append((fullTime, partialTime, multiplyTime))
            print("Full Path: %.3f seconds, Partial Paths: %.3f seconds, Multiplication Only: %.3f, [%s]" % (
                fullTime, partialTime, multiplyTime, ', '.join(metaPath)
            ))

    # with-block closes the results file deterministically (original leaked it)
    with open('results', 'w') as resultsFile:
        cPickle.dump(metaPathLengthExperimentResults, resultsFile)
def run(citationCounts, publicationCounts):
    """Run the relative (lambda=0) CPPA ShapeSim author similarity experiment
    for every test author, using precomputed citation/publication counts."""
    outputPath = os.path.join("../results", "authors", "cppaShapeSim-Relative")
    experiment = AuthorsShapeSimCPPARelativeExperiment(
        None,
        "Most Similar CPPA ShapeSim Authors lambda=0",
        outputFilePath=outputPath,
    )

    # These inputs never change, so build them a single time up front
    graph, nodeIndex = cPickle.load(open(os.path.join("../data", "graphWithCitations")))
    metaPath = ["conference", "paper", "paper", "author"]
    cppaAdjTensor, extraData = getMetaPathAdjacencyTensorData(graph, nodeIndex, metaPath)
    extraData["fromNodes"] = extraData["toNodes"]
    extraData["fromNodesIndex"] = extraData["toNodesIndex"]

    # Run the similarity experiment once per test author
    for testAuthor in testAuthors:
        experiment.runFor(testAuthor, cppaAdjTensor, extraData, citationCounts, publicationCounts)
# Ejemplo n.º 5 (Example no. 5)
# 0
def run(citationCounts, publicationCounts):
    """Run the relative-partial (lambda=0.5) CPPA ShapeSim author similarity
    experiment for every test author."""
    resultsDir = os.path.join('../results', 'authors', 'cppaShapeSim-RelativePartial')
    experiment = AuthorsShapeSimCPPARelativePartialExperiment(
        None,
        'Most Similar CPPA ShapeSim Authors lambda=0.5',
        outputFilePath=resultsDir,
    )

    # Load the graph and build the CPPA tensor once; they never change
    graph, nodeIndex = cPickle.load(open(os.path.join('../data', 'graphWithCitations')))
    cppaPath = ['conference', 'paper', 'paper', 'author']
    cppaAdjTensor, extraData = getMetaPathAdjacencyTensorData(graph, nodeIndex, cppaPath)
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']

    # Kick off one experiment per test author
    for testAuthor in testAuthors:
        experiment.runFor(testAuthor, cppaAdjTensor, extraData, citationCounts, publicationCounts)
def run(conferenceCitations, conferencePublications):
    """Run the TPPC ShapeSim conference similarity experiment for every test
    conference, using precomputed conference citation/publication tallies."""
    experiment = ConferencesShapeSimTPPCExperiment(
        None,
        'Most Similar TPPC ShapeSim Conferences',
        outputFilePath=os.path.join('../results', 'conferences', 'tppcShapeSim'),
    )

    # Load once; the graph and tensor are invariant across test conferences
    graph, nodeIndex = cPickle.load(open(os.path.join('../data', 'graphWithCitations')))
    tppcPath = ['term', 'paper', 'paper', 'conference']
    tppcAdjTensor, extraData = getMetaPathAdjacencyTensorData(graph, nodeIndex, tppcPath)
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']

    # Actually run the similarity experiments
    for testConference in testConferences:
        experiment.runFor(testConference, tppcAdjTensor, extraData, conferenceCitations, conferencePublications)
def run():
    """Run the most-similar-authors CPPA ShapeSim experiment.

    Loads the citation graph, builds the conference-paper-paper-author
    adjacency tensor, tallies per-author and per-conference publication and
    citation counts, persists conference stats and sorted author citation
    counts to disk, then runs the similarity experiment for each test author.

    Returns:
        (citationCounts, publicationCounts, conferenceCitations,
         conferencePublications) — all keyed by graph node.
    """
    experiment = AuthorsShapeSimCPPAExperiment(
        None,
        'Most Similar CPPA ShapeSim Authors',
        outputFilePath=os.path.join('../results', 'authors', 'cppaShapeSim'))

    # Compute once, since these never change (with-block closes the handle,
    # which the original leaked)
    with open(os.path.join('../data', 'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)
    cppaAdjTensor, extraData = getMetaPathAdjacencyTensorData(
        graph, nodeIndex, ['conference', 'paper', 'paper', 'author'])
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']

    # Read paper citation counts; each line looks like "<count>: <title>"
    paperCitationCounts = defaultdict(int)
    with open(os.path.join('../data', 'paperCitationCounts')) as paperCitationsFile:
        for line in paperCitationsFile:
            splitIndex = line.find(': ')
            count, title = int(line[:splitIndex]), line[splitIndex + 2:].strip()
            paperCitationCounts[title] = count

    # Compute author publication counts
    allPapers = set(nodeIndex['paper'].values())
    allAuthors = set(nodeIndex['author'].values())
    publicationCounts, citationCounts = defaultdict(int), defaultdict(int)
    for author in allAuthors:
        for node in graph.successors(author):
            if node in allPapers:
                publicationCounts[author] += 1
                # .get avoids inserting zero entries into the defaultdict,
                # matching the original `x[k] if k in x else 0` guard
                citationCounts[author] += paperCitationCounts.get(node, 0)

    # Tally conference total publication and citation counts
    conferencePublications, conferenceCitations = defaultdict(int), defaultdict(int)
    allConferences = set(nodeIndex['conference'].values())
    for conference in allConferences:
        for node in graph.successors(conference):
            if node in allPapers:
                conferencePublications[conference] += 1
                conferenceCitations[conference] += paperCitationCounts[node]
    with open(os.path.join('..', 'data', 'conferenceStats'), 'w') as f:
        cPickle.dump((conferencePublications, conferenceCitations), f)

    # Output author citation counts in descending order
    citationCountsList = sorted(
        citationCounts.items(), key=operator.itemgetter(1), reverse=True)
    with open(os.path.join('../data', 'authorCitationCounts'), 'w') as outputFile:
        # Plain loop instead of map() — the original used map purely for its
        # side effect, with a Py2-only tuple-unpacking lambda
        for author, count in citationCountsList:
            outputFile.write('%d: %s\n' % (int(count), author))

    # Actually run the similarity experiments
    for testAuthor in testAuthors:
        experiment.runFor(testAuthor, cppaAdjTensor, extraData, citationCounts,
                          publicationCounts)

    return citationCounts, publicationCounts, conferenceCitations, conferencePublications
# Ejemplo n.º 8 (Example no. 8)
# 0
def imbalancedCitationsPublicationsExample():
    """
      Illustrative example of imbalanced citations / publications to verify ShapeSim is working correctly
    """

    # Synthetic graph: a single conference, six authors, and generated papers
    graph = MultiDiGraph()
    authors = ['Alice', 'Bob', 'Carol', 'Dave', 'Ed', 'Frank']
    conference = 'KDD'

    # Citation & publication count configuration: author -> (citations, publications)
    citationsPublications = {
        'Alice': (100, 10),
        'Bob': (80, 10),
        'Carol': (100, 100),
        'Dave': (50, 10),
        'Ed': (10, 10),
        'Frank': (1000, 100)
    }

    # Running (citations, publications) tallies actually realized in the graph
    actualCitationsPublications = defaultdict(lambda: (0, 0))

    # Helper functions for repeatedly adding papers to the graph
    # (each returns the list of papers created, for collection into allPapers)
    addPapersToAuthor = lambda n, author: [addPublicationPaper(author) for _ in itertools.repeat(None, n)]
    addCitationsToPaper = lambda n, paper, author: [addCitationPaper(paper, author) for _ in itertools.repeat(None, n)]

    # Helper for getting the next id
    # NOTE(review): relies on a module-level `nextId` counter defined elsewhere
    # in this file — confirm it is initialized before this function runs
    def __getNextId():
        global nextId
        oldId = nextId
        nextId += 1
        return oldId

    def addPublicationPaper(author):
        """
          Helper method to add a 'publication' paper, connected to both an author and a conference
        """
        paper = "%s's Paper %d" % (author, (__getNextId()))
        graph.add_node(paper)
        # Bidirectional edges: author <-> paper and paper <-> conference
        graph.add_edges_from([(author, paper), (paper, author), (paper, conference), (conference, paper)])

        # Bump the author's publication tally (tuples are immutable, so rebuild)
        citationCount, publicationCount = actualCitationsPublications[author]
        actualCitationsPublications[author] = (citationCount, publicationCount + 1)

        return paper

    def addCitationPaper(citedPaper, citedAuthor):
        """
          Helper method to add a 'citation' paper, which is only connected to the conference and the paper it cites
        """
        citingPaper = "Citing Paper %d" % __getNextId()
        graph.add_node(citingPaper)
        # One-way citation edge; the citing paper has no author in this example
        graph.add_edges_from([(conference, citingPaper), (citingPaper, conference), (citingPaper, citedPaper)])

        # Bump the cited author's citation tally
        citationCount, publicationCount = actualCitationsPublications[citedAuthor]
        actualCitationsPublications[citedAuthor] = (citationCount + 1, publicationCount)

        return citingPaper

    allPapers = []

    # Construct the graph
    graph.add_nodes_from(authors + [conference])
    for authorName in citationsPublications:
        citationCount, publicationCount = citationsPublications[authorName]

        # Add citations & publications to author
        authorPapers = addPapersToAuthor(publicationCount, authorName)
        allPapers.extend(authorPapers)
        # Python 2 integer division: citations are spread evenly per paper
        # (all configured counts above divide evenly, so nothing is lost)
        citationsPerPaper = citationCount / publicationCount
        for paper in authorPapers:
            citingPapers = addCitationsToPaper(citationsPerPaper, paper, authorName)
            allPapers.extend(citingPapers)

    # Node index expected by the adjacency-building helpers: type -> {id: node}
    nodeIndex = {
        'paper': {i: allPapers[i] for i in xrange(0, len(allPapers))},
        'conference': {0: 'KDD'},
        'author': {0: 'Alice', 1: 'Bob', 2: 'Carol', 3: 'Dave', 4: 'Ed', 5: 'Frank'}
    }

    # Test PathSim / NeighborSim
    cpaAdjMatrix, extraData = getMetaPathAdjacencyData(graph, nodeIndex, ['conference', 'paper', 'author'])
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']
    neighborSimMostSimilar, similarityScores = findMostSimilarNodes(
        cpaAdjMatrix, 'Alice', extraData, method=getNeighborSimScore
    )

    # Test ShapeSim
    cppaAdjTensor, extraData = getMetaPathAdjacencyTensorData(
        graph, nodeIndex, ['conference', 'paper', 'paper', 'author']
    )
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']
    shapeSimMostSimilar, similarityScores = findMostSimilarNodes(
        cppaAdjTensor, 'Alice', extraData, method=getNumpyShapeSimScore, alpha=1.0
    )

    # Output similarity scores as text tables, one per similarity method
    for name, mostSimilar in [('NeighborSim', neighborSimMostSimilar), ('ShapeSim', shapeSimMostSimilar)]:
        print('\n%s Most Similar to "%s":' % (name, 'Alice'))
        mostSimilarTable = texttable.Texttable()
        rows = [['Author', 'Score']]
        rows += [[name, score] for name, score in mostSimilar]
        mostSimilarTable.add_rows(rows)
        print(mostSimilarTable.draw())