def rebuild():
    """Reset the search index and re-index every cached publication.

    Each file yielded by ``walkfiles('*.json')`` under
    ``CACHE_DIR / 'publication'`` is decoded as JSON.  After every decoded
    file a ``"<done> / <total>"`` counter is written to
    ``CACHE_DIR / 'indexer.progress'``; files that decode to a falsy value
    are counted but not indexed.  The index is refreshed once at the end.
    """
    es.reset_whole_index()
    publication_files = list((CACHE_DIR / 'publication').walkfiles('*.json'))
    total = len(publication_files)
    for position, path in enumerate(publication_files, start=1):
        record = json.loads(path.text())
        # Progress is reported before the emptiness check, so skipped files
        # still advance the counter.
        (CACHE_DIR / 'indexer.progress').write_text(
            '%d / %d' % (position, total))
        if record:
            es.index_publication(
                id=record['id'],
                title=record['title'],
                abstract=record['abstract'],
                authors=record['authors'],
                cited_ids=record['citations'],
                reference_ids=record['references'],
            )
    es.refresh()
# Example #2
0
def rebuild():
    """Rebuild the whole search index from the cached publication files.

    The index is reset first, then every ``*.json`` file returned by
    ``walkfiles`` below ``CACHE_DIR / 'publication'`` is loaded.  A
    ``"<current> / <total>"`` line is written to
    ``CACHE_DIR / 'indexer.progress'`` for each loaded file, and every
    non-empty publication is handed to ``es.index_publication``.  Finally
    the index is refreshed.
    """
    es.reset_whole_index()

    json_paths = list((CACHE_DIR / 'publication').walkfiles('*.json'))
    file_count = len(json_paths)

    for done, json_path in enumerate(json_paths, 1):
        doc = json.loads(json_path.text())

        progress_file = CACHE_DIR / 'indexer.progress'
        progress_file.write_text('%d / %d' % (done, file_count))

        # Empty / falsy documents are counted above but never indexed.
        if not doc:
            continue

        fields = dict(
            id=doc['id'],
            title=doc['title'],
            abstract=doc['abstract'],
            authors=doc['authors'],
            cited_ids=doc['citations'],
            reference_ids=doc['references'],
        )
        es.index_publication(**fields)

    es.refresh()
def _get_rank(cites, alpha=0.1, iterations=50):
    """
    Compute PageRank-style scores from a citation matrix.

    The citation matrix is blended with a uniform teleport term, every row
    is normalised to sum to 1, and the resulting transition matrix is squared
    ``iterations`` times (i.e. raised to the power ``2 ** iterations``).
    After each squaring a percentage is written to
    ``CACHE_DIR / 'pagerank.progress'``.

    :param cites: square np.array presenting citation of i -> j;
        indices are assumed 0..N
    :param alpha: teleport weight mixed into every entry before the rows are
        normalised; with ``alpha == 0`` a row without any citation would sum
        to zero and the normalisation would divide by zero
    :param iterations: number of times the transition matrix is squared
    :return: (N, N) array equal to an all-ones matrix multiplied by the final
        transition matrix, so every row holds that matrix's column sums
    """
    n = cites.shape[0]
    p = np.array(cites, dtype=np.float64)

    # Blend in the teleport term and normalise all rows in one vectorised
    # step instead of looping over the rows in Python.
    p = p * (1 - alpha) + alpha
    p /= p.sum(axis=1, keepdims=True)

    for i in range(iterations):
        p = np.dot(p, p)
        # Rounding lets the row sums drift away from 1 and every squaring
        # squares that drift, so without this step the error would be
        # amplified by roughly 2 ** iterations.
        p /= p.sum(axis=1, keepdims=True)
        # For the default of 50 iterations this yields 2%, 4%, ..., 100%.
        (CACHE_DIR / 'pagerank.progress').write_text(
            '{}%'.format((i + 1) * 100 // iterations))

    return np.dot(np.ones((n, n)), p)


# Script entry point: load every publication from the index, compute their
# ranks, store the ranks back and refresh the index.
# NOTE(review): `get_rank` is not defined in the visible part of this file
# (only `_get_rank` is) -- presumably a wrapper defined elsewhere; confirm.
if __name__ == '__main__':
    pubs = es._get_all_publications()
    ranks = get_rank(pubs)
    es.update_ranks(pubs, ranks)
    es.refresh()

# NOTE(review): this statement sits outside the __main__ guard, so it also
# runs when the module is imported.  It looks like a debugging check and
# indexes position 20 of the result -- confirm it is intentional.
print(es._get_all_publications()[20]['rank'])
def _get_rank(cites, alpha=0.1, iterations=50):
    """
    Derive PageRank-style scores from a square citation matrix.

    Steps: mix a uniform teleport weight into the citation matrix, scale
    each row so it sums to 1, then square the transition matrix
    ``iterations`` times (equivalent to the power ``2 ** iterations``).
    A percentage is written to ``CACHE_DIR / "pagerank.progress"`` after
    every squaring.

    :param cites: square np.array presenting citation of i -> j;
        indices are assumed 0..N
    :param alpha: teleport weight added to every entry before the rows are
        normalised; it should be positive, otherwise a row with no citations
        sums to zero and cannot be normalised
    :param iterations: how many times the transition matrix is squared
    :return: (N, N) array, the product of an all-ones matrix and the final
        transition matrix; each row therefore contains the column sums of
        that matrix
    """
    n = cites.shape[0]
    transition = np.array(cites, dtype=np.float64)

    # Whole-matrix teleport blend and row normalisation (replaces the
    # per-row Python loop).
    transition = transition * (1 - alpha) + alpha
    transition /= np.sum(transition, axis=1, keepdims=True)

    for step in range(1, iterations + 1):
        transition = np.dot(transition, transition)
        # Keep the matrix row-stochastic: floating-point drift in the row
        # sums is squared by every squaring and would otherwise grow by a
        # factor of about 2 ** iterations.
        transition /= np.sum(transition, axis=1, keepdims=True)
        # With the default 50 iterations the file reads 2%, 4%, ..., 100%.
        percent = step * 100 // iterations
        (CACHE_DIR / "pagerank.progress").write_text("{}%".format(percent))

    ones = np.ones((n, n))
    return np.dot(ones, transition)


# Script entry point: fetch all publications from the index, compute a rank
# for them, write the ranks back and refresh the index.
# NOTE(review): `get_rank` is not defined in the visible part of this file
# (only `_get_rank` is) -- presumably a wrapper defined elsewhere; confirm.
if __name__ == "__main__":
    pubs = es._get_all_publications()
    ranks = get_rank(pubs)
    es.update_ranks(pubs, ranks)
    es.refresh()

# NOTE(review): not protected by the __main__ guard, so this also executes on
# import.  It appears to be a debugging check that indexes position 20 of the
# result -- confirm it is intentional.
print(es._get_all_publications()[20]["rank"])