コード例 #1
0
ファイル: spark.py プロジェクト: jomayee/Spark-PageRank
def outlink(li):
    """Parse one input line into a ``(title, (initial_rank, outlinks))`` record.

    The title is the text between ``<title>`` and ``</title>``; the outlinks
    are every ``[[...]]`` target found in the line, in order of appearance
    (possibly an empty list). The initial rank is ``1.0 / count``, where
    ``count`` is a module-level value.

    Returns ``None`` when ``li`` is empty or contains no ``<title>`` element.
    """
    if not li:
        return None
    title_match = re.search(r'\<title\>(.*?)\<\/title\>', li)
    if title_match is None:
        # Without a title there is no key for the record; treat the line like
        # an empty one instead of failing with AttributeError on ``None``.
        return None
    # re.findall always returns a list, so no empty-result fallback is needed.
    data = re.findall(r'\[\[(.*?)\]\]', li)
    return (title_match.group(1), (1.0 / count, data))
コード例 #2
0
def link(kk):
    """Parse one input line into a ``(title, (initial_rank, outlinks))`` record.

    The title is the text between ``<title>`` and ``</title>``; the outlinks
    are every ``[[...]]`` target found in the line (possibly an empty list).

    Returns ``None`` when ``kk`` is empty or contains no ``<title>`` element.
    """
    if not kk:
        return None
    title_match = re.search(r'\<title\>(.*?)\<\/title\>', kk)
    if title_match is None:
        # A line without a title cannot be keyed; return None like the
        # empty-line case rather than raising AttributeError on ``None``.
        return None
    # re.findall always returns a list, so no empty-result fallback is needed.
    rest = re.findall(r'\[\[(.*?)\]\]', kk)
    # NOTE(review): ``len`` is used as a divisor here, so it must be a numeric
    # module-level name shadowing the builtin; if it is not rebound elsewhere
    # in the file this division raises TypeError -- confirm and rename.
    return (title_match.group(1), (1.0 / len, rest))
コード例 #3
0
def createOlinks(lines):
    """Parse one input line into a ``(title, (rank, outlinks))`` tuple.

    The title is the text between ``<title>`` and ``</title>``; the outlinks
    are every ``[[...]]`` target found in the line (possibly an empty list).
    The initial rank is ``1.0 / linkCount``, where ``linkCount`` is a
    module-level value. Per the original author's note, this tuple shape is
    the format consumed by ``updateRank``.

    Returns ``None`` when ``lines`` is empty or has no ``<title>`` element.
    """
    if not lines:
        return None
    title_match = re.search(r'\<title\>(.*?)\<\/title\>', lines)
    if title_match is None:
        # No title means no key for this record; mirror the empty-input case
        # instead of raising AttributeError from ``None.group``.
        return None
    # re.findall always returns a list, so no empty-result fallback is needed.
    olinks = re.findall(r'\[\[(.*?)\]\]', lines)
    irank = 1.0 / linkCount
    return (title_match.group(1), (irank, olinks))
コード例 #4
0
ファイル: spark.py プロジェクト: jomayee/Spark-PageRank
    for outlink in outlinks:
        listToRet.append((outlink, (rankToDist, "")))
    return listToRet


def reddata(x, y):
    """Combine two ``(rank, outlinks)`` values that share a key.

    The ranks are added together. For the outlinks, ``y``'s value is used
    when it is non-empty, otherwise ``x``'s when it is non-empty, otherwise
    a new empty list.
    """
    total_rank = x[0] + y[0]
    # Precedence: y's outlinks, then x's, then a fresh empty list.
    kept_links = y[1] if y[1] else (x[1] if x[1] else [])
    return (total_rank, kept_links)


# Driver: build the initial rank table from the input file, run two rounds of
# rank propagation, and write the 100 highest-ranked titles to "outputDir2".
lines = sc.textFile(sys.argv[1])

# Count every distinct name that appears either as a <title> or as a [[link]]
# target; ``count`` is the divisor ``outlink`` uses for the initial rank.
titles = lines.flatMap(lambda x: re.findall(r'\<title\>(.*?)\<\/title\>', x))
datas = lines.flatMap(lambda x: re.findall(r'\[\[(.*?)\]\]', x)).distinct()
links = titles.union(datas).distinct()
count = links.count()
dataMap = lines.map(outlink)

for _ in range(2):
    mapped = dataMap.flatMap(distrank)
    dataMap = mapped.reduceByKey(reddata)
    # Rescale the summed rank for each key: 0.85 * sum + 0.15.
    dataMap = dataMap.map(lambda x: (x[0], (x[1][0] * .85 + .15, x[1][1])))

# Negating the rank makes takeOrdered return the largest ranks first. The
# earlier ascending takeOrdered call was dropped: its result was discarded,
# so it only cost an extra Spark job.
orderedRank = dataMap.takeOrdered(100, key=lambda atuple: -atuple[1][0])
sc.parallelize(orderedRank).map(
    lambda x: x[0] + "   " + str(x[1][0])).saveAsTextFile("outputDir2")