コード例 #1
0
def runTransformer(FolderName, NumFilesToProcess):
    """Extract the filtered term list of the first files found in a folder.

    The folder listing is printed, then at most ``NumFilesToProcess`` entries
    are taken from the front of that listing. Each selected file is passed
    through ``DataTransformer.getCoreList`` and
    ``DataTransformer.filterCoreList``; once every file has been processed,
    each result is handed to ``outputTerm``.

    Note: ``os.listdir`` returns entries in arbitrary order, so which files
    count as "first" is not guaranteed to be stable across platforms.

    Args:
        FolderName: Folder containing the files to process.
        NumFilesToProcess: Maximum number of files to process. If the folder
            holds fewer files, all of them are processed.

    Returns:
        A list of ``(file_name, terms)`` tuples, one per processed file, in
        listing order.
    """
    folder = str(FolderName)
    fileList = os.listdir(folder)
    print("------------------------------------------")
    print(fileList)
    print("------------------------------------------")

    # A slice tolerates a request for more files than the folder contains
    # (indexing one by one raised IndexError). Clamp at zero so a negative
    # count still selects nothing rather than slicing from the end.
    desiredFiles = fileList[:max(NumFilesToProcess, 0)]
    print(len(desiredFiles))

    termSets = []
    for dfile in desiredFiles:
        firstFiltered = DataTransformer.getCoreList(f"{folder}/{dfile}")
        secondFiltered = DataTransformer.filterCoreList(firstFiltered)
        termSets.append((str(dfile), list(secondFiltered)))

    # Output happens only after every file was transformed successfully.
    for termSet in termSets:
        outputTerm(termSet, termSet[0])

    return termSets
コード例 #2
0
def runner(IndexFolderName, ContentFolderName, QueriesTxtName, K):
    """Write the best-ranked documents for every query to ``./Output.txt``.

    For each query returned by ``RankedRetrieval.tokenTransform`` the raw
    query text and its transformed token list are written, followed by up to
    ``K`` entries from ``RankedRetrieval.getScore`` (in the order that
    function returns them), and finally two blank lines.

    Args:
        IndexFolderName: Not used by this function; kept so existing callers
            keep working.
        ContentFolderName: Folder holding ``<documentID>.txt`` content files
            used to build the snippets.
        QueriesTxtName: Name of the queries file handed to
            ``RankedRetrieval.tokenTransform``.
        K: Maximum number of results per query; anything ``int()`` accepts.
    """
    # The context manager closes the output file even if a retrieval or
    # transformation call raises part-way through.
    with open("./Output.txt", "w", encoding='utf-8') as f:
        queries, content = RankedRetrieval.tokenTransform(str(QueriesTxtName))
        # Convert once so comparison and iteration agree on the same number;
        # clamp at zero so a negative K writes no results.
        limit = max(int(K), 0)

        # enumerate pairs each query with its raw text by position, which
        # stays correct when the same query appears more than once.
        for position, query in enumerate(queries):
            tokens = query[1]
            f.write(str(content[position]) + "\n")  # raw query
            f.write(str(tokens) + "\n")  # tokenized and transformed query

            scoreList = RankedRetrieval.getScore(tokens)
            # The slice yields every result when fewer than K exist.
            for score in scoreList[:limit]:
                _writeResult(f, score, tokens, ContentFolderName)

            f.write("\n")
            f.write("\n")


def _writeResult(f, score, tokens, ContentFolderName):
    """Write one result entry: ID line, snippet, score and per-token values."""
    docId = score[0]
    # NOTE(review): the original comment described this line as
    # "documentID <tab> documentName", but the ID is written twice separated
    # by two spaces; the format is kept as-is for existing consumers.
    f.write(f"{docId}  {docId}\n")

    somewhatCoreContent = DataTransformer.getCoreList(
        f"./{ContentFolderName}/{docId}.txt")
    filteredContent = DataTransformer.filterCoreList(somewhatCoreContent)
    # First 200 characters of the filtered content's string form.
    f.write(str(filteredContent)[:200] + "\n")

    f.write(str(score[2]) + "\n")  # score

    # Look values up by position rather than via list.index(token), which
    # would report the first occurrence's value for a repeated token.
    contributions = score[1]
    for tokenPosition, token in enumerate(tokens):
        f.write(f"{token}: {contributions[tokenPosition]} ")
    f.write("\n")