Example #1
0
def query():
    ''' the main query processing program, using QueryProcessor

    Reads its configuration from module-level names (query_doc, query_id,
    index_file, process_alg) -- presumably parsed from the command line by
    the caller; TODO confirm against the surrounding script.

    process_alg: '0' for booleanQuery and '1' for vectorQuery.
    For booleanQuery, prints the list of matching document IDs;
    for vectorQuery, delegates output to QueryProcessor.vectorQuery.
    '''

    collect = Collection()

    # Find the text of the requested query; bail out explicitly instead of
    # crashing with an UnboundLocalError when the id is absent.
    query_text = None
    qf = loadCranQry(query_doc)
    for q in qf:
        if qf[q].qid == query_id:
            query_text = qf[q].text
            break  # query ids are unique, so stop at the first match
    if query_text is None:
        print("query id", query_id, "not found in", query_doc)
        return

    # Load the JSON inverted-index file; the processor receives its items.
    with open(index_file, "r") as read_file:
        index_data = json.load(read_file).items()

    queryprocess = QueryProcessor(query_text, index_data, collect.docs)

    querytokens = queryprocess.preprocessing()
    print("process alg:", process_alg)
    if process_alg == '0':
        result = queryprocess.booleanQuery(querytokens)
        print("Query results", result)
    elif process_alg == '1':
        result = queryprocess.vectorQuery(querytokens)
    else:
        print("enter 0 for boolean query and 1 for vector query")
Example #2
0
def query():
    ''' the main query processing program, using QueryProcessor

    Command line: python query.py <index-file-path> <processing-algorithm>
                  <query.txt path> <query-id>

    processing-algorithm: 0 for booleanQuery and 1 for vectorQuery.
    For booleanQuery, prints the list of matching document IDs;
    for vectorQuery, prints the top 3 most similar documents.
    '''

    # Validate the argument count. Use !=, not "is not": identity tests on
    # ints are an implementation detail and raise SyntaxWarning on 3.8+.
    if len(argv) != 5:
        print(
            "Syntax: python query.py <index-file-path> <processing-algorithm> <query.txt path> <query-id>"
        )
        return

    # Grab arguments
    index_file_loc = argv[1]
    processing_algo = argv[2]
    query_file_path = argv[3]
    query_id = argv[4]

    # Grab index file to restore II
    ii = InvertedIndex()
    ii.load(index_file_loc)

    # Get the document collection
    cf = CranFile("cran.all")

    # Get the query collection
    qc = loadCranQry(query_file_path)

    # Cranfield query ids are zero-padded to three digits ("001", "050").
    query_id = str(int(query_id)).zfill(3)
    try:
        query = qc[query_id].text
    except KeyError:
        print("Invalid query id", query_id)
        return

    # Initialize a query processor
    qp = QueryProcessor(query, ii, cf)

    # Do query (== instead of "is" for the same reason as above)
    if int(processing_algo) == 0:
        result = qp.booleanQuery()
        if result:
            # Reuse the result already computed instead of running the
            # (potentially expensive) boolean query a second time.
            print("Results:", ", ".join(str(x) for x in result))
        else:
            print("Results: None")
    elif int(processing_algo) == 1:
        result = qp.vectorQuery(k=3)
        print("Results:")
        for r in result:
            print("Doc", r[0], "Score", r[1])
    else:
        print("Invalid processing algorithm",
              processing_algo + ". Use 0 (boolean) or 1 (vector).")
def query():
    ''' the main query processing program, using QueryProcessor

    Command line: python query.py <index_file> <processing_algorithm>
                  <query_file> <query_id>

    processing_algorithm: '0' for the boolean model, '1' for the vector
    model (top 3 ranked results).
    '''

    II = index.InvertedIndex()
    index_file = II.load(sys.argv[1])  # restore the saved inverted index

    proc_alg = sys.argv[2]  # '0' = boolean model, '1' = vector model
    q_text = sys.argv[3]    # path to the Cranfield query file
    qid = sys.argv[4]       # id of the query to run, e.g. '069'

    qrys = cranqry.loadCranQry(q_text)  # qrys is a dict keyed by query id
    #qrys =  cranqry.loadCranQry('../CranfieldDataset/query.text')  #can also be hard-coded like this one

    #qid = '069'   #example of hard-coding a query id
    qp = QueryProcessor(
        qrys[qid].text, index_file,
        'cran.all')  # qid and index_file are supplied by the user

    if proc_alg == '0':
        # Run the query exactly once and print its result (it was
        # previously executed twice: once discarded, once for printing).
        print(qp.booleanQuery())
    elif proc_alg == '1':
        # Top 3 ranked results for the vector model, computed once.
        print(qp.vectorQuery(3))
Example #4
0
def query(indexfilename, processingalgorithm, queryfilename, queryid, numresults=3):
    ''' the main query processing program, using QueryProcessor

    indexfilename: path of the inverted-index file produced in part 1.
    processingalgorithm: '0' for booleanQuery, '1' for vectorQuery.
    queryfilename: Cranfield query file (e.g. "query.text").
    queryid: id of the query to run.
    numresults: number of top documents for the vector model (default 3).

    Returns the result list from the selected model, or None when
    processingalgorithm is neither '0' nor '1'.
    '''

    qrys = loadCranQry(queryfilename)
#    for q in qrys:
#        print(q, qrys[q].text)

    loadiindex = InvertedIndex()
    loadiindex = loadiindex.load(indexfilename)
#    print("index loaded")

    cf = CranFile('cran.all')

    queryProcessor = QueryProcessor(qrys, loadiindex, cf.docs, numresults)
    # Initialize results so an unknown algorithm no longer raises
    # UnboundLocalError at the return statement below.
    results = None
    if processingalgorithm == '0':
        queryProcessor.preprocessing()
        queryProcessor.queryId = queryid
        results = queryProcessor.booleanQuery()
    elif processingalgorithm == '1':  # elif: the two modes are exclusive
        queryProcessor.queryId = queryid
        results = queryProcessor.vectorQuery(queryProcessor.numofresults)
    else:
        print("enter 0 for boolean query and 1 for vector query")
    return results
Example #5
0
def query():
    ''' the main query processing program, using QueryProcessor

    Command line: python query.py <index_name> <algorithm> <query_file>
                  <query_id>, where algorithm is '0' (boolean) or
                  '1' (vector, top 6 results).
    '''

    # Read the four positional command-line arguments.
    index_path = sys.argv[1]
    algorithm = sys.argv[2]
    query_path = sys.argv[3]
    query_id = sys.argv[4]

    # The index was pickled with a ".p" extension.
    index_path += ".p"

    # How many ranked results the vector model should display.
    top_k = 6

    # Load query.text and build a processor for the requested query.
    queries = cranqry.loadCranQry(query_path)
    processor = QueryProcessor(queries[query_id].text, index_path)
    processor.preprocessing()

    if algorithm == '0':
        processor.booleanQuery()
    if algorithm == '1':
        processor.vectorQuery(top_k)
def query(index_file, algorithm, query_file, query_id):
    ''' the main query processing program, using QueryProcessor

    algorithm: '0' selects the boolean model, '1' the vector model
    (top 3 documents). Prints the selected model's results.
    '''
    # Load the Cranfield queries, the saved inverted index, and the docs.
    loaded_queries = cranqry.loadCranQry(query_file)
    inverted = InvertedIndex()
    inverted = inverted.load(index_file)
    collection = cran.CranFile('cran.all')

    processor = QueryProcessor(loaded_queries, inverted, collection.docs)
    processor.preprocessing()

    results = None
    if algorithm == '0':    # boolean model
        results = processor.booleanQuery(query_id)
    elif algorithm == '1':  # vector model
        results = processor.vectorQuery(3, query_id)
    print(results)
def query(index_file, processing_algorithm, query_file, query_id):
    """ the main query processing program, using QueryProcessor"""
    # Map every query id to its raw text and pick the requested one.
    loaded = cranqry.loadCranQry(query_file)
    text_by_id = {qid: loaded[qid].text for qid in loaded}
    selected_text = text_by_id[query_id]

    # Restore the inverted index that was saved in part 1.
    inverted = index.InvertedIndex()
    loaded_items = inverted.load(index_file)

    processor = QueryProcessor(selected_text, loaded_items, index_file)
    processor.preprocessing()

    doc_ids = []
    if processing_algorithm == "0":    # boolean Query
        doc_ids = processor.booleanQuery()
    elif processing_algorithm == "1":  # vector Query, top 3 by ranking
        doc_ids = processor.vectorQuery(3)
    else:
        print("Invalid Processing algorithm")
    print(doc_ids)
    return doc_ids
def query(index_file, model_type, query_file, query_id):
    ''' the main query processing program, using QueryProcessor

    model_type: 0 for the boolean model, 1 for the vector model
    (top 3 documents), 2 for batch evaluation.
    '''
    # Load the document collection, the saved index, and the queries.
    documents = cran.CranFile("cran.all")
    loaded_index = InvertedIndex().load(index_file)
    queries = loadCranQry(query_file)

    processor = QueryProcessor(queries, loaded_index, documents, query_id)

    # The three modes are mutually exclusive integers.
    if model_type == 0:
        print(processor.booleanQuery())
    elif model_type == 1:
        print(processor.vectorQuery(3))
    elif model_type == 2:
        processor.BatchEvaluation()
def eval(testOn):
    """Benchmark and score the boolean and vector retrieval models.

    Repeatedly (numberOfTimeToLoop times) samples random queries from the
    Cranfield query file, runs both booleanQuery and vectorQuery on each,
    scores the results against the qrels judgements with NDCG, and finally
    prints average NDCG scores plus t-test and Wilcoxon p-values comparing
    the two models.

    Command-line arguments (sys.argv): [1] index file path, [2] query text
    path, [3] qrels text path, [4] number of queries to sample per
    iteration.

    testOn: when truthy, prints per-step timings and intermediate results.
    """
    k = 10  # k the number of top k pairs of (docID, similarity) to get from vectorQuery
    dictQ_ID = []
    indexFile = sys.argv[1]  #v "src/Data/tempFile"
    queryText = sys.argv[2]
    qrelsText = sys.argv[3]
    dictOfQuery = {}
    dictQrelsText = {}
    docCollection = CranFile('./CranfieldDataset/cran.all')
    NDCGScoreBool = []
    numberOfQueries = int(sys.argv[4])
    NDCGScoreVector = []
    #indexFile           = "src/Data/tempFile"
    #queryText           = 'src/CranfieldDataset/query.text'
    #qrelsText           = 'src/CranfieldDataset/qrels.text'
    #numberOfQueries     = 50
    numberOfTimeToLoop = 5

    #Loads Files
    listOfQueryRelsMaping = readFile(qrelsText)
    queryFile = loadCranQry(queryText)

    #Data Need
    for i in range(numberOfTimeToLoop):

        #Get random Queiry
        dictOfQuery = getRandomQuery(queryFile, numberOfQueries)
        if testOn:
            assert len(dictOfQuery
                       ) == numberOfQueries, "Error are getting random query"

        # Return all query
        # dictOfQuery = getAllDataItems(queryFile)
        # if testOn:
        #     assert len(dictOfQuery) == 225, "Error are getting random query"

        #get list of Query result from qrel.txt
        dictQrelsText = getResultsFrom_QrelsFile(listOfQueryRelsMaping,
                                                 dictOfQuery)
        if testOn:
            assert len(dictQrelsText
                       ) == numberOfQueries, "Error number Of Queries to large"

        start = timer()
        queryProcessor = QueryProcessor(
            "", indexFile,
            docCollection.docs)  # This is an extremely expensive process\
        end = timer()

        if testOn:
            print("Time for creating QueryProcessor:", end - start)
        countDoc = 0
        start = timer()

        dictQ_ID = []
        for qid, queryText in dictOfQuery.items():
            countDoc += 1

            dictQ_ID.append(qid)

            if testOn:
                print("QID:", qid)
            start = timer()
            queryProcessor.loadQuery(queryText)
            end = timer()
            if testOn:
                print("Time for Load:", end - start)
                print("qrels: ", dictQrelsText[qid])

            start = timer()
            docIDs = queryProcessor.booleanQuery(
            )  # data would need to be like this [12, 14, 78, 141, 486, 746, 172, 573, 1003]
            #docIDs_1 = queryProcessor.booleanQuery_1()
            end = timer()
            if testOn:
                print("Time for booleanQuery:", end - start)

            start = timer()
            listOfDocIDAndSimilarity = queryProcessor.vectorQuery(
                k
            )  # data need to look like k=3 [[625,0.8737006126353902],[401,0.8697643788341478],[943,0.8424991316663082]]
            #vectorQueryDict[qid] = dictOfDocIDAndSimilarity
            end = timer()
            if testOn:
                print("Time for vectorQuery:", end - start)
                print("booleanQuery:", docIDs)

            #For Boolean part
            # Boolean results are unranked: score every returned doc 1 and
            # label it relevant only if it appears in qrels for this query.
            start = timer()
            yTrue = []
            yScore = []
            for docID in docIDs:
                yScore.append(1)
                if docID in dictQrelsText[qid]:
                    yTrue.append(1)
                else:
                    yTrue.append(0)
            yTrue.sort(reverse=True)
            score = metrics.ndcg_score(yTrue[:k], yScore[:k], k, "exponential")
            if math.isnan(score):
                NDCGScoreBool.append(0)
            else:
                NDCGScoreBool.append(score)
            end = timer()
            if testOn:
                print("Time for  Boolean ndcg:", end - start)

            #For Vector part
            # Vector results carry similarity scores; relevance labels again
            # come from qrels membership.
            start = timer()
            yTrue = []
            yScore = []
            if testOn:
                print("vectorQuery:", listOfDocIDAndSimilarity)
            for docID_Score in listOfDocIDAndSimilarity:
                yScore.append(float(docID_Score[1]))
                if docID_Score[0] in dictQrelsText[qid]:
                    yTrue.append(1)
                else:
                    yTrue.append(0)
            yTrue.sort(reverse=True)
            score = metrics.ndcg_score(yTrue[:k], yScore[:k], k, "exponential")
            if math.isnan(score):
                NDCGScoreVector.append(0)
            else:
                NDCGScoreVector.append(score)
            end = timer()
            if testOn:
                print("Time for  Vector ndcg:", end - start)
        print("\nRunning Querys iteration:(", str(i + 1), ")\n", dictQ_ID)

        if testOn:
            for QID, boolScore, vectorScore in zip(dictQ_ID, NDCGScoreBool,
                                                   NDCGScoreVector):
                print("QID", QID, "Boolean Model:", boolScore, "Vector Model",
                      vectorScore)

    print("\nThe Length Of Both NDCG Score is: ", len(NDCGScoreBool), "==",
          len(NDCGScoreVector))

    print('\nThe Avg NDCG Score')
    vectorAvg = avg(NDCGScoreVector)
    BoolAvg = avg(NDCGScoreBool)
    print("Avg NDCG Score for Bool:", BoolAvg, "\nAvg NDCG Score for Vector:",
          vectorAvg)
    # NOTE(review): `start` here still holds the value set inside the last
    # loop iteration, so this timing does not span the whole run — confirm
    # the intended measurement window.
    end = timer()
    if testOn:
        print("\n\nTime for running ", countDoc, " queries:", end - start)

    print('\nThe P-Value')
    p_va_ttest = stats.ttest_ind(NDCGScoreBool, NDCGScoreVector)
    p_va_wilcoxon = stats.wilcoxon(NDCGScoreBool, NDCGScoreVector)
    print("T-Test P-value: ", p_va_ttest)
    print("Wilcoxon P-value: ", p_va_wilcoxon)
    print('Done')
Example #10
0
def query():
    ''' the main query processing program, using QueryProcessor

    sys.argv: [1] index file path, [2] model selection ('0' boolean,
    '1' vector, '2' timing benchmark), [3] Cranfield query file path,
    [4] query id. When model selection is '2', argv[4] is instead used
    as the number of queries to benchmark per run.
    '''

    # ToDo: the commandline usage: "echo query_string | python query.py index_file processing_algorithm"
    # processing_algorithm: 0 for booleanQuery and 1 for vectorQuery
    # for booleanQuery, the program will print the total number of documents and the list of docuement IDs
    # for vectorQuery, the program will output the top 3 most similar documents

    #ndexFile       = "src/Data/tempFile"
    #model_selection = "0"
    #queryText       = 'src/CranfieldDataset/query.text'
    #query_id        = "226"
    docCollection = CranFile('CranfieldDataset/cran.all')
    indexFile = sys.argv[1]
    model_selection = sys.argv[2]
    queryText = sys.argv[3]
    query_id = sys.argv[4]
    query_id = str(query_id).zfill(3)  # need for number 001 or 050
    queryTest = ""
    queryFile = loadCranQry(queryText)

    #Data Need
    # Modes '0' and '1' need the text of one specific query; mode '2'
    # samples its own queries, so queryTest stays empty there.
    if not model_selection == '2':
        queryTuple = queryFile[query_id]

        if query_id == queryTuple.qid:
            queryTest = queryTuple.text

    queryProcessor = QueryProcessor(queryTest, indexFile, docCollection.docs)
    if model_selection == "0":
        docIDs = queryProcessor.booleanQuery()
        print("Boolean")
        print("Total number of documents is:",
              str(len(docIDs)) + "\nTheir DocIDs our:" + str(docIDs))

    elif model_selection == "1":
        print("Vector")
        print(queryProcessor.vectorQuery(3))

    elif model_selection == "2":
        # Benchmark mode: time both models over random query batches.
        numberOfTimeToLoop = 5
        numberOfQueries = int(query_id)
        k = 10
        bresults = []
        vresults = []
        #Data Need
        for _ in range(numberOfTimeToLoop):
            #get list of Query result from qrel.txt

            dictOfQuery = getRandomQuery(queryFile, numberOfQueries)
            queryProcessor = QueryProcessor(
                "", indexFile,
                docCollection.docs)  # This is an extremely expensive process\

            start = timer()
            for __, queryText in dictOfQuery.items():
                queryProcessor.loadQuery(queryText)
                #docIDs = queryProcessor.booleanQuery()
                queryProcessor.booleanQuery()
            end = timer()
            #           print("Run:",i+1, "\nTime for boolean model on Query (",numberOfQueries,") \nTime:", end - start, "\n")
            bresults.append(end - start)
            # NOTE(review): this loop never calls loadQuery, so vectorQuery
            # appears to run against the last query loaded in the boolean
            # loop above — confirm this is the intended benchmark.
            start = timer()
            for __, queryText in dictOfQuery.items():
                #listOfDocIDAndSimilarity = queryProcessor.vectorQuery(k)
                queryProcessor.vectorQuery(k)
            end = timer()
            #            print("Run:",i+1, "\nTime for Vector model on Query (",numberOfQueries,") \nTime:", end - start, "\n")
            vresults.append(end - start)

        # Print one timing column per run for each model.
        print("Model\t\tRun:" +
              '\t\t\tRun:'.join(map(str,
                                    range(numberOfTimeToLoop + 1)[1:])))
        print()
        print("Boolean Model: \t" + '\t'.join(map(str, bresults)))
        print()
        print("Vector Model: \t" + '\t'.join(map(str, vresults)))
        print()
Example #11
0
def eval(indexfilename, queryfilename, queryrefilename, numberofrandomqueries):
    """Compare vector and boolean retrieval with NDCG over random queries.

    Picks `numberofrandomqueries` random query ids, runs both models via
    query(), converts each result list to binary relevance labels using
    the qrels file, scores them with ndcg_score, and prints a Wilcoxon
    p-value comparing the two models.

    indexfilename: path of the saved inverted index (passed on to query()).
    queryfilename: path of the Cranfield query file (passed on to query()).
    queryrefilename: path of the qrels (relevance judgements) file.
    numberofrandomqueries: number of queries to sample (max 225).

    Raises Exception when numberofrandomqueries exceeds 225.
    """

    # ToDo
    actual = []
    #
    if numberofrandomqueries > 225:
        raise Exception('please enter query count less than or equal to 225')
    # NOTE(review): loads "query.text" from the working directory rather
    # than the queryfilename parameter — confirm this is intended.
    qrys = loadCranQry("query.text")
    validqueries = []
    querycounter = 0
    for q in qrys:
        validqueries.append(int(q))

    loadiindex = InvertedIndex()
    loadiindex = loadiindex.load("index_file.pickle")
    #    print("index loaded")
    cf = CranFile('cran.all')
    #QueryProcessor.numberofresult =10
    #qp = QueryProcessor(qrys,loadiindex,cf.docs,10)
    # Build query id -> list of relevant doc ids from the qrels file.
    queryRelevence = dict()
    for line in open(queryrefilename):

        fields = line.split(" ")
        fields[0] = '%0*d' % (3, int(fields[0]))  # zero-pad id to 3 digits
        if fields[0] in queryRelevence:
            # and let's extract the data:
            queryRelevence[fields[0]].append(fields[1])
        else:
            # create a new array in this slot
            queryRelevence[fields[0]] = [fields[1]]
    # Re-key the relevance map onto the valid query ids, in order.
    replacecounter = 0
    queryRelevenceUpdated = {}
    for k in queryRelevence:

        queryRelevenceUpdated['%0*d' % (3, int(
            validqueries[replacecounter]))] = queryRelevence.get(k)
        replacecounter = replacecounter + 1

#  relevent = list(queryRelevence.keys())
# relevent = list(map(int, relevent))
#samplespace = np.intersect1d(relevent, validqueries)
    list_of_random_items = random.sample(validqueries, numberofrandomqueries)
    tempcounter2 = 0
    booleanndcg = []
    vectorndcg = []

    # Score each sampled query with both models.
    while tempcounter2 < numberofrandomqueries:

        list_of_random_items[tempcounter2] = '%0*d' % (
            3, int(list_of_random_items[tempcounter2]))
        print('query for which ndcg is calculated ' +
              str(list_of_random_items[tempcounter2]))
        y = str(list_of_random_items[tempcounter2])
        # Vector model: top 10 results -> binary relevance labels.
        vectorresult = query(indexfilename, '1', queryfilename,
                             str(list_of_random_items[tempcounter2]), 10)
        #       vectorresult = ['573', '51', '944', '878', '12', '486', '875', '879', '746', '665']
        #       print(vectorresult)
        tempcounter = 0
        for z in vectorresult:

            if z in queryRelevenceUpdated[str(
                    list_of_random_items[tempcounter2])]:
                vectorresult[tempcounter] = 1
            else:
                vectorresult[tempcounter] = 0

            tempcounter = tempcounter + 1
        #print(vectorresult)
        # The ideal ranking places all relevant documents first.
        idealvectorresult = vectorresult.copy()
        idealvectorresult.sort(reverse=True)
        #print(idealvectorresult)
        if sum(idealvectorresult) == 0:
            ndcgscore = 0
        else:
            ndcgscore = ndcg_score(idealvectorresult, vectorresult)
    # print(ndcgscore)
        vectorndcg.append(ndcgscore)
        tempcounter3 = 0

        # Boolean model: same labelling procedure, padded to 10 entries.
        booleanqueryresult = query(indexfilename, '0', queryfilename,
                                   str(list_of_random_items[tempcounter2]), 10)
        #booleanqueryresult = ['462','462','462','462','462','462','462','462','462']
        booleanquery = booleanqueryresult.copy()
        for g in booleanquery:

            if g in queryRelevenceUpdated[str(
                    list_of_random_items[tempcounter2])]:
                booleanquery[tempcounter3] = 1
            else:
                booleanquery[tempcounter3] = 0

            tempcounter3 = tempcounter3 + 1
        #print(booleanquery)
        # Pad the label vector with zeros up to 10 slots.
        tempcounter4 = len(booleanquery)
        while tempcounter4 < 10:
            booleanquery.append(0)
            tempcounter4 = tempcounter4 + 1
        # Ideal boolean result: one 1 per known-relevant doc, capped at 10.
        idealbooleanresult = []
        for i in range(0, 10):
            if i < len(queryRelevenceUpdated[str(
                    list_of_random_items[tempcounter2])]):
                idealbooleanresult.append(1)
            else:
                idealbooleanresult.append(0)

        idealbooleanresult.sort(reverse=True)
        if sum(booleanquery) == 0:
            ndcgscoreboolean = 0
        else:
            ndcgscoreboolean = ndcg_score(booleanquery, idealbooleanresult)
        booleanndcg.append(ndcgscoreboolean)
        tempcounter2 = tempcounter2 + 1
    print('P value for all the queries processed is:')
    print(
        scipy.stats.wilcoxon(vectorndcg,
                             booleanndcg,
                             zero_method='wilcox',
                             correction=False))
    print('Done')
Example #12
0
    qp.preprocessing()
    if (processing_algorithm == 0):
        qp.booleanQuery()
    else:
        qp.vectorQuery(3)


def getDoc(qrys):
    """Return the raw text of every query in `qrys`, in iteration order."""
    return [qrys[key].text for key in qrys]


if __name__ == '__main__':
    qrys = loadCranQry('query.text')  #loadCranQry('query.text')
    # query.text is retrieved from loadCranQry
    invertedInd = InvertedIndex()
    index = invertedInd.load("index_file.pickle")  # sys.argv[1]
    # arg 1 in command line is pickle file
    # qr = QueryProcessor(query, index)
    coll = getDoc(qrys)
    #qr = QueryProcessor(qrys, index, coll)
    qr = QueryProcessor(qrys, index)
    qr.preprocessing('009')

    alg = '0'  # sys.argv[2]
    # arg 2 is 0 for bool, 1 for vector
    if (alg == '0'):
        qr.booleanQuery()
    else:
Example #13
0
def eval(index_file, query_file, qrels_File, number_of_queries):
    #read queryfile,indexfile
    # ToDo
    queries = loadCranQry(query_file)
    queries_id_list = [str(int(x)) for x in queries.keys()]
    #print(queries_id_list)
    #read querls.txt
    qrels_dict = process_querls_file(qrels_File, queries_id_list)
    inputdocument = cran.CranFile("cran.all")
    # load the index file saved at from part 1
    index = InvertedIndex().load(index_file)
    qp = QueryProcessor(queries, index, inputdocument, number_of_queries)
    queries_id_list_int = [int(x) for x in qrels_dict.keys()]
    queries_id_ls = [int(x) for x in queries.keys()]
    #IdeaVectorsforQuery_ids={}
    sumbooleanNADC = []
    sumvectorNADC = []
    with open('Evaluation_search.csv', 'w') as f:
        f.write("%s,%s,%s,%s\n" % ("Iteration", "AverageNDCG-booleanModel",
                                   "AverageNDCG-vectorModel", "P-value"))
        for i in range(0, 5):
            vectorNADC = []
            booleanNADC = []
            intersection_queries = list(
                set(queries_id_list_int) & set(queries_id_ls))
            random_query_id_list = random.sample(queries_id_list_int,
                                                 number_of_queries)
            #random_query_id_list=[153, 18]
            #print(random_query_id_list)
            for q_id in random_query_id_list:
                print("Processing for Query ID ::", q_id)
                qp.querynumber = q_id
                #boolean_res=qp.booleanQuery()
                vector_top3 = qp.vectorQuery(5)
                #vector_top3=[('12',0.34),('746',0.33),('875',0.24)]
                #print(boolean_res)
                print("Output for Vector Model Result::", vector_top3)
                if (vector_top3.__len__() < 1):
                    vectorNADC.append(0)
                else:
                    vector_label = [x[0] for x in vector_top3]
                    score = [x[1] for x in vector_top3]
                    print("DocumentIDs of Vector Model Result:: ",
                          vector_label)
                    print("Scores of Vector Model Result::", score)
                    true_label = vector_label.copy()
                    query_id = str(q_id)
                    for x in vector_label:
                        #str_x="{0:0=3d}".format(x)
                        ind = vector_label.index(x)
                        if (x in qrels_dict.get(query_id)):
                            true_label[ind] = 1
                        else:
                            true_label[ind] = 0
                    if true_label.__len__() < 5:
                        len_val = 10 - (true_label.__len__())
                        true_label.extend([0] * len_val)
                    print("Actual Vector:: ", true_label)
                    print("Predicted Vector:: ", score)
                    if sum(true_label) == 0:
                        vectorNADC.append(0)
                    else:
                        ndcg = metrics.ndcg_score(true_label, score, 5)
                        print("Calculated ndcg for Vector::", ndcg)
                        vectorNADC.append(ndcg)
                boolean_res = qp.booleanQuery()
                print("output of boolean_res:: ", boolean_res)
                if boolean_res.__len__() < 1:
                    booleanNADC.append(0)
                else:
                    score = [1] * len(boolean_res)
                    if (score.__len__() < 5):
                        leng = 5 - (score.__len__())
                        score.extend([0] * leng)
                    true_label = boolean_res.copy()
                    query_id = str(q_id)
                    for x in boolean_res:
                        ind = boolean_res.index(x)
                        if (x in qrels_dict.get(query_id)):
                            true_label[ind] = 1
                        else:
                            true_label[ind] = 0
                    if true_label.__len__() < 5:
                        len_val = 10 - (true_label.__len__())
                        true_label.extend([0] * len_val)
                    print("Actual boolean:: ", true_label)
                    print("Predicted boolean:: ", score)
                    if sum(true_label) == 0:
                        booleanNADC.append(0)
                    else:
                        ndcg = metrics.ndcg_score(true_label, score, 5)
                        print("Calculated ndcg for Boolean::", ndcg)
                        booleanNADC.append(ndcg)
            print("Calculated NADC sum for all queries", vectorNADC)
            avergae_vectorNADC = float(sum(vectorNADC) / number_of_queries)
            print("Calculated NADC sum for all queries", booleanNADC)
            avergae_booleanNADC = float(sum(booleanNADC) / number_of_queries)
            print("Avergae NADC Vector::", avergae_vectorNADC)
            print("Avergae NADC boolean::", avergae_booleanNADC)
            p_value = scipy.stats.wilcoxon(vectorNADC,
                                           booleanNADC,
                                           zero_method='wilcox',
                                           correction=False)
            print(i, str(avergae_booleanNADC), str(avergae_vectorNADC),
                  str(p_value[1]))
            p = "%.20f" % float(str(p_value[1]))
            print('P value for all the queries processed is:', p)
            f.write("%s,%s,%s,%s\n" % (i + 1, str(avergae_booleanNADC),
                                       str(avergae_vectorNADC), str(p)))
    print('Done')
def eval():
    """Compare the boolean and vector models with NDCG and print p-values.

    Algorithm:
      - Pick N random samples from query.txt
      - Get top 10 results from bool query for each rnd query
      - Get top 10 results from vector query for each rnd query
      - Compute NDCG btn bool query results and qrels.txt
      - Compute NDCG btn vector query results and qrels.txt
      - Get p-value btn bool and vector

    Relies on module-level names: query_path, index_file, qrels_path, n,
    loadCranQry, InvertedIndex, CranFile, QueryProcessor, choice,
    ndcg_score, wilcoxon, ttest_ind — TODO confirm against file header.
    """

    # Get the query collection
    qc = loadCranQry(query_path)
    poss_queries = list(qc)

    # Load up the inverted index
    ii = InvertedIndex()
    ii.load(index_file)

    # Load up the document collection
    cf = CranFile("cran.all")

    # Get ground-truth results from qrels.txt
    with open(qrels_path) as f:
        qrels = f.readlines()

    # Index qrels into a dict: query number -> list of relevant doc ids
    qrel_dict = {}
    for qrel in qrels:
        qrel_split = qrel.split()
        if int(qrel_split[0]) in qrel_dict:
            qrel_dict[int(qrel_split[0])].append(int(qrel_split[1]))
        else:
            qrel_dict[int(qrel_split[0])] = [int(qrel_split[1])]

    # Run over N random queries, collecting NDCGs
    bool_ndcgs = []
    vector_ndcgs = []
    for _ in range(n):
        # Get random query ID
        query_id = choice(poss_queries)

        # Cranfield query ids are zero-padded to three digits
        if 0 < int(query_id) < 10:
            query_id = '00' + str(int(query_id))
        elif 9 < int(query_id) < 100:
            query_id = '0' + str(int(query_id))
        try:
            query = qc[query_id].text
        except KeyError:
            print("Invalid query id", query_id)
            return

        # Initialize the query processor
        qp = QueryProcessor(query, ii, cf)

        # Run bool query (top 10 results)
        bool_result = qp.booleanQuery()[:10]

        # Run vector query (top 10 results)
        vector_result = qp.vectorQuery(10)

        # Pull top 10 ground-truth results from qrels dict
        gt_results = qrel_dict[poss_queries.index(query_id) + 1][:10]

        # Compute NDCG for bool query
        # NOTE: There is no weighting on the bool query, so give all an even 1
        truth_vector = list(map(lambda x: x in gt_results, bool_result))
        bool_ndcg = ndcg_score(truth_vector, [1] * len(truth_vector),
                               k=len(truth_vector))

        # Compute NDCG for vector query
        vector_docs = []
        vector_scores = []
        for v in vector_result:
            vector_docs.append(v[0])
            vector_scores.append(v[1])
        truth_vector = list(map(lambda x: x in gt_results, vector_docs))
        vector_ndcg = ndcg_score(truth_vector,
                                 vector_scores,
                                 k=len(truth_vector))

        # Accumulate NDCGs
        bool_ndcgs.append(bool_ndcg)
        vector_ndcgs.append(vector_ndcg)

    # Average the score lists. Use sum()/len() instead of manual loops whose
    # variables shadowed the builtins `bool` and `vector`.
    bool_avg = sum(bool_ndcgs) / len(bool_ndcgs)
    vector_avg = sum(vector_ndcgs) / len(vector_ndcgs)

    # Present averages and p-values
    print("Boolean NDCG average:", bool_avg)
    print("Vector NDCG average:", vector_avg)
    # Wilcoxon needs a reasonable sample size to be meaningful
    if n > 19:
        print("Wilcoxon p-value:", wilcoxon(bool_ndcgs, vector_ndcgs).pvalue)
    else:
        print("Wilcoxon p-value: Sample size too small to be significant")
    print("T-Test p-value:", ttest_ind(bool_ndcgs, vector_ndcgs).pvalue)
def to_ndcg(qrels, q_text, idx_file, tk=10, n=2):
    """Compare boolean vs. vector retrieval with NDCG@10 on n random queries.

    Legacy Python 2 code (print statements, dict.iteritems). The qrels,
    q_text and idx_file parameters are currently ignored: all paths are
    hard-coded for testing (see the commented-out lines below).

    Returns:
        (bool_agg_ndcg, vec_agg_ndcg) -- lists of per-query NDCG scores
        for the boolean and vector models respectively.
    """
    column_names = ['qid', 'docid', 'bool_rel', 'vec_rel'
                    ]  #for creating a dataframe for easier data manupilation
    #df_qrels = pd.read_csv('../CranfieldDataset/qrels.text', names=column_names, sep=' ')   #can test by hard-coding
    df_qrels = pd.read_csv('../CranfieldDataset/qrels.sample',
                           names=column_names,
                           sep=' ')  #can test by hard-coding
    #df_qrels = pd.read_csv(qrels, names=column_names, sep=' ')
    #print df_qrels

    # Pick n distinct query ids at random from the qrels sample.
    unique_qids = list(set(list(df_qrels.qid.values)))
    random.shuffle(unique_qids)
    random_qids = unique_qids[0:n]

    qrys = cranqry.loadCranQry('../CranfieldDataset/query.text'
                               )  #qrys is a dict---for hard-coded testing
    #qrys =  cranqry.loadCranQry(q_text)  #qrys is a dict

    qrys_ids = [key for key, val in qrys.iteritems()]

    II = index.InvertedIndex()
    index_file = II.load("index_file.json")  #for hard-coded testing
    #index_file = II.load(idx_file)

    vec_agg_ndcg, bool_agg_ndcg = list(), list(
    )  #for storing aggregate ndcg scores
    for qid in random_qids:
        print qid
        df_qid = df_qrels[
            df_qrels["qid"] ==
            qid]  #dataframe for one query id---comparison of an integer qid in a string qid

        qid_docids = list(
            df_qid['docid']
        )  #list of doc ids for a randomly chosen query id from qrels.text---to be used for ndcg_score
        print qid_docids

        st_qid = str(
            qid
        )  # zero-pad the integer qid to match the 3-character string ids used in the Cranfield query file

        if len(st_qid) == 1:  # left-pad to three digits, e.g. "7" -> "007"
            st_qid = "00" + st_qid
        elif len(st_qid) == 2:
            st_qid = "0" + st_qid
        else:
            st_qid = st_qid

        if st_qid in qrys_ids:
            qp = QueryProcessor(qrys[st_qid].text, index_file, 'cran.all')

            bool_array = qp.booleanQuery()
            vec_array = qp.vectorQuery(10)  #change back to 'tk'
            print bool_array
            bool_array = [int(v) for v in bool_array]
            print bool_array
            #ndcg for boolean model
            bool_list = [(0, 0)] * 10  #change back to tk

            # NOTE(review): idx only advances on relevant hits, so a miss
            # following a hit overwrites the same slot, and earlier (0, 1)
            # entries are clobbered by the next hit -- likely a bug;
            # confirm the intended (relevance, weight) layout.
            idx = 0
            for doc_id in bool_array:
                if doc_id in qid_docids:  #iteratively check if a docid returned by the vector model is present in qrels.text for the specific query(qid)
                    #y_true[idx] = 1
                    bool_list[idx] = (1, 1)
                    idx += 1
                else:
                    bool_list[idx] = (0, 1)
                if idx == 10:
                    break
            #print bool_list

            y_true = [int(bool_id[0]) for bool_id in bool_list]
            y_score = [int(bool_id[1]) for bool_id in bool_list]
            print "bool", y_true
            print "bool", y_score

            bool_agg_ndcg.append(metrics.ndcg_score(y_true, y_score, 10))

            #ndcg for vector model
            print vec_array
            y_score = [
                vec_id[1] for vec_id in vec_array
            ]  #y_score--to be passed to ndcg_score is the list of cosine similarity scores
            vec_ids = [
                int(vec_id[0]) for vec_id in vec_array
            ]  #list of docids from the list of tuples of the form (docid, similarity_score)
            #print vec_ids
            y_true = [0] * 10  ##added on 0317---change back to tk
            # NOTE(review): hits are compacted to the front of y_true
            # regardless of their rank in vec_ids, which inflates NDCG;
            # confirm whether y_true should track the result position.
            idx = 0
            for doc_id in vec_ids:
                if doc_id in qid_docids:  #iteratively check if a docid returned by the vector model is present in qrels.text for the specific query(qid)
                    y_true[idx] = 1
                    idx += 1
            print "vec", y_true
            print "vec", y_score
            vec_agg_ndcg.append(metrics.ndcg_score(y_true, y_score, 10))

            del qp  ##garbage collection

    return bool_agg_ndcg, vec_agg_ndcg
Exemple #16
0
        qp.booleanQuery()
    else:
        qp.vectorQuery(3)

if __name__ == '__main__':

    # Command-line usage is disabled for now; the hard-coded values below
    # mirror the intended argv layout.
    #index_file = str(sys.argv[1]) #index_file.pickle
    #algo = int(sys.argv[2]) # 0
    #query_text = str(sys.argv[3]) #query.text
    #queryId = str(sys.argv[4]) # '009'

    index_file = "index_file.pickle"  # pickled inverted index from part 1
    algo = 0  # 0 = boolean query, 1 = vector query (not consumed below)
    query_text = "query.text"  # Cranfield query file
    queryId = '009'  # query id to run, zero-padded to 3 characters
    qrys = loadCranQry(query_text)
    invertedInd = InvertedIndex()
    #loading the indexed doucment file

    index = invertedInd.load(index_file)

    #no need to use below one
    #coll = getDoc(qrys)

    #query(alogo, qrys, index, queryId)

    # Preprocess the chosen query; the boolean/vector query dispatch on
    # `algo` is expected to follow this block.
    qr = QueryProcessor(qrys, index)
    qr.preprocessing(queryId)
    #There are two types of queries
    # 1. is booleanQuery, 2. vectoryQuery
Exemple #17
0
def eval():
    """Smoke test: load the Cranfield query file and report completion.

    Relies on the module-level ``queryfile`` path and the ``cranqry``
    module; the loaded queries are not otherwise used.
    """
    loaded_queries = cranqry.loadCranQry(queryfile)

    print('Done')
Exemple #18
0
def test(index_loc, cran_loc, qrels_loc):
    '''Test the query module thoroughly; the testing cases live here.

    Args:
        index_loc: path to the saved inverted-index file
        cran_loc:  path to the Cranfield collection (cran.all)
        qrels_loc: path to the ground-truth relevance file (qrels)

    Prints a pass/fail line for each testing point; returns nothing.
    '''

    ##### SETUP ITEMS #####

    # Grab index file to restore II
    ii = InvertedIndex()
    ii.load(index_loc)

    # Get the document collection
    cf = CranFile(cran_loc)

    # Get ground-truth results from qrels.txt
    with open(qrels_loc) as f:
        qrels = f.readlines()

    # Index qrels into a dict: query number -> list of relevant doc ids.
    # setdefault replaces the original membership-check-then-append dance.
    qrel_dict = {}
    for qrel in qrels:
        qrel_split = qrel.split()
        qrel_dict.setdefault(int(qrel_split[0]), []).append(int(qrel_split[1]))

    ##### INITIAL TEST ITEMS #####
    print("TESTS BASED ON SUGGESTED TESTING POINTS")

    # Ensure tf is correct
    #   Find a random word and check TF value against what is manually done
    posting_list = ii.find("experiment").posting
    tf_vector = []
    for posting in posting_list:
        tf_vector.append(len(posting_list[posting].positions) \
            == posting_list[posting].term_freq())
    print("TF is computed correctly:", all(tf_vector))

    # Ensure idf is correct
    print("IDF is computed correctly:", log10(ii.nDocs / len(posting_list)) \
        == ii.idf("experiment"))

    # As both tf and idf are correct, and tf-idf is a product of the two,
    #   it is reasonable to assume tf-idf is computed correctly

    ##### BOOL QUERY TESTS #####

    # Here, I use very specific boolean queries to ensure that a
    #   limited number of documents are returned
    print("\nBOOL QUERY TESTS")

    # Ensure that the exact title of doc 8 matches for doc 8
    doc8 = "measurements of the effect of two-dimensional and three-dimensional roughness elements on boundary layer transition"
    qp1 = QueryProcessor(doc8, ii, cf)
    print("Bool query matches on exact title:", qp1.booleanQuery() == [8])

    # Ensure that bool query matches very specific AND query
    qp2 = QueryProcessor("hugoniot and infinitesimally", ii, cf)
    print(
        "Bool query matches on specific AND query ('hugoniot and infinitesimally'):",
        qp2.booleanQuery() == [329])

    # Test that an OR query is handled properly
    #   Both gravel and stagnation have completely distinct postings lists.
    #   OR should merge them.
    gravel_postings = ii.find("gravel").sorted_postings[:]
    stag_postings = ii.find("stagnat").sorted_postings[:]
    gravel_postings.extend(stag_postings)
    qp3 = QueryProcessor("gravel or stagnation", ii, cf)
    print("Bool query successfully handles OR ('gravel or stagnation'):",
          qp3.booleanQuery() == sorted(gravel_postings))

    # Test that NOT is handled properly
    #   The posting list for "diameter" is a subset of "slipstream" postings
    #   (oddly enough). To test this works, do "slipstream and not diameter"
    #   and we should get slipstream's postings minus those of diameter.
    slip_postings = ii.find("slipstream").sorted_postings[:]
    diam_postings = ii.find("diamet").sorted_postings[:]
    slip_not_diam = [t for t in slip_postings if t not in diam_postings]
    print("Bool query successfully handles NOT ('slipstream and not diameter'):",
        QueryProcessor("slipstream and not diameter", ii, cf).booleanQuery() \
          == slip_not_diam)

    # Ensure AND/OR order doesn't matter
    print("Bool query can handle query regardless of AND order ('a and b' = 'b and a'):",
        QueryProcessor("slipstream and diameter", ii, cf).booleanQuery() \
          == QueryProcessor("diameter and slipstream", ii, cf).booleanQuery())
    print("Bool query can handle query regardless of OR order ('a or b' = 'b or a'):",
        QueryProcessor("slipstream or diameter", ii, cf).booleanQuery() \
          == QueryProcessor("diameter or slipstream", ii, cf).booleanQuery())

    # Ensure that the presence of parens does not change query results
    print("Bool query can handle query regardless of parens ('slipstream and diameter'):",
        QueryProcessor("slipstream and diameter", ii, cf).booleanQuery() \
          == QueryProcessor("(slipstream and diameter)", ii, cf).booleanQuery())

    # Ensure parentheses do not change order of processing for AND-AND and OR-OR queries
    print("Bool query AND is accociative ('(a and b) and c' = 'a and (b and c)'):",
        QueryProcessor("(slipstream and diameter) and thrust", ii, cf).booleanQuery() \
          == QueryProcessor("slipstream and (diameter and thrust)", ii, cf).booleanQuery())
    print("Bool query OR is accociative ('(a or b) or c' = 'a or (b or c)'):",
        QueryProcessor("(slipstream or diameter) or thrust", ii, cf).booleanQuery() \
          == QueryProcessor("slipstream or (diameter or thrust)", ii, cf).booleanQuery())

    # Ensure parentheses properly group items
    #   Tested by doing the query "manually" by adding/orring the correct terms
    part_one = QueryProcessor("conduction and cylinder and gas", ii,
                              cf).booleanQuery()
    part_two = QueryProcessor("radiation and gas", ii, cf).booleanQuery()
    part_one.extend(part_two)
    expected_result = QueryProcessor("hugoniot", ii, cf).booleanQuery()
    expected_result.extend(part_one)
    print("Bool query parens successfully group conflicting operators:",
        QueryProcessor("(conduction and cylinder and gas) or (radiation and gas) or hugoniot", ii, cf).booleanQuery() \
          == sorted(list(set(expected_result))))

    ##### VECTOR QUERY TESTS #####

    # For this, just ensure that most of the results are in the expected list
    print("\nVECTOR QUERY TESTS")

    # Ensure vector query can match on exact title
    print("Vector query matches on exact title:",
          qp1.vectorQuery(1)[0][0] == 8)

    # Try a few example queries from query.text
    #   As long as one-fifth of top-10 are in gt_result, call it a pass
    # Note that queries with larger answer sets were chosen to
    #   ensure there were enough to get to one-fifth of ten
    qc = loadCranQry("query.text")
    poss_queries = list(qc)

    def _check_one_fifth(qid):
        # Run a top-10 vector query for qid and report whether more than
        # one-fifth of the results appear in the ground truth.
        result = QueryProcessor(qc[qid].text, ii, cf).vectorQuery(10)
        gt_result = qrel_dict[poss_queries.index(qid) + 1]
        hits = sum(1 for x in result if x[0] in gt_result)
        print("Vector query is at least one-fifth correct for query {}:".format(qid),
              hits > 2)

    # De-duplicated from five copy-pasted per-query blocks (001..291).
    for qid in ("001", "128", "226", "196", "291"):
        _check_one_fifth(qid)
Exemple #19
0
def VectorCompare():
    """Compare two vector-model variants with NDCG@5 over five fixed queries.

    Runs qp.vectorQuery(5) and qp.vectorQuery(5, True) for query ids
    4, 29, 53, 58 and 100, scores each ranked list against qrels.text with
    NDCG@5, prints per-query diagnostics, and reports a Wilcoxon
    signed-rank p-value between the two per-query NDCG score lists.
    """
    queries = loadCranQry("query.text")
    queries_id_list = [str(int(x)) for x in queries.keys()]
    inputdocument = cran.CranFile("cran.all")
    # load the index file saved at from part 1
    index = InvertedIndex().load("index_file")
    qp = QueryProcessor(queries, index, inputdocument, 10)
    # read qrels.text: query id (string) -> relevant doc ids
    qrels_dict = process_querls_file("qrels.text", queries_id_list)

    def _ndcg_at_5(ranked, qid_str):
        # Score one ranked result list [(doc_id, score), ...] against the
        # relevant docs for qid_str. Returns 0 for an empty result list or
        # when no relevant document was retrieved.
        # De-duplicated: this code was previously copy-pasted twice, once
        # per vectorQuery variant.
        if len(ranked) < 1:
            return 0
        doc_ids = [pair[0] for pair in ranked]
        score = [pair[1] for pair in ranked]
        print("DocumentIDs of Vector Model Result:: ", doc_ids)
        print("Scores of Vector Model Result::", score)
        # Binary relevance per returned doc (fixes the original
        # list.index() lookup, which broke on duplicate doc ids).
        true_label = [1 if d in qrels_dict.get(qid_str) else 0 for d in doc_ids]
        if len(true_label) < 5:
            # NOTE(review): pads toward length 10 even though k=5 --
            # preserved from the original; confirm intent.
            true_label.extend([0] * (10 - len(true_label)))
        print("Actual Vector:: ", true_label)
        print("Predicted Vector:: ", score)
        if sum(true_label) == 0:
            return 0
        ndcg = metrics.ndcg_score(true_label, score, 5)
        print("Calculated ndcg for Vector::", ndcg)
        return ndcg

    query_id = [4, 29, 53, 58, 100]
    vectorNADC1 = []
    vectorNADC2 = []
    for q_id in query_id:
        qp.querynumber = q_id
        vector_top3 = qp.vectorQuery(5)
        vector2_top3 = qp.vectorQuery(5, True)
        print("Output for Vector Model Result::", vector_top3)
        # (The original also rebound the loop list name `query_id` here;
        # the helper takes the string id directly instead.)
        vectorNADC1.append(_ndcg_at_5(vector_top3, str(q_id)))
        vectorNADC2.append(_ndcg_at_5(vector2_top3, str(q_id)))

    print("Calculated NADC sum for all queries", vectorNADC1)
    avergae_vectorNADC = float(sum(vectorNADC1) / 5)
    print("Calculated NADC sum for all queries", vectorNADC2)
    avergae_vectorNADC2 = float(sum(vectorNADC2) / 5)
    print("Avergae NADC Vector::", avergae_vectorNADC)
    print("Avergae NADC boolean::", avergae_vectorNADC2)
    print(vectorNADC1)
    print(vectorNADC2)
    # Paired Wilcoxon signed-rank test between the two NDCG score lists.
    p_value = scipy.stats.wilcoxon(vectorNADC1, vectorNADC2,
                                   zero_method='wilcox', correction=False)
    p = "%.20f" % float(str(p_value[1]))
    print('P value for all the queries processed is:', p)
Exemple #20
0
import cran
import query
from cranqry import loadCranQry
from index import InvertedIndex, test
from query import QueryProcessor

# Driver script: exercise the index tests, then the query tests, using the
# prebuilt index ("index_file"), the Cranfield collection ("cran.all") and
# the Cranfield query file ("query.text").
print("***************Test Cases Running for Index File****************")
invertedobj = InvertedIndex()
test(invertedobj)

print("***************Test Cases Running for Query File****************")
# load documents
inputdocument = cran.CranFile("cran.all")
# load the index file saved at from part 1
index = InvertedIndex().load("index_file")
# load query processed files
queries = loadCranQry("query.text")

# 29 is presumably the query number to process -- confirm against
# QueryProcessor's constructor.
qp = QueryProcessor(queries, index, inputdocument, 29)
query.test(qp)

# Fresh processor for the vector-model run (top 3 results).
qp = QueryProcessor(queries, index, inputdocument, 29)
qp.vectorQuery(3)
def eval(index_file, query_text, qrels, n):
    """Evaluate boolean vs. vector retrieval with NDCG on n random queries.

    NOTE: this function shadows the builtin ``eval`` (name kept for
    backward compatibility), and the ``qrels`` argument is immediately
    shadowed below -- ground truth is always read from the hard-coded
    "qrels.text".

    Args:
        index_file: path of the saved inverted index to load
        query_text: path of the Cranfield query file
        qrels:      unused (see note above)
        n:          number of distinct queries to sample
    """
    qrys = cranqry.loadCranQry(query_text)
    queries = {}
    for q in qrys:
        queries[q] = qrys[q].text
    query_ids = list(queries.keys())
    query_ids.sort()
    query_ids_ints = []
    for k in range(0, len(query_ids)):  # generating n random queries
        query_ids_ints.append(int(query_ids[k]))
    set1 = set()
    while len(set1) != n:
        set1.add(random.choice(query_ids_ints))
    selected_queries = list(set1)
    qrels = {}

    # Parse relevance judgements (qrels.text): sequential query number ->
    # list of relevant doc ids, remapped through query_ids_ints.
    # `with` replaces the original manual open/readline loop and
    # guarantees the file handle is closed.
    with open("qrels.text", "r") as f:
        for l in f:
            j = l.split(" ")
            qrels.setdefault(query_ids_ints[int(j[0]) - 1],
                             []).append(int(j[1]))
    cranqryobj = cranqry.loadCranQry(query_text)
    dict_query = {}
    for q in cranqryobj:
        dict_query[int(q)] = cranqryobj[
            q].text  # matching queries in query.text and qrels.text
    indexObject = index.InvertedIndex()
    items = indexObject.load(index_file)
    vector_ndcg_score = {}
    vector_score_dict = {}
    for q in selected_queries:
        print(q)
        query_raw = dict_query[q]
        QPobj = QueryProcessor(query_raw, items, index_file)
        QPobj.preprocessing()
        # fetching first 10 documents for a query using vector model
        result_list = QPobj.vectorQuery(10)
        boolean_result_list = QPobj.booleanQuery()
        print("Boolean query result : ", boolean_result_list
              )  # fetching documents for a query using booleanQuery
        truth_list = qrels[q]
        rank_doc_list = list(map(lambda x: int(x[0]), result_list))
        print("Relavant documents for this query : ",
              truth_list)  # relavant documents for the query
        print("Vector model result : ",
              rank_doc_list)  # documents result list for vector model
        # Predicted relevance (1/0) for each boolean result
        boolean_output_list = [1 if int(doc_id) in truth_list else 0
                               for doc_id in boolean_result_list]
        # Fix: pad/truncate to exactly 10 entries. The original
        # `while len(...) != 10: append` loop never terminated when the
        # boolean model returned more than 10 results.
        boolean_score_list = boolean_output_list[:10]
        boolean_score_list.extend([0] * (10 - len(boolean_score_list)))
        # Predicted relevance (1/0) for each vector result
        vector_score_list = [1 if doc_id in truth_list else 0
                             for doc_id in rank_doc_list]
        vector_score_dict[q] = vector_score_list
        # Ground truth = predicted scores sorted descending (ideal ranking)
        truth_score_list = sorted(vector_score_list, reverse=True)
        boolean_truth_score_list = sorted(boolean_score_list, reverse=True)
        print("Vector model ground_truth list is:\n", truth_score_list)
        print("Vector ranking score list is:\n", vector_score_list)
        print("Boolean model ground_truth list is:\n",
              boolean_truth_score_list)
        print("Boolean model score list is:\n", boolean_score_list)
        vector_ndcg_score[q] = [
            ndcg_score(np.array(boolean_truth_score_list),
                       np.array(boolean_score_list)),
            ndcg_score(np.array(truth_score_list), np.array(vector_score_list))
        ]
    # compute ndcg score for boolean and vector models for all the
    # randomly generated queries
    vector_list = []
    boolean_list = []
    for qu in vector_ndcg_score:
        vector_list.append(vector_ndcg_score[qu][1])
        boolean_list.append(vector_ndcg_score[qu][0])

    print("ndcg score of boolean and vector models for all the queries:\n",
          vector_ndcg_score)
    print("ndcg scores list for boolean model for all the queries:\n",
          boolean_list)
    print("ndcg scores list for vector model for all the queries:\n",
          vector_list)
    # calculating p values using the wilcoxon test and ttest for the
    # boolean and vector models
    p_value_wilcoxon = stats.wilcoxon(
        np.array(boolean_list), np.array(vector_list))
    # Fix: this statement had been swallowed into the trailing comment
    # above, leaving p_value_ttest undefined (NameError on the final print).
    p_value_ttest = stats.ttest_ind(
        np.array(boolean_list), np.array(vector_list), equal_var=False)
    print("wilcoxon test p value is:", p_value_wilcoxon[1])
    print("ttest p value is :", p_value_ttest[1])