# Imports assumed from the surrounding project; the module layout below is
# inferred from how the names are used in the tests and may differ in the repo.
from time import time

import numpy as np
from tabulate import tabulate

import EvalMesure
import evalIRmodel
import IRModel
import PageRank
import QueryParser
import Weighter
from SimpleIndex import SimpleIndex


def Test_avgP():
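    """Compute the average precision (AvgP) of one ranking model on a single query."""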
    start = time()

    collection = "cisi"
    q = "1"
    model = "Vectorial"

    index1 = SimpleIndex("data/" + collection + "/" + collection + ".txt")
    index1.indexing()
    queries = QueryParser.QueryParser.Parse(
        "data/" + collection + "/" + collection + ".qry",
        "data/" + collection + "/" + collection + ".rel")
    query = queries[q]
    # debug: print the query's term weights
    # print(Weighter.Ponder2(index1).getWeightsForQuery(query.getText()))

    print("testing ", model, " Model !")
    print("collection : ", collection)
    m = getattr(IRModel, model)(Weighter.Ponder2(index1))
    retrieved_docs = m.getRanking(query.getText(), scoring=False)
    avgP = EvalMesure.avgP().evalQuery(retrieved_docs, query)
    print(" AvgP : ", avgP)

    end = time()
    print("executing time : ", end - start)
def testWeighter():
    """Print document, query, and stem weights for each of the five weighting schemes."""
    t = time()
    index1 = SimpleIndex("data/cacm/ex1.txt")
    index1.indexing()

    doc = "B"  # alternative: "1"
    stem = "discover"  # alternative: "class"
    query = "hello world discover"  # alternative: "program algebra "

    # The five Ponder classes share the same interface, so run the same
    # checks against each of them in turn.
    ponders = (Weighter.Ponder1, Weighter.Ponder2, Weighter.Ponder3,
               Weighter.Ponder4, Weighter.Ponder5)
    for i, ponder in enumerate(ponders, start=1):
        p = ponder(index1)
        print("Ponder", i, ":")
        print(p.getWeightsForDoc(doc))
        print("--------------------------")
        print(p.getWeightsForQuery(query))
        print("--------------------------")
        print(p.getWeightsForStem(stem))
        print("--------------------------")
        print("####################################")

    print("\nexecution time:", round(time() - t, 2), "seconds.")
def testIndex():
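    """Exercise the SimpleIndex API: document text plus TF, IDF, and TF-IDF lookups."""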
    t = time()
    # Default parameters of the SimpleIndex constructor:
    #     lower=True, stemm=True, stopwords=True
    # You can change them to test other settings.
    index1 = SimpleIndex("data/cacm/ex1.txt",
                         stopwords=False,
                         stemm=True,
                         lower=True)
    index1.indexing()

    # test parameters
    doc_test = "B"
    stem_test = "discover"

    print("\nstring of the doc '", doc_test, "' is : ",
          index1.getStrDoc(doc_test))
    print("\nTFs for document '", doc_test, "' ",
          index1.getTfsForDoc(doc_test))
    print("\nTFIDFs for document '", doc_test, "' ",
          index1.getTfIDFsForDoc(doc_test))
    print("#########################################################")

    #TERM
    print("\nTFs for stem '", stem_test, "' ", index1.getTfsForStem(stem_test))
    print("\nIDF for stem '", stem_test, "' ", index1.getIDFforStem(stem_test))
    print("\nTFIDFs for stem '", stem_test, "' ",
          index1.getTfIDFsForStem(stem_test))

    print("\nexecuting time :", round(time() - t, 2), " seconds .")
    print(len(index1.index_occ_normalized))
def Test_Recall_Precision_Fmesure():
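    """Compute recall, precision, and F-measure for one model on a single query."""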
    start = time()

    q = "15"
    model = "ModeleLangue"

    collection = "cisi"
    index1 = SimpleIndex("data/" + collection + "/" + collection + ".txt")
    index1.indexing()
    queries = QueryParser.QueryParser.Parse(
        "data/" + collection + "/" + collection + ".qry",
        "data/" + collection + "/" + collection + ".rel")
    query = queries[q]

    # debug: print the query's term weights
    # print(Weighter.Ponder2(index1).getWeightsForQuery(query.getText()))

    print("testing ", model, " Model !")
    print("collection : ", collection)
    m = getattr(IRModel, model)(Weighter.Ponder2(index1))
    retrieved_docs = m.getRanking(query.getText(), scoring=False)
    print(retrieved_docs[:10])  # top 10 retrieved documents
    R = EvalMesure.Recall().evalQuery(retrieved_docs, query)
    P = EvalMesure.Precision().evalQuery(retrieved_docs, query)
    F = EvalMesure.F_mesure().evalQuery(retrieved_docs, query)
    print(" Rappel : ", R)
    print(" Precision : ", P)
    print(" F_mesure : ", F)

    end = time()
    print("executing time : ", end - start)
def testOkapiBM25():
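    """Smoke-test the Okapi BM25 model: print its scores and ranking for a short query."""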
    index1 = SimpleIndex("data/cisi/cisi.txt")
    index1.indexing()
    m = IRModel.Okapi(Weighter.Ponder2(index1))
    query = "computer algebra "
    print("Scores : ", m.getScores(query))
    print("Ranking : ", m.getRanking(query))
def testLangModel():
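    """Smoke-test the language model (ModeleLangue): print its scores and ranking."""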
    t = time()
    index1 = SimpleIndex("data/cacm/cacmShort-good.txt")
    index1.indexing()
    query = "hello world world world  line line algebra "
    m = IRModel.ModeleLangue(Weighter.Ponder2(index1))
    print("Scores : ", m.getScores(query))
    print("Ranking : ", m.getRanking(query))
    print("\nexecuting time :", round(time() - t, 2), " seconds .")
def testVectorialModel():
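    """Smoke-test the vector-space model with score normalization enabled."""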
    t = time()
    index1 = SimpleIndex("data/cacm/ex1.txt")
    index1.indexing()
    m = IRModel.Vectorial(Weighter.Ponder2(index1), normalize=True)
    query = "hello world world world  line line algebra "
    print("Scores : ", m.getScores(query))
    print("Ranking : ", m.getRanking(query))
    print("\nexecuting time :", round(time() - t, 2), " seconds .")
def FullPageRank():
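    """Run the PageRank pipeline on the full CACM collection for a sample query."""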

    start = time()
    index1 = SimpleIndex("data/cacm/cacm.txt")
    index1.indexing()
    m = IRModel.Vectorial(Weighter.Ponder2(index1))
    q = "computer coding Algebraic"
    # default parameters of page_rank:
    #     d=0.85, aj=1, Eps=0.001, max_iter=20, n=10, k=5
    res = PageRank.page_rank(q, m, index1.getCollection(), Eps=1e-7)
    print(res)

    end = time()
    print("executing time : ", end - start)
def evalModelsTest():
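    """Compare the vectorial, language-model, and BM25 rankers on a batch of
    queries and report precision/recall/F1 statistics."""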
    start = time()

    collection = "cisi"

    index1 = SimpleIndex("data/" + collection + "/" + collection + ".txt")
    index1.indexing()
    queries = QueryParser.QueryParser.Parse(
        "data/" + collection + "/" + collection + ".qry",
        "data/" + collection + "/" + collection + ".rel")
    m1 = IRModel.Vectorial(Weighter.Ponder2(index1))
    m2 = IRModel.ModeleLangue(Weighter.Ponder2(index1))
    m3 = IRModel.Okapi(Weighter.Ponder2(index1))
    print("collection : ", collection)
    # first 15 queries that have at least one relevant document
    ls = [
        queries[k] for k in queries if len(queries[k].get_relevant_doc()) != 0
    ][:15]
    res = evalIRmodel.EvalIRModel.evalModels(models=[m1, m2, m3], queries=ls)

    print("\nReporting ")
    print("--------------")
    print("shape confirmation ", res.shape)
    print("\nVectorial :")
    print(tabulate(res[0], headers=['Precision', 'Recall', 'F1-score']))
    print("Mean :", np.mean(res[0], axis=0), " Standard Deviation : ",
          np.std(res[0], axis=0))

    print("\nModeleLangue :")
    print(tabulate(res[1], headers=['Precision', 'Recall', 'F1-score']))
    print("Mean :", np.mean(res[1], axis=0), " Standard Deviation : ",
          np.std(res[1], axis=0))

    print("\nOkapi BM-25:")
    print(tabulate(res[2], headers=['Precision', 'Recall', 'F1-score']))
    print("Mean :", np.mean(res[2], axis=0), " Standard Deviation : ",
          np.std(res[2], axis=0))

    end = time()
    print("executing time : ", end - start)
def TuningParams_CrossValidationTest():
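    """Grid-search the Okapi BM25 parameters k1 and b with cross-validation,
    optimizing MAP."""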
    start = time()

    collection = "cisi"
    model = "Okapi"

    index1 = SimpleIndex("data/" + collection + "/" + collection + ".txt")
    index1.indexing()
    queries = QueryParser.QueryParser.Parse(
        "data/" + collection + "/" + collection + ".qry",
        "data/" + collection + "/" + collection + ".rel")
    m = getattr(IRModel, model)(Weighter.Ponder2(index1))
    print("testing ", model, " Model !")
    print("collection : ", collection)
    ls = [
        queries[k] for k in queries if len(queries[k].get_relevant_doc()) != 0
    ]
    evalIRmodel.EvalIRModel.tuning_params(
        model=m,
        metric="MAP",
        method="cross_validation",
        queries=ls,
        grid_param={"k1": np.arange(1.2, 1.4, 0.1),
                    "b": np.arange(0.8, 1.1, 0.1)},
        param_method=5)
    end = time()
    print("executing time : ", end - start)
def Test_NDCG_mean():
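    """Compute the mean NDCG of one model over all queries that have
    relevance judgments."""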
    start = time()

    collection = "cisi"
    model = "Vectorial"

    index1 = SimpleIndex("data/" + collection + "/" + collection + ".txt")
    index1.indexing()
    queries = QueryParser.QueryParser.Parse(
        "data/" + collection + "/" + collection + ".qry",
        "data/" + collection + "/" + collection + ".rel")
    m = getattr(IRModel, model)(Weighter.Ponder2(index1))
    print("testing ", model, " Model !")
    print("collection : ", collection)
    ls = [
        queries[k] for k in queries if len(queries[k].get_relevant_doc()) != 0
    ]
    ndcg_mean = EvalMesure.NDCG().evaluate(m, ls)
    print(" NDCG : ", ndcg_mean)

    end = time()
    print("executing time : ", end - start)
def testMAP():
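    """Compute the mean average precision (MAP) of one model over all judged queries."""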
    start = time()

    collection = "cisi"
    model = "Okapi"

    index1 = SimpleIndex("data/" + collection + "/" + collection + ".txt")
    index1.indexing()
    queries = QueryParser.QueryParser.Parse(
        "data/" + collection + "/" + collection + ".qry",
        "data/" + collection + "/" + collection + ".rel")
    m = getattr(IRModel, model)(Weighter.Ponder2(index1))
    print("testing ", model, " Model !")
    print("collection : ", collection)
    #get queries that has at least one relevant doc  to evaluate it !
    ls = [
        queries[k] for k in queries if len(queries[k].get_relevant_doc()) != 0
    ]

    MAP = EvalMesure.MAP().evaluate(m, ls)
    print(" MAP : ", MAP)

    end = time()
    print("executing time : ", end - start)
def Exo1PagerankTest():
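    """Run PageRank on the small ex1 example collection as a quick sanity check."""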
    index1 = SimpleIndex("data/cacm/ex1.txt")
    index1.indexing()
    m = IRModel.Vectorial(Weighter.Ponder2(index1))
    q = "hello hello worlds"
    PageRank.page_rank(q, m, index1.getCollection())
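

# Minimal entry point (a sketch not present in the original): it assumes the
# data/ files referenced above exist. The tests are independent of one
# another, so call whichever ones match your setup.
if __name__ == "__main__":
    testIndex()
    testWeighter()
    testVectorialModel()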