Ejemplo n.º 1
0
 def getScores(self, query):
     """Rank pages for *query* by running HITS on the query-relevant subgraph."""
     weighter = TF_IDF(self.Index)
     vect_model = Vectoriel(self.Index, True, weighter)
     # Restrict the web graph to the top pages for this query.
     P, Succ, Index_P, Counter_Index_P, N_pgs = select_G_q(
         self.n, self.K, query, vect_model, self.Index)
     hits = Hits(N_pgs)
     hits.randomWalk(P, Succ, Index_P)
     return hits.get_result(Counter_Index_P)
Ejemplo n.º 2
0
 def getScores(self, query):
     """Rank pages for *query* by running PageRank on the query-relevant subgraph."""
     weighter = TF_IDF(self.Index)
     vect_model = Vectoriel(self.Index, True, weighter)
     # Restrict the web graph to the top pages for this query.
     P, Succ, Index_P, Counter_Index_P, N_pgs = select_G_q(
         self.n, self.K, query, vect_model, self.Index)
     ranker = PageRank(N_pgs, self.d)
     transition = get_A(P, Succ, N_pgs)
     ranker.randomWalk(transition)
     return ranker.get_result(Counter_Index_P)
Ejemplo n.º 3
0
def test_weighter():
    parser = ParserCACM()
    textRepresenter = PorterStemmer()
    fname = "data/cacm/cacm.txt"
    I = Index(parser, textRepresenter)
    I.indexation(fname)
    weighters = [Binary(I), TF(I), TF_IDF(I), Log(I), Log_plus(I)]
    for i, w in enumerate(weighters):
        print "Test of weighter" + str(i)
        print "getDocWeightsForDoc"
        print w.getDocWeightsForDoc("20")
        print "getDocWeightsForStem"
        print w.getDocWeightsForStem("accelerat")
        print "getDocWeightsForQuery"
        print w.getWeightsForQuery(I.getTfsForDoc("20"))
Ejemplo n.º 4
0
def initModels(I, modelType):
    """Init models of type ``modelType`` for index ``I``, or load a cached copy.

    Models are pickled to ``<modelType>.p`` on first build and reloaded
    from that file on subsequent calls.

    Raises:
        ValueError: if ``modelType`` is not a known model family.
    """
    model_file_name = modelType + '.p'

    sys.stdout.write("Creating models...")
    sys.stdout.flush()

    if os.path.isfile(model_file_name):
        models = pickle.load(open(model_file_name, "rb"))

    elif modelType == "Vectoriel":
        weighters = [Binary(I), TF(I), TF_IDF(I), Log(I), Log_plus(I)]
        # Bug fix: the original passed the Index *class* instead of the
        # index instance ``I`` (which was otherwise unused in this branch).
        models = [Vectoriel(I, True, w) for w in weighters]
        pickle.dump(models, open(model_file_name, "wb"))

    else:
        # Bug fix: the original printed a warning and fell through to
        # ``return models`` with ``models`` unbound (UnboundLocalError).
        raise ValueError("Unknown model type: " + modelType)

    sys.stdout.write("Done!\n")

    return models
Ejemplo n.º 5
0
    def __init__(self,
                 N,
                 index_file,
                 query_file,
                 relevance_file,
                 model_type="Vectoriel",
                 div_K=None,
                 div_N=None,
                 eval_N=20):
        """ model_type = Vectoriel | Okapi | Language | PageRank | MetaModel """
        self.N = eval_N
        self.Index = initIndex(index_file)

        if model_type == "Vectoriel":
            self.models = [Vectoriel(Index, True, Log_plus(self.Index))
                           ]  #initModels(self.Index,model_type)
        elif model_type == "Euclidian_model":
            self.models = [Euclidian_model(self.Index, Log_plus(self.Index))]

        elif model_type == "Language":
            print "Init of Language model"
            self.models = [LanguageModel(self.Index, 0.2)]

        elif model_type == "Okapi":
            self.models = [Okapi(self.Index)]

        elif model_type == "PageRank":
            self.models = [RankModel(self.Index)]

        elif model_type == "Hits":
            self.models = [HitsModel(self.Index)]

        elif model_type == "KMeans_diversity":
            self.models = [KMeans_diversity(self.Index, div_K, div_N)]

        elif model_type == "Greedy_diversity":
            self.models = [Greedy_diversity(self.Index, div_K, div_N)]

        elif model_type == "Greedy_diversity_euclidian":
            print "alpha, N:", div_K, div_N
            self.models = [
                Greedy_diversity_euclidian(self.Index, alpha=div_K, N=div_N)
            ]

        elif model_type == "MetaModel":
            """Learning a linear combination of 4 models"""
            I = self.Index
            w1 = TF_IDF(I)
            model1 = Vectoriel(I, True, w1)
            w2 = Log_plus(I)
            model2 = Vectoriel(I, True, w2)
            #w3 = Log(I)
            #model3 = Vectoriel(I,True, w3)

            model3 = Okapi(I)

            f1 = FeaturerModel(I, model1)
            f2 = FeaturerModel(I, model2)
            f3 = FeaturerModel(I, model3)
            #f4 = FeaturerModel(I,model4)

            listFeaturers = FeaturerList([f1, f2, f3])  #,f4])
            metamodel = MetaModel(listFeaturers, I, query_file, relevance_file)
            metamodel.train()
            self.models = [metamodel]

        print type(self.models[0])
        self.query_file = query_file
        self.relevance_file = relevance_file
        self.query_parser = GroundTruthParser(self.query_file,
                                              self.relevance_file)
Ejemplo n.º 6
0
    def __init__(self, index_file, query_file, relevance_file,model_type="Vectoriel"):
        """ model_type = Vectoriel | Okapi | Language | PageRank | MetaModel """

        self.Index = initIndex(index_file)
        
        if model_type  == "Vectoriel":
            self.models = initModels(self.Index,model_type)
            
        elif model_type == "Language":
            print "Init of Language model"
            self.models = [LanguageModel(self.Index,0.2)]
            
        elif model_type == "Okapi":
            self.models = [Okapi(self.Index)]
            
        elif model_type == "PageRank":
            self.models = [RankModel(self.Index)]
            
        elif model_type == "Hits":
            self.models = [HitsModel(self.Index)]
            
        elif model_type == "MetaModel":
            """Learning a linear combination of 4 models"""
            I = self.Index
            
            w1 = TF_IDF(I)
            model1 = Vectoriel(I,True, w1)
            
            w2 = Log_plus(I)
            model2 = Vectoriel(I,True, w2)
            
            w3 = Binary(I)
            model3 = Vectoriel(I,True, w3)
            
            w4 = TF(I)
            model4 = Vectoriel(I,True, w4)
            
            model5 = Okapi(I)
            
            model6 = LanguageModel(I,0.2)
            
            model7 = RankModel(I,n=5, K=100,d=.85)
            
            f1 = FeaturerModel(I,model1)
            f2 = FeaturerModel(I,model2)
            f3 = FeaturerModel(I,model3)
            f4 = FeaturerModel(I,model4)
            f5 = FeaturerModel(I,model5)
            f6 = FeaturerModel(I,model6)
            f7 = FeaturerModel(I,model7)
            
            
            listFeaturers = FeaturerList([f1,f2,f3,f4,f5,f6,f7])
            metamodel = MetaModel(listFeaturers,I,query_file,relevance_file)
            metamodel.train()
            self.models = [metamodel]
        elif model_type == "Random":
            self.models = [RandomModel(self.Index)]
        else:
            pass
        
        print type(self.models[0])    
        self.query_file = query_file
        self.relevance_file = relevance_file
        self.query_parser = QueryParser(self.query_file, self.relevance_file)  
Ejemplo n.º 7
0
    # Raw CACM collection used to build the index.
    fname = "data/cacm/cacm.txt"

    sys.stdout.write("Indexing database...")
    sys.stdout.flush()
    # Reuse a previously pickled index when available to skip re-indexation.
    if os.path.isfile('Index.p'):
        I = pickle.load(open("Index.p", "rb"))

    else:
        parser = ParserCACM()
        textRepresenter = PorterStemmer()
        I = Index(parser, textRepresenter)
        I.indexation(fname)
        # The parser is detached before pickling -- presumably it is not
        # picklable (open file handles); TODO confirm.
        I.parser = None
        pickle.dump(I, open("Index.p", "wb"))

    # Three vectorial models, each with a different weighting scheme.
    w1 = TF_IDF(I)
    model1 = Vectoriel(I, True, w1)
    w2 = Log_plus(I)
    model2 = Vectoriel(I, True, w2)
    w3 = Log(I)
    model3 = Vectoriel(I, True, w3)

    model4 = Okapi(I)
    # Example query as a dict of stemmed term -> frequency.
    queryExample = {'techniqu': 1, 'accelerat': 1}

    # Wrap each ranking model as a feature extractor for the meta-model.
    f1 = FeaturerModel(I, model1)
    print "\ndone building f1"
    f2 = FeaturerModel(I, model2)
    print "\ndone building f2"
    f3 = FeaturerModel(I, model3)
    print "\ndone building f3"
Ejemplo n.º 8
0
#    pr.randomWalk(A)
#    mu = pr.get_mu()
#
#    hts = Hits(N_pgs,N_iters=100)
#    hts.randomWalk(A, P, Succ, Index_P, Counter_Index_P)
#
#    print "MAX mu ",max(mu)
#    print "MAX a ",max(hts.get_a())

#### Graph relevant to query
    # n: number of seed pages taken from the ranking model;
    # K: cap on the size of the query-relevant subgraph.
    n = 500
    K = 1000
    q = None

    #o = Okapi(I)
    # Rank with a TF-IDF vectorial model to select the query subgraph.
    w = TF_IDF(I)
    o = Vectoriel(I, True, w)
    # Example query as a dict of stemmed term -> frequency.
    queryExample = {'techniqu': 1, 'accelerat': 1}
    P, Succ, Index_P, Counter_Index_P, N_pgs = select_G_q(
        n, K, queryExample, o, I)
    # NOTE(review): damping factor ``d`` is not defined in this excerpt --
    # presumably set earlier in the enclosing scope; verify.
    pr = PageRank(N_pgs, d)
    A = get_A(P, Succ, N_pgs)
    print P
    pr.randomWalk(A)
    mu = pr.get_result(Counter_Index_P)
    print "MAX mu ", max(mu)
    print "Npages", N_pgs, pr.N_pages
    print "Random walk for HITS"
#    hts = Hits(N_pgs,N_iters=100)
#    hts.randomWalk(P, Succ, Index_P)
#    a = hts.get_result(Counter_Index_P)