from whoosh import scoring


def get_scoring(scoring_measure):
    # default to plain term frequency if the measure is not recognised
    foo = scoring.Frequency()

    if scoring_measure == "TF_IDF":
        foo = scoring.TF_IDF()
    elif scoring_measure == "BM_25":
        foo = scoring.BM25F()
    elif scoring_measure == "TF":
        foo = scoring.Frequency()
    return foo
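For context, a minimal usage sketch of the helper above (the index directory name "indexdir" and the field name "content" are assumptions, not part of the original example):

from whoosh import index
from whoosh.qparser import QueryParser

ix = index.open_dir("indexdir")  # assumed index location
with ix.searcher(weighting=get_scoring("TF_IDF")) as searcher:
    parsed = QueryParser("content", ix.schema).parse("search terms")
    for hit in searcher.search(parsed, limit=10):
        print(hit.rank, hit.score)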
Example #2
    def scoring_results(self, input_query, number_of_results):
        ix = index.open_dir(self.path + '/' + str(self.analyzer))
        #check the scoring parameter and set the scoring_function accordingly
        if self.scoring == 'frequency':
            scoring_function = scoring.Frequency()
        elif self.scoring == 'tf_idf':
            scoring_function = scoring.TF_IDF()
        elif self.scoring == 'bm25f_1':
            scoring_function = scoring.BM25F(B=0.35, K1=0.7)
        elif self.scoring == 'bm25f_2':
            scoring_function = scoring.BM25F(B=0.75, K1=1.2)
        elif self.scoring == 'bm25f_3':
            scoring_function = scoring.BM25F(B=0.75, K1=2.3)
        elif self.scoring == 'bm25f_4':
            scoring_function = scoring.BM25F(B=0.9, K1=1.1)
        else:
            raise ValueError('scoring method not found')

        qp = QueryParser("content", ix.schema)
        parsed_query = qp.parse(input_query)  # parse the query
        searcher = ix.searcher(weighting=scoring_function)
        # execute the search
        results = searcher.search(parsed_query, limit=number_of_results)
        rr = []
        rank = 0
        # loop over search results
        for hit in results:
            rank += 1
            rr.append([hit['indx'], rank])

        # close searcher
        searcher.close()
        # return list of [docID, rank] pairs
        return rr
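The four bm25f_* branches above differ only in the B and K1 parameters. As a rough sketch of what they control, and of the per-field overrides Whoosh's BM25F also accepts (used in some later examples), assuming an already opened index ix:

# B  - strength of document-length normalisation (0 disables it)
# K1 - how quickly repeated occurrences of a term stop adding score
# fieldname_B overrides B for a single field; "content" here is only illustrative
w = scoring.BM25F(B=0.75, K1=1.2, content_B=0.9)
searcher = ix.searcher(weighting=w)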
Example #3
def ranking(topic_id, p, index, model="TF-IDF"):
    topic_id = int(topic_id) - 101  # Correct topic identifier to start at 0
    if model == "TF-IDF":
        weighting = scoring.TF_IDF()
    elif model == "TF":
        weighting = scoring.Frequency()
    elif model == "BM25":
        weighting = scoring.BM25F()
    else:
        raise ValueError(
            "Invalid scoring model: please use 'TF', 'TF-IDF' or 'BM25'")

    topic = process_topic(topic_id, topic_directory)[1]

    if stemming:
        analyzer = StemmingAnalyzer(stoplist=set(stopwords.words("english")))
    else:
        analyzer = StandardAnalyzer(stoplist=set(stopwords.words("english")))

    tokens = [token.text for token in analyzer(topic)]
    string_query = ' '.join(tokens)
    with index.searcher(weighting=weighting) as searcher:
        q = QueryParser("content", index.schema,
                        group=OrGroup).parse(string_query)
        results = searcher.search(q, limit=p)
        return [(r["doc_id"], round(r.score, 4)) for r in results]
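ranking() reads r["doc_id"] and parses against a "content" field, so it presupposes an index built with a matching schema. A minimal sketch of such a schema (the analyzer choice is an assumption):

from whoosh.fields import Schema, ID, TEXT
from whoosh.analysis import StemmingAnalyzer

# field names taken from the function above; doc_id is stored so results can return it
schema = Schema(doc_id=ID(stored=True),
                content=TEXT(analyzer=StemmingAnalyzer()))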
Example #4
def search(ix, term, algorithm, pageRank=False):
    if algorithm == 'BM25F':
        print("BM25F Search")
        with ix.searcher() as searcher:
            query = QueryParser('content', ix.schema).parse(term)
            results = searcher.search(query, limit=1000)

            return makeResults(results, pageRank, 25)

    if algorithm == 'Frequency':
        print("Frequency Search")
        with ix.searcher(weighting=scoring.Frequency()) as searcher:
            query = QueryParser('content', ix.schema).parse(term)
            results = searcher.search(query, limit=1000)

            return makeResults(results, pageRank, 25)

    if algorithm == 'TF_IDF':
        print("TF_IDF Search")
        with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
            query = QueryParser('content', ix.schema).parse(term)
            results = searcher.search(query, limit=1000)

            return makeResults(results, pageRank, 25)

    print('Incorrect Algorithm')
Example #5
def query_evaluator(file_directory,query_tsv_name,result_limit, *notitle):
    '''
    This function is used to evaluate all queries with the 24 different Search Engines.
    :param file_directory: directory where the indexes are stored ("C:./Cranfield_DATASET/" and "C:./Time_DATASET/")
    :param query_tsv_name: name of the queries .tsv file (cran_Queries.tsv, time_Queries.tsv)
    :param result_limit: the number of top-k results retrieved
    :param notitle: if given, only the body is considered (used for the Time.csv dataset)
    :return: SEs, a dictionary whose keys are the Search Engines (e.g. SE_01, SE_02, ..., SE_24) and whose values are their query results
    '''


    SEs = defaultdict(list) # dictionary where all the SEs will be stored
    query_path = file_directory+query_tsv_name # query_path = path where there are the queries ["C:./Cranfield_DATASET/cran_Queries.tsv", "C:./Time_DATASET/time_Queries.tsv"]
    
    analyzer_names = ['StemmingAnalyzer', 'StandardAnalyzer', 'RegexAnalyzer', 'SimpleAnalyzer',
                      'FancyAnalyzer', 'NgramAnalyzer', 'KeywordAnalyzer',  'LanguageAnalyzer'] # analyzers names
        
    counter = 1 # counter used to name the SEs
    for analyzer in analyzer_names: 

        index_directory = file_directory+'inverted_index_'+analyzer #get the directory where the index is stored

        ix = index.open_dir(index_directory) # open the index inside the chosen directory
        scoring_functions = [scoring.TF_IDF(),scoring.Frequency(),scoring.BM25F(B=0.75,K1=1.2)] # list of chosen scoring functions

        # per each index three different scoring functions are used:
        for score in scoring_functions:

            scoring_function = score # select the scoring function

            if notitle: # for the Time dataset only the body is considered
                # query parser
                qp = QueryParser("body", ix.schema) # tell the search engine which field the query runs over
            else: # for the Cranfield dataset both title and body are considered
                # query parser
                qp = MultifieldParser(["title","body"], ix.schema) # with MultifieldParser the query runs over more than one field


            # Create the searcher for the index based on the predefined scoring function
            searcher = ix.searcher(weighting=scoring_function)

            with open(query_path) as tsvfile: # here the .tsv containing the query is used and one by one are parsed
                querys = csv.reader(tsvfile, delimiter='\t')
                header = next(querys) # read the header row
                if header is not None: # if there is a header, iterate over all rows in the queries .tsv file (cran_Queries.tsv, time_Queries.tsv)
                    for query in querys:
                        parsed_query = qp.parse(query[1])  # parse the query (it is still a plain Python string and must be turned into a query object)
                        results = searcher.search(parsed_query, limit=result_limit) # here the query is performed and only the top "result_limit" will be considered

                        for hit in results:
                            '''
                            here the relevant results will be selected. In particular:
                            Query number, Doc ID, Rank and Score
                            '''
                            output = [query[0],hit['id'], str(hit.rank + 1), str(hit.score)]
                            SEs['SE_'+str(counter)].append(output) # the results are added to the predefined dictionary
            print('analyzer: '+analyzer, 'scoring_function: '+str(scoring_function).split('.')[2].split(' ')[0], '('+str(counter)+')')
            counter +=1
    return SEs
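A hedged usage sketch for the evaluator above, using the Cranfield paths suggested by the docstring (the top-10 limit is just an example):

SEs = query_evaluator("C:./Cranfield_DATASET/", "cran_Queries.tsv", 10)
for se_name, rows in SEs.items():
    print(se_name, "returned", len(rows), "query/document rows")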
Example #6
def make_search_service(search_text):
  charmap = charset_table_to_dict(default_charset)
  custom_analyzers = StemmingAnalyzer()

  index_path = join(pathlib.Path(__file__).parent.parent.absolute(), 'indexdir')
  myindex = open_dir(index_path)
  qp = MultifieldParser(["title", "textdata"], schema=myindex.schema, group=AndGroup, fieldboosts={'title': 3.0, 'textdata': 0.8})
  qstring = search_text
  q = qp.parse(qstring)

  results_list = []

  myWeighting= scoring.MultiWeighting(scoring.BM25F(textdata_B=0.5), textdata=scoring.Frequency(), title=scoring.BM25F(title_B=2.0))
  with myindex.searcher(weighting=myWeighting) as s:
    results = s.search(q, limit=30, terms=True)

    # "did you mean" / "results for" suggestions
    corrected = s.correct_query(q, qstring)
    did_you_mean = ''
    result_for = ''
    if corrected.query != q:
      if len(results) < 1:
        results = s.search(qp.parse(corrected.string), limit=30, terms=True)
        result_for = corrected.string
      else:
        did_you_mean = corrected.string


    #query expansion
    keywords = [keyword for keyword, score in results.key_terms("textdata", docs=3, numterms=5)]
    if keywords:
      query_keyword = qp.parse(reduce(lambda a, b: a + ' ' + b, keywords))
      results_keyword = s.search(query_keyword, limit=30, terms=True)
      results.upgrade_and_extend(results_keyword)

    #sorting
    key_sort = lambda result: result.score
    results = sorted(results, key=key_sort, reverse=True)

    
    for ris in results:
      result = {}
      result['title'] = ris['title']
      result['url'] = ris['url']
      result['id'] = ris['ID']
      result['highlight'] = ris.highlights("textdata")
      results_list.append(result)


    # for precision and recall computation
    id_results = [ris['id'] for ris in results_list[:10]]

    return {
      'search_text': search_text,
      'results': results_list, 
      'did_you_mean': did_you_mean,
      'result_for': result_for,
      'results_ids': id_results
    }
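The MultiWeighting line above combines a default weighting with per-field overrides, so different fields are scored with different models. A minimal sketch of the same idea, reusing myindex and q from the function (the per-field choices are only illustrative):

my_weighting = scoring.MultiWeighting(
    scoring.BM25F(),                 # default weighting for all fields
    title=scoring.TF_IDF(),          # illustrative per-field override
    textdata=scoring.Frequency())
with myindex.searcher(weighting=my_weighting) as s:
    results = s.search(q, limit=30)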
Example #7
def search(ix, term):
    bm25f = {}
    print "BM25F"
    with ix.searcher() as searcher:
        query = QueryParser('content', ix.schema).parse(term)
        results = searcher.search(query, limit=25)
        index = 1

        for r in results:
            print r['title']
            bm25f[r['title']] = index
            index += 1

    freq = {}
    print "\nFrequency"
    with ix.searcher(weighting=scoring.Frequency()) as searcher:
        query = QueryParser('content', ix.schema).parse(term)
        results = searcher.search(query, limit=25)
        index = 1

        for r in results:
            print r['title']
            freq[r['title']] = index
            index += 1

    tf_idf = {}
    print "\nTF_IDF"
    with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
        query = QueryParser('content', ix.schema).parse(term)
        results = searcher.search(query, limit=25)
        index = 1

        for r in results:
            print r['title']
            tf_idf[r['title']] = index
            index += 1

    print '\nBM25F vs Frequency'
    difference = 0
    for page in bm25f:
        if page in freq:
            difference += abs(bm25f[page] - freq[page])
    print difference

    print '\nBM25F vs TF_IDF'
    difference = 0
    for page in bm25f:
        if page in tf_idf:
            difference += abs(bm25f[page] - tf_idf[page])
    print difference

    print '\nTF_IDF vs Frequency'
    difference = 0
    for page in tf_idf:
        if page in freq:
            difference += abs(tf_idf[page] - freq[page])
    print difference
Example #8
 def generate_score(self, query, measure, k=None):
     '''Generate scores for a given query according to a given measure'''
     if (measure == 'bm25'):
         score = self.rank(query, weighting=scoring.BM25F(), k=k)
     elif (measure == 'cos'):
         score = self.rank(query, weighting=scoring.TF_IDF(), k=k)
     elif (measure == 'freq'):
         score = self.rank(query, weighting=scoring.Frequency(), k=k)
     else:
         raise ValueError("measure must be 'bm25', 'cos' or 'freq'")
     return score
Example #9
def exec_comp():
    '''
    Method that calculates MRR: Mean Reciprocal Rank and saves a table with MRR evaluation for every search engine configuration 
    '''
    #text analyzers
    selected_analyzers = [
        StemmingAnalyzer(),
        SimpleAnalyzer(),
        StandardAnalyzer(),
        RegexAnalyzer(),
        FancyAnalyzer(),
        NgramAnalyzer(5),
        KeywordAnalyzer(),
        LanguageAnalyzer('en')
    ]  #text analyzers
    sel_ana = [
        'StemmingAnalyzer()', 'SimpleAnalyzer()', 'StandardAnalyzer()',
        'RegexAnalyzer()', 'FancyAnalyzer()', 'NgramAnalyzer(5)',
        'KeywordAnalyzer()', 'LanguageAnalyzer()'
    ]  #text which will be used for graph and for mrr table

    i = 0  #counter
    mrrs = []  #list where MRR values for each SE configuration will be stored

    #scoring functions
    scoring_functions = [
        scoring.TF_IDF(),
        scoring.Frequency(),
        scoring.BM25F(B=0.75, content_B=1.0, K1=1.5)
    ]
    scor_func = [' TF_IDF', ' Frequency', ' BM25F']

    #ground truth
    gt1 = pd.read_csv(os.getcwd() +
                      "/part_1/Cranfield_DATASET/cran_Ground_Truth.tsv",
                      sep='\t')

    #combinations for every chosen analyzer with every chosen scoring function
    for x in range(len(selected_analyzers)):
        for y in range(len(scoring_functions)):
            print(sel_ana[x] + scor_func[y])
            i = i + 1
            sr_1 = exec_queries(
                selected_analyzers[x], scoring_functions[y]
            )  # execute queries for the chosen configuration combination
            sr_1.to_csv(os.getcwd() + "/part_1/" + str(i) + "__.csv",
                        index=False)  #save results of the search engine
            mrrs.append((sel_ana[x] + scor_func[y], mrr(gt1,
                                                        sr_1)))  #calculate MRR
    mrrs_saving = pd.DataFrame(mrrs)
    mrrs_saving.to_csv(os.getcwd() + "/part_1/mrrs.csv",
                       index=False)  #store MRR table
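exec_comp() scores each configuration with MRR through an mrr() helper that is not shown here. As a hedged sketch of the metric itself: for every query take the reciprocal of the rank of the first relevant document, then average over all queries.

def mean_reciprocal_rank(ground_truth, ranked_results):
    """Hedged sketch, not the project's mrr() helper.
    ground_truth: {query_id: set of relevant doc ids}
    ranked_results: {query_id: list of doc ids in ranked order}"""
    total = 0.0
    for qid, relevant in ground_truth.items():
        for rank, doc_id in enumerate(ranked_results.get(qid, []), start=1):
            if doc_id in relevant:
                total += 1.0 / rank
                break
    return total / len(ground_truth) if ground_truth else 0.0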
Example #10
    def init_env(self):
        from whoosh import qparser, query, scoring
        from whoosh.qparser import QueryParser, FuzzyTermPlugin
        from whoosh.analysis import RegexTokenizer
        from whoosh.lang.morph_en import variations

        self.freq_searcher = self.idx.searcher(weighting=scoring.Frequency())
        self.tfidf_searcher = self.idx.searcher(weighting=scoring.TF_IDF())
        self.bm25_searcher = self.idx.searcher(
            weighting=scoring.BM25F(B=0.74, K1=1.52))
        self.query_parser = QueryParser('abstract', self.idx.schema)
        self.query_parser.add_plugin(FuzzyTermPlugin())
        self.title_parser = QueryParser('title', self.idx.schema)
        self.title_parser.add_plugin(FuzzyTermPlugin())
        self.tokenizer = RegexTokenizer()
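init_env() registers FuzzyTermPlugin on both parsers; with it enabled, a trailing ~ in the query text asks Whoosh for fuzzy (edit-distance) matching. A small hedged sketch using the searchers set up above:

# "informaton~2" matches terms within an edit distance of 2 (the query text is illustrative)
q = self.query_parser.parse(u"informaton~2 retrieval")
results = self.freq_searcher.search(q, limit=10)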
Example #11
def search_result(input_query, query, searchtype, directory):
    if searchtype == 'BM25':
        #logic
        return search(input_query, query,
                      os.path.join(settings.MEDIA_ROOT, directory),
                      scoring.BM25F())
    elif searchtype == 'TFIDF':
        #logic
        return search(input_query, query,
                      os.path.join(settings.MEDIA_ROOT, directory),
                      scoring.TF_IDF())
    elif searchtype == 'TF':
        #logic
        return search(input_query, query,
                      os.path.join(settings.MEDIA_ROOT, directory),
                      scoring.Frequency())
Example #12
def test_dismax():
    schema = fields.Schema(id=fields.STORED,
                           f1=fields.TEXT, f2=fields.TEXT, f3=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=1, f1=u("alfa bravo charlie delta"),
                   f2=u("alfa alfa alfa"),
                   f3=u("alfa echo foxtrot hotel india"))
    w.commit()

    with ix.searcher(weighting=scoring.Frequency()) as s:
        assert_equal(list(s.documents(f1="alfa")), [{"id": 1}])
        assert_equal(list(s.documents(f2="alfa")), [{"id": 1}])
        assert_equal(list(s.documents(f3="alfa")), [{"id": 1}])

        qs = [Term("f1", "alfa"), Term("f2", "alfa"), Term("f3", "alfa")]
        dm = DisjunctionMax(qs)
        r = s.search(dm)
        assert_equal(r.score(0), 3.0)
Example #13
    def test_dismax(self):
        schema = fields.Schema(id=fields.STORED,
                               f1=fields.TEXT,
                               f2=fields.TEXT,
                               f3=fields.TEXT)
        st = RamStorage()
        ix = st.create_index(schema)

        w = ix.writer()
        w.add_document(id=1,
                       f1=u"alfa bravo charlie delta",
                       f2=u"alfa alfa alfa",
                       f3=u"alfa echo foxtrot hotel india")
        w.commit()

        s = ix.searcher(weighting=scoring.Frequency())
        qs = [Term("f1", "alfa"), Term("f2", "alfa"), Term("f3", "alfa")]
        r = s.search(DisjunctionMax(qs))
        self.assertEqual(r.score(0), 3.0)
        r = s.search(DisjunctionMax(qs, tiebreak=0.5))
        self.assertEqual(r.score(0), 3.0 + 0.5 + 1.5 + 0.5)
Example #14
def test_all():
    domain = [u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"),
              u("foxtrot")]
    schema = fields.Schema(text=fields.TEXT)
    storage = RamStorage()
    ix = storage.create_index(schema)
    w = ix.writer()
    for _ in xrange(100):
        w.add_document(text=u(" ").join(choice(domain)
                                      for _ in xrange(randint(10, 20))))
    w.commit()

    # List ABCs that should not be tested
    abcs = ()
    # provide initializer arguments for any weighting classes that require them
    init_args = {"MultiWeighting": ([scoring.BM25F()],
                                    {"text": scoring.Frequency()}),
                 "ReverseWeighting": ([scoring.BM25F()], {})}

    for wclass in _weighting_classes(abcs):
        try:
            if wclass.__name__ in init_args:
                args, kwargs = init_args[wclass.__name__]
                weighting = wclass(*args, **kwargs)
            else:
                weighting = wclass()
        except TypeError:
            e = sys.exc_info()[1]
            raise TypeError("Error instantiating %r: %s" % (wclass, e))

        with ix.searcher(weighting=weighting) as s:
            try:
                for word in domain:
                    s.search(query.Term("text", word))
            except Exception:
                e = sys.exc_info()[1]
                e.msg = "Error searching with %r: %s" % (wclass, e)
                raise
Example #15
 def test_all(self):
     domain = [u"alfa", u"bravo", u"charlie", u"delta", u"echo", u"foxtrot"]
     schema = Schema(text=TEXT)
     storage = RamStorage()
     ix = storage.create_index(schema)
     w = ix.writer()
     for _ in xrange(100):
         w.add_document(text=u" ".join(choice(domain) for i in xrange(randint(10, 20))))
     w.commit()
     
     # provide initializer arguments for any weighting classes that require them
     init_args = {"MultiWeighting": ([scoring.BM25F()], {"text": scoring.Frequency()})}
     
     reader = ix.reader()
     for wclass in self._weighting_classes():
         if wclass.__name__ in init_args:
             args, kwargs = init_args[wclass.__name__]
             weighting = wclass(*args, **kwargs)
         else:
             weighting = wclass()
         searcher = Searcher(reader, weighting)
         
         for word in domain:
             r = searcher.search(query.Term("text", word))
Example #16
from suggestCorrections import suggestCorrections
from printResult import printResult

if __name__ == "__main__":
    #ix = open_dir('Indexes')
    query = raw_input("Please Enter the query to search for:")
    input_query = query_AND(query, './Indexes/stopWordsWithoutStemming')
    suggestCorrections(input_query, query,
                       './Indexes/stopWordsWithoutStemming')

    print "BM25 Results"
    result = search(input_query, query, './Indexes/stopWordsWithoutStemming',
                    scoring.BM25F())
    printResult(result, query)

    print "Phrasal Query Results"
    result = search(query_phrasal(query,
                                  './Indexes/stopWordsWithoutStemming'), query,
                    './Indexes/stopWordsWithoutStemming', scoring.BM25F())
    printResult(result, query)

    print "TF_IDF Results"
    result = search(input_query, query, './Indexes/stopWordsWithoutStemming',
                    scoring.TF_IDF())
    printResult(result, query)

    print "TF Results"
    result = search(input_query, query, './Indexes/stopWordsWithoutStemming',
                    scoring.Frequency())
    printResult(result, query)
Example #17
def main():
    args = parse_args()
    query = args.query
    number = args.number
    rank_func = args.rank_func
    index_loc = args.index_loc
    B = args.B
    weight_B = args.weight_B
    K1 = args.K1

    if query is None:
        query_list = read_query()
    else:
        temp_str = ' '
        query = temp_str.join(query)
        query_list = [query]

    if index_loc is None:
        index_loc = 'index'

    if weight_B is not None:
        rank_func = 1

    if rank_func == 1:
        B1, B2, B3, B4, B5 = get_B(weight_B)
        weighting = scoring.BM25F(B=B,
                                  K1=K1,
                                  title_B=B1,
                                  body_B=B2,
                                  category_B=B3,
                                  date_B=B4,
                                  rating_B=B5)
        rank_name = 'bm25f'
    elif rank_func == 2:
        weighting = scoring.TF_IDF()
        rank_name = 'tf-idf'
    elif rank_func == 3:
        weighting = scoring.Frequency()
        rank_name = 'frequency'
    else:
        weighting = scoring.BM25F(B=B, K1=K1)
        rank_name = 'bm25'

    ix = open_dir(index_loc)

    with ix.searcher(weighting=weighting) as searcher:
        # parser = QueryParser(schema=ix.schema)
        parser = MultifieldParser(
            ['title', 'body', 'category', 'date', 'rating'], schema=ix.schema)
        for this_query in query_list:
            que = parser.parse(this_query)
            print('\n')
            print('--', this_query)
            results = searcher.search(que, limit=number)
            if len(results) == 0:
                print(' ')
                print('no matched result. please try again.')
            else:
                for hit in results:
                    print(' ')
                    print('#', hit.rank, rank_name, 'score:',
                          round(hit.score, 5))
                    print('title:', hit['title'])
                    print('date:', hit['date'], 'rating:', hit['rating'],
                          'category:', hit['category'])
                    print('body:', hit['body'])
Example #18
    indexDir = basePath + "index/"
    docDir = basePath + "docs/"
    benchmarkPath = basePath + "benchmark-data.json"

    if arg["search"]:
        query = arg['<query>']
        spellCheck = not arg["--no-spell-check"]
        batch = arg["--batch"]
        limit = parseLimit(arg["--limit"])
        wildcard = not arg["--no-wildcard"]

        if arg['--tf-idf']:
            modelScoring = scoring.TF_IDF()
        elif arg['--freq']:
            modelScoring = scoring.Frequency()
        else:
            modelScoring = scoring.BM25F()

        if spellCheck:
            fixed = correct(query)
            if fixed != query:
                if batch or confirm("Did you mean: `" + fixed + "`?"):
                    query = fixed

        print("Searching for `" + query + "`")
        index = openIndex(indexDir)
        r = search(query, index, modelScoring, limit, wildcard)

        for result in r:
            with open(result["path"], "r") as f:
Example #19
 def generate_scores(self, query, k=None):
     '''Generate scores for a given query according to BM25, TF IDF (under a cosine similarity) and Frequency rank functions'''
     bm25 = self.rank(query, weighting=scoring.BM25F(), k=k)
     cos = self.rank(query, weighting=scoring.TF_IDF(), k=k)
     freq = self.rank(query, weighting=scoring.Frequency(), k=k)
     return bm25, cos, freq
Example #20
                                                          analyzer_names[i],schema)
            sw1_utils_schema.Fill_Empty_Schema(datasets[idx],directory_containing_the_index,\
                              datasets_len[idx])
            dir_idx_list.append(directory_containing_the_index)

###
### Open the Index
###

# for each of the datasets open index. And for each of the scoring functions
# create a searcher, process queries and save retrieved results into Q_results.
# Compute the MRR value for each of the search engine configurations
Q_Res = []
config_names = []
# scoring functions
sc_functions = [scoring.Frequency(), scoring.TF_IDF(), scoring.BM25F()]
sc_fun_name = ['Frequency', 'TF_IDF', 'BM25F']

for idx in range(len(datasets)):
    max_number_of_results = datasets_len[idx]
    print('Search Engine Configuration' + "\t" + "\t" + "\t" + 'MRR')
    if datasets[idx] == 'Cranfield_DATASET':
        Q_dict = Cran_Q
        GT_dict = Cran_GT
        for elem in dir_idx_list[:3]:
            directory_containing_the_index = elem
            ix = index.open_dir(directory_containing_the_index)
            ### Select a Scoring-Function
            for s in range(len(sc_functions)):
                scoring_function = sc_functions[s]
                ### Create a Searcher for the Index with the selected Scoring-Function
Example #21
import os
import torch
if __name__ == '__main__':
    Score = "bm25"  # bm25, tfidf, tf
    Pivots_N = 10    # number of plausible English mentions
    Search_N = 500  # number of searched entities for each plausible English mention
    InputIndexDir = "data_process/DBIndex2"
    input_data_file = "Release/output_toy_de.json"
    output_data_file = "Release/output_toy_de_search.json"
    #------------------------------------------------------
    if Score == "bm25":
        myscore = scoring.BM25F()
    elif Score =="tfidf":
        myscore = scoring.TF_IDF()
    elif Score == "tf":
        myscore = scoring.Frequency()
    elif Score == "multi":
        myscore = scoring.MultiWeighting(scoring.BM25F(), id=scoring.Frequency(), keys=scoring.TF_IDF())
    else:
        myscore = scoring.BM25F()

    #---------------Input Query----------------------
    schema = Schema(title=TEXT(stored=True, analyzer=StemmingAnalyzer()),
                    content=TEXT(stored=True))
    All_Result = []
    ix = open_dir(InputIndexDir)
    sf = torch.nn.Softmax(dim=0)
    alldata = read_json(input_data_file)
    with ix.searcher(weighting=myscore) as searcher:
        parser = QueryParser("title", ix.schema,group=qparser.OrGroup)
        for item in tqdm(alldata):
Example #22
def search_engine( analyzer = StemmingAnalyzer(), max_res = 150, multifield_flag = 1, \
                  only_title_flag = 0, \
                  directory_containing_the_index  = r"C:\Users\claba\Desktop\DMT works\HW_1\Index_part_1", \
                  query_dir = r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\cran_Queries.tsv", \
                  gt_dir = r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\cran_Ground_Truth.tsv", \
                  doc_dir = r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\DOCUMENTS\\", \
                  conf_label = "Not Specified",
                  mrr_eps = .32, \
                  k_interval_for_nDCG = range(1,151)):
   
    
    ###
    ### Create a Schema 
    ###
    schema = Schema(id=ID(stored=True), \
                    title = TEXT(stored=False, analyzer=analyzer),content=TEXT(stored=False, analyzer=analyzer))
    
    ###
    ### Create an empty-Index 
    ### according to the just defined Schema ;)
    ### 
    ix = create_in(directory_containing_the_index, schema)
    
    
    ###
    ### Get the query set (reset index due to missing values in the IDs)
    ###
    query_set = pd.read_csv(query_dir, engine = "python", sep = "\t", index_col="Query_ID").reset_index()
    
    
    ###
    ### Get the ground truth (little manipulation to group by query and align IDs)
    ###
    gt_tmp = pd.read_csv(gt_dir, engine = "python", sep = "\t")
    gt_tmp = gt_tmp.groupby('Query_id')['Relevant_Doc_id'].apply(lambda x: x.tolist()).to_dict()
    gt = defaultdict(list)
    j = 1
    for i in range(len(gt_tmp)):
        while(gt[i] == []):
            try:
                gt[i] = gt_tmp[j]
                j+=1
            except KeyError:
                j += 1
    
    
    
    number_of_queries = len(query_set)
    num_of_docs = 1400
    
    ###
    ### We'll iterate over the following lists to switch the SE scoring function and get their names
    ###
    scoring_functions_list = [scoring.PL2(), scoring.Frequency(), scoring.BM25F(), scoring.TF_IDF()]
    scoring_name = [re.findall(r"(?<=scoring\.)[\w\W]*(?=object)", str(score))[0] for score in scoring_functions_list]
    
    
    ###
    ### Fill the Index
    ###
    writer = ix.writer()
    for doc in range(num_of_docs):
        id_ = str(doc+1)
        title,content = doc_retriver(doc_dir+"______"+str(doc+1)+".html")
        writer.add_document(id=id_, title = title, content = content)
    writer.commit()
    
    
    
    ###
    ### This """tensor""" stores all the results we need. Its dimensions are #Results x #Queries x #SE_configs
    ###
    results_mat = np.zeros([max_res,number_of_queries,len(scoring_functions_list)])
    
   
    evaluations_summary = {} # Dict to store MRR and R-Precision distribution summaries
    ndcg = defaultdict(list) # defaultdict holding nDCG values for varying k, for all SEs with MRR > .32

    ###
    ### Run the SEs
    ###
    for idx_s,scorer in enumerate(scoring_functions_list):
        for idx,query in enumerate(query_set["Query"]):
            
            input_query = query
            
            ###
            ### Select a Scoring-Function
            ###
            scoring_function = scorer
            
            ###
            ### Create a QueryParser for 
            ### parsing the input_query based on the user's chosen SE configuration.
            ###
            if multifield_flag:
                qp = MultifieldParser(["title","content"], ix.schema)
                parsed_query = qp.parse(input_query)# parsing the query
            else:
                if only_title_flag:
                    qp = SimpleParser("title", ix.schema)
                    parsed_query = qp.parse(input_query)# parsing the query
                else:
                    qp = SimpleParser("content", ix.schema)
                    parsed_query = qp.parse(input_query)# parsing the query
                
            ###
            ### Create a Searcher for the Index
            ### with the selected Scoring-Function 
            ###
            searcher = ix.searcher(weighting=scoring_function)
            
            ###
            ### Perform a Search and store results
            ###
            results = searcher.search(parsed_query, limit=max_res)
            results_mat[0:len(results),idx,idx_s] = [hit["id"] for hit in results]
            searcher.close()
        mrr_res = mrr(results_mat[:,:,idx_s],gt)
        
        if mrr_res >= mrr_eps:
            
            ###
            ### Compute and summarize R-precision distro
            ###
            r_res = r_precision(results_mat[:,:,idx_s],gt)
            mean = np.mean(list(r_res.values()))
            first_q = np.percentile(list(r_res.values()),25)
            third_q = np.percentile(list(r_res.values()),75)
            median = np.median(list(r_res.values()))
            minr = min(list(r_res.values()))
            maxr = max(list(r_res.values()))
            evaluations_summary[conf_label+","+scoring_name[idx_s]] = [mrr_res,mean,minr,first_q,median,third_q,maxr]
            
            ###
            ### Compute nDCG@k for varying k and for each scoring function
            ###
            for k in k_interval_for_nDCG:
                tmp_res = np.mean(list(nDCG(results_mat[:,:,idx_s],gt,k = k).values()))
                ndcg[conf_label+","+scoring_name[idx_s]].append(tmp_res)
            
        else:
            evaluations_summary[conf_label+","+scoring_name[idx_s]] = [mrr_res]
        
        ###
        ### Just to see what's happening
        ###
        print("Configuration:"+conf_label+","+scoring_name[idx_s]+"==> MRR = "+str(mrr_res))
        
    return evaluations_summary, ndcg # For SEs with MRR < .32 the evaluation summary contains only the MRR
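search_engine() also reports nDCG@k through an nDCG() helper that is not shown. As a hedged sketch of the metric with binary relevance: DCG@k sums rel_i / log2(i + 1) over the top k positions and is normalised by the DCG of an ideal ranking.

import math

def ndcg_at_k(ranked_doc_ids, relevant_doc_ids, k):
    """Hedged sketch of nDCG@k with binary relevance; not the project's nDCG() helper."""
    dcg = sum(1.0 / math.log2(i + 1)
              for i, doc in enumerate(ranked_doc_ids[:k], start=1)
              if doc in relevant_doc_ids)
    ideal_hits = min(len(relevant_doc_ids), k)
    idcg = sum(1.0 / math.log2(i + 1) for i in range(1, ideal_hits + 1))
    return dcg / idcg if idcg > 0 else 0.0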
Example #23
    def __init__(self):
        try:
            self.ix = index.open_dir(INDEX_BASE_DIR)
        except Exception as e:
            logger.error("Could not open index file: %s" % e)
            logger.info(
                "To be able to search, an index has to be created first. Use index_website.py to create the index."
            )
            raise e

        self.scorers_dict = {
            SearchEngine.FREQUENCY:
            scoring.Frequency(),
            SearchEngine.BM25:
            scoring.BM25F(),
            SearchEngine.TF_IDF:
            scoring.TF_IDF(),
            SearchEngine.PL2:
            scoring.PL2(),
            SearchEngine.PAGERANK:
            scoring.Frequency(),
            # Change the scoring with the custom scoring, once implemented
            SearchEngine.CUSTOM:
            scoring.MultiWeighting(
                default=scoring.BM25F(),
                # content=scoring.PL2(),
                # content_stem=scoring.PL2()
            )
        }

        self.rankings = self.scorers_dict.keys()
        self.qp = MultifieldParser([
            "title_stem", "description_stem", "keywords_stem", "content_stem"
        ],
                                   schema=schema)

        recall = 1
        precision = 2
        fieldboosts = {
            "title": 2.0,
            "description": 1.3,
            "keywords": 1.5,
            "links_in_keywords": 1.5,
            "content": 1.0,
            "title_stem": 1.2,
            "description_stem": 1.1,
            "keywords_stem": 1.2,
            "links_in_keywords_stem": 1.1,
            "content_stem": 1.0
        }

        total_standard = sum([
            value for key, value in fieldboosts.items()
            if not key.endswith('_stem')
        ])
        total_stem = sum([
            value for key, value in fieldboosts.items()
            if key.endswith('_stem')
        ])

        for key, value in fieldboosts.items():
            if key.endswith('_stem'):
                fieldboosts[key] = (fieldboosts[key] /
                                    total_stem) * (recall /
                                                   (recall + precision))
            else:
                fieldboosts[key] = (fieldboosts[key] /
                                    total_standard) * (precision /
                                                       (recall + precision))

        self.qp_custom = MultifieldParser([
            "title", "description", "keywords", "links_in_keywords", "content",
            "title_stem", "description_stem", "keywords_stem",
            "links_in_keywords_stem", "content_stem"
        ],
                                          schema=schema,
                                          fieldboosts=fieldboosts)
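For reference, the boost normalisation above works out as follows with recall = 1 and precision = 2:

# non-stemmed fields share 2/3 of the weight, scaled by total_standard = 2.0 + 1.3 + 1.5 + 1.5 + 1.0 = 7.3
# stemmed fields share 1/3 of the weight, scaled by total_stem = 1.2 + 1.1 + 1.2 + 1.1 + 1.0 = 5.6
# e.g. fieldboosts["title"]      -> (2.0 / 7.3) * (2 / 3) ~= 0.18
#      fieldboosts["title_stem"] -> (1.2 / 5.6) * (1 / 3) ~= 0.07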