Example #1
import getopt
import sys
from os import path
from timeit import default_timer as timer

# Indexer, Tokenizer, Searcher, Ranker and Evaluation are the project's own
# modules; they are assumed to be importable from the surrounding package.


def main(argv):
    collectionFile = ''
    tokenizerType = ''
    queriesFile = ''
    rankType = ''
    start = []
    end = []
    try:
        opts, args = getopt.getopt(argv, "hf:t:q:r:", ["collectionFile=", "tokenizerType=", "queriesFilePath=",
                                                       "rankType="])
    except getopt.GetoptError:
        print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
              '-r <rankType: 0 - TF-IDF, 1 - BM25>')
        sys.exit()

    if len(opts) != 4:
        print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
              '-r <rankType: 0 - TF-IDF, 1 - BM25>')
        sys.exit()

    for opt, arg in opts:
        if opt == '-h':
            print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
                  '-r <rankType: 0 - TF-IDF, 1 - BM25>')
            sys.exit()
        elif opt in ("-f", "--collectionFile"):
            if not path.exists(arg):
                print('Incorrect path to collection file.')
                sys.exit()
            collectionFile = arg
        elif opt in ("-t", "--tokenizerType"):
            if arg != '0' and arg != '1':
                print('Incorrect tokenizer type. Simple tokenizer: 0, Better tokenizer: 1.')
                sys.exit()
            tokenizerType = arg
        elif opt in ("-q", "--queriesFilePath"):
            if not path.exists(arg):
                print('Incorrect path to queries file.')
                sys.exit()
            queriesFile = arg
        elif opt in ("-r", "--rankType"):
            if arg != '0' and arg != '1':
                print('Incorrect rank type. TF-IDF: 0, BM25: 1.')
                sys.exit()
            rankType = arg

    # Indexer: build the inverted index over the collection and persist it to disk
    Indexer(collectionFile, tokenizerType).writeIndexToFile('index')

    with open(queriesFile, 'r') as f:
        queries = f.read().splitlines()

    scores = []

    if tokenizerType == '0':  # simple
        tokenizer = Tokenizer.SimpleTokenizer('')
    else:  # better
        tokenizer = Tokenizer.BetterTokenizer('')

    for query in queries:

        # Query Operations
        tokenizer.changeText(query)
        queryTerms = tokenizer.getTerms()

        # Searcher
        documentsInfo, avgDocLen = Searcher.searchDocuments(queryTerms, 'index')

        # Ranker
        ranker = Ranker(documentsInfo, avgDocLen)

        # Start time (latency purpose)
        start.append(timer())
        # If rankType = 0 (tf-idf)
        if rankType == '0':
            scores += [ranker.lnc_ltc()]
        # If rankType = 1 (BM25)
        else:
            scores += [ranker.bm25(1.2, 0.75)]

        # End time (latency purpose)
        end.append(timer())

    # Evaluation
    Evaluation.getResults('./data/queries.relevance.txt', queries, scores, start, end)
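
For context, ranker.bm25(1.2, 0.75) above presumably passes the conventional Okapi BM25 parameters k1 = 1.2 and b = 0.75. A minimal, self-contained sketch of that scoring scheme, assuming a simple in-memory index layout (the postings and doc_lengths structures below are illustrative, not the project's actual Ranker internals):

import math

def bm25_scores(postings, doc_lengths, avg_doc_len, query_terms, k1=1.2, b=0.75):
    """Okapi BM25 (illustrative sketch).

    postings: dict term -> {doc_id: term frequency}  (assumed layout)
    doc_lengths: dict doc_id -> document length in tokens
    Returns (doc_id, score) pairs, best first.
    """
    n_docs = len(doc_lengths)
    scores = {}
    for term in query_terms:
        docs = postings.get(term, {})
        if not docs:
            continue
        # Plain idf = log(N / df); BM25 variants differ slightly here.
        idf = math.log(n_docs / len(docs))
        for doc_id, tf in docs.items():
            norm = k1 * (1 - b + b * doc_lengths[doc_id] / avg_doc_len)
            scores[doc_id] = scores.get(doc_id, 0.0) + idf * tf * (k1 + 1) / (tf + norm)
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

With k1 = 1.2 and b = 0.75 this mirrors the ranker.bm25(1.2, 0.75) call in the example above.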
Example #2
import getopt
import sys
import timeit
from os import path
from timeit import default_timer as timer

# Indexer, Tokenizer, Searcher, Ranker and Evaluation are again the project's
# own modules, assumed importable from the surrounding package.


def main(argv):

    # ----------------------------------------- HANDLING PROGRAM INPUT -------------------------------------------------
    collectionFile = ''
    tokenizerType = ''
    queriesFile = ''
    rankType = ''
    storePos = ''
    proximity = ''
    try:
        opts, args = getopt.getopt(argv, "hf:t:q:r:p:b:", ["collectionFile=", "tokenizerType=", "queriesFilePath=",
                                                           "rankType=", "storePositions=", "proximityBoost="])
    except getopt.GetoptError:
        print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
              '-r <rankType: 0 - TF-IDF, 1 - BM25> -p <storePositions: 0 - No, 1 - Yes> '
              '-b <proximityBoost: 0 - No, 1 - Yes>')
        sys.exit()

    if len(opts) != 6:
        print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
              '-r <rankType: 0 - TF-IDF, 1 - BM25> -p <storePositions: 0 - No, 1 - Yes> '
              '-b <proximityBoost: 0 - No, 1 - Yes>')
        sys.exit()

    for opt, arg in opts:
        if opt == '-h':
            print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
                  '-r <rankType: 0 - TF-IDF, 1 - BM25> -p <storePositions: 0 - No, 1 - Yes> '
                  '-b <proximityBoost: 0 - No, 1 - Yes>')
            sys.exit()
        elif opt in ("-f", "--collectionFile"):
            if not path.exists(arg):
                print('Incorrect path to collection file.')
                sys.exit()
            collectionFile = arg
        elif opt in ("-t", "--tokenizerType"):
            if arg != '0' and arg != '1':
                print('Incorrect tokenizer type. Simple tokenizer: 0, Better tokenizer: 1.')
                sys.exit()
            tokenizerType = arg
        elif opt in ("-q", "--queriesFilePath"):
            if not path.exists(arg):
                print('Incorrect path to queries file.')
                sys.exit()
            queriesFile = arg
        elif opt in ("-r", "--rankType"):
            if arg != '0' and arg != '1':
                print('Incorrect rank type. TF-IDF: 0, BM25: 1.')
                sys.exit()
            rankType = arg
        elif opt in ("-p", "--storePositions"):
            if arg != '0' and arg != '1':
                print('\nIncorrect store positions choice. No: 0, Yes: 1.')
                sys.exit()
            storePos = arg
        elif opt in ("-b", "--proximityBoost"):
            if arg != '0' and arg != '1':
                print('\nIncorrect proximity boost choice. No: 0, Yes: 1.')
                sys.exit()
            proximity = arg

    # ----------------------------------------------- INDEXER ----------------------------------------------------------
    indexer = Indexer(collectionFile, tokenizerType, storePos == '1')

    start = timeit.default_timer()
    indexer.index()
    stop = timeit.default_timer()

    print('Indexing total time - {} tokenizer: {} min and {} seconds'.format(
        "simple" if tokenizerType == "0" else "better", (stop - start) // 60, (stop - start) % 60))

    with open(queriesFile, 'r') as f:
        queries = f.read().splitlines()

    scores = []

    if tokenizerType == '0':  # simple
        tokenizer = Tokenizer.SimpleTokenizer('')
    else:  # better
        tokenizer = Tokenizer.BetterTokenizer('')

    start_queries = []
    end_queries = []
    time_searcher = 0
    time_ranker = 0
    for query in queries:

        # --------------------------------------- QUERY OPERATIONS -----------------------------------------------------
        tokenizer.changeText(query)

        #queryTerms, queryTermsPositions = tokenizer.getTerms(withPositions=True if storePos == '1' else False)
        queryTerms = tokenizer.getTerms(withPositions=False)

        # ------------------------------------------- SEARCHER ---------------------------------------------------------
        start = timeit.default_timer()
        documentsInfo, avgDocLen = Searcher.searchDocuments(queryTerms, 'index', storePos == '1')
        stop = timeit.default_timer()
        time_searcher += stop - start

        # -------------------------------------------- RANKER ----------------------------------------------------------
        start = timeit.default_timer()
        ranker = Ranker(documentsInfo, avgDocLen)

        # Start time (latency purpose)
        start_queries.append(timer())
        # If rankType = 0 (tf-idf)
        if rankType == '0':
            # If proximity = 1 (Proximity Boost)
            if proximity == '1':
                scores += [ranker.proximity_boost(ranker.lnc_ltc(), queryTerms)]
            else:
                scores += [ranker.lnc_ltc()]
        # If rankType = 1 (BM25)
        else:
            # If proximity = 1 (Proximity Boost)
            if proximity == '1':
                scores += [ranker.proximity_boost(ranker.bm25(1.2, 0.75), queryTerms)]
            else:
                scores += [ranker.bm25(1.2, 0.75)]

        stop = timeit.default_timer()
        time_ranker += stop - start

        # End time (latency purpose)
        end_queries.append(timer())

    print('Searching time for all queries: {} min and {} seconds'.format(time_searcher // 60, time_searcher % 60))
    print('Ranking time for all queries: {} min and {} seconds'.format(time_ranker // 60, time_ranker % 60))

    # Evaluation
    Evaluation.getResults('./data/queries.relevance.txt', queries, scores, start_queries, end_queries)
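
The proximity boost applied by ranker.proximity_boost(...) is project-specific and not shown here. One common approach, sketched below under the assumption that positional postings are available, multiplies each document's base score by a factor that grows as the smallest window covering all query terms shrinks (all names and the data layout are illustrative, not the project's actual API):

import heapq

def smallest_window(position_lists):
    """Length of the smallest span containing one position from every list."""
    heap = [(positions[0], i, 0) for i, positions in enumerate(position_lists)]
    heapq.heapify(heap)
    right = max(positions[0] for positions in position_lists)
    best = right - heap[0][0] + 1
    while True:
        _, i, j = heapq.heappop(heap)
        if j + 1 == len(position_lists[i]):
            return best
        nxt = position_lists[i][j + 1]
        right = max(right, nxt)
        heapq.heappush(heap, (nxt, i, j + 1))
        best = min(best, right - heap[0][0] + 1)

def proximity_boost(base_scores, term_positions, alpha=1.0):
    """Boost scores by query-term proximity (illustrative sketch).

    base_scores: dict doc_id -> base score (e.g. from lnc.ltc or BM25)
    term_positions: dict doc_id -> list of sorted position lists, one per query term
    Documents missing any query term keep their base score.
    """
    boosted = {}
    for doc_id, score in base_scores.items():
        positions = term_positions.get(doc_id)
        if not positions or any(not p for p in positions):
            boosted[doc_id] = score
            continue
        # A tighter window means the query terms co-occur closely -> larger boost.
        boosted[doc_id] = score * (1 + alpha / smallest_window(positions))
    return boosted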