Beispiel #1
0
def read_queries(filename):
    """Run every query listed in *filename* and write one report per query.

    Each non-blank line of the file has the form
    ``<target_doc_id> <query text>``.  For every such line the query text is
    wrapped in a ``Query`` and searched with a single shared ``IR`` instance.
    The zero-based position of the target document in the result list is
    looked up, and a report holding that rank (one-based) plus the first 15
    results is written to ``../tests/<line>.txt``, where ``<line>`` is the
    complete input line.

    Blank lines, including the empty string that ``split("\\n")`` yields for
    a trailing newline, are skipped instead of aborting the whole run.

    If the target document is not among the results the report states
    rank 0 (``rank`` stays at -1 and is printed as ``rank + 1``).

    Args:
        filename: path of the text file holding one query per line.

    Raises:
        ValueError: if a non-blank line has no space separator or its first
            field is not an integer.
        IOError: if the input file or a report file cannot be opened.
    """
    divider = '\n==============================================================================\n'
    max_reported = 15

    # Read everything up front so the input handle is released even if a
    # later search or write fails.
    with open(filename, 'r') as in_:
        queries = in_.read().split("\n")

    ir = IR()

    for query in queries:
        if not query.strip():
            continue

        doc_id_text, separator, query_text = query.partition(' ')
        if not separator:
            raise ValueError(
                "malformed query line (expected '<doc id> <query>'): %r"
                % (query,))
        target_doc_id = int(doc_id_text)
        q = Query(query_text)

        documents = ir.search(q)

        # The rank is computed over the full result list, before truncation.
        rank = -1
        for position, doc in enumerate(documents):
            if doc.id == target_doc_id:
                rank = position
                break
        top_documents = documents[0:max_reported]

        result = (
            'target document is rank ' + str(rank + 1) + divider +
            divider.join([
                'DOC ID:\n\t' + str(doc.id) + '\n' + doc.fancy_str()
                for doc in top_documents
            ]))
        result += '\n\n\n'

        # Build the report first, then open the file, so a formatting error
        # does not leave an open (and empty) report behind.
        with open('../tests/' + query + '.txt', 'w') as out_:
            out_.write(result)
Beispiel #2
0
def main():
    """Run the interactive console of the IR tool.

    Builds the ``IR`` index once (reporting how long it took), prints the
    banner, and then repeatedly reads a query from standard input.  For a
    query with results, a second prompt lets the user page forward ('+'),
    print a single document by its one-based number, export all results to
    ``../dump/<query> (<timestamp>).txt`` ('e'), start a new query ('r'),
    show the help ('h') or quit ('q').

    A document number is accepted only if it lies within both the window
    reached so far (``k + 10``) and the actual number of results; anything
    else is reported as out of bounds instead of raising ``IndexError``.

    Output uses the single-argument parenthesised ``print`` form, which
    behaves exactly like the print statement on Python 2.
    """
    print("Loading index...")
    start = time.time()
    ir = IR()
    elapsed = time.time() - start
    print("Index loaded in " + str(elapsed) + " seconds")
    print("\n============================================================")
    print("======================== IR Machine ========================")
    print("============================================================")
    print("= A IR tool to query over the Reuters database.            =")
    print("= More details about the database at http://bit.ly/1F8AFcO =")
    print("= Source code avaliable at http://bit.ly/1mezIcN           =")
    print("= Authors:                                                 =")
    print("=     @Joao Gabriel Santiago Mauricio de Abreu             =")
    print("=     @Natalia Paola de Vasconcelos Cometti                =")
    print("=     @Victor Felix Pimenta                                =")
    print("= Since: 12/11/2015                                        =")
    print("============================================================\n")

    # The options prompt never changes, so assemble it once.
    rule = "============================================================\n"
    menu = (
        rule +
        "Type:\n" +
        " '+' -> more results\n" +
        " <DOC#> -> print content\n" +
        " 'e' -> export all results to a file\n" +
        " 'r' -> query again\n" +
        " 'q' -> quit\n" +
        " 'h' -> help\n" +
        rule)
    separator = '\n==============================================================================\n'

    should_quit = False
    while not should_quit:
        # Offset of the result page currently shown; reset for every query.
        k = 0
        input_ = raw_input("Type your query ('q' to quit, 'h' to help): ")
        if input_ == 'q':
            break
        if input_ == 'h':
            _help()
            continue

        query = Query(input_)
        start = time.time()
        documents = ir.search(query)
        elapsed = time.time() - start

        if len(documents) == 0:
            print("No results found for '" + input_ + "'")
            continue

        print("\n" + str(len(documents)) + " results found in " +
              str(elapsed) + " seconds:\n")
        # NOTE(review): _print is defined elsewhere; presumably it shows the
        # results starting at offset k -- confirm against its definition.
        _print(k, documents)

        while True:
            opt = raw_input(menu)
            if opt == '+':
                k += 10
                _print(k, documents)
            elif opt == 'e':
                stamp = datetime.datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d %H:%M:%S')
                result = separator.join(
                    [doc.fancy_str() for doc in documents])
                result += '\n\n\n'
                # 'with' closes the dump file even if the write fails.
                with open("../dump/" + input_ + " (" + stamp + ").txt",
                          'w') as f:
                    f.write(result)
            elif opt == 'r':
                break
            elif opt == 'q':
                should_quit = True
                break
            elif opt == 'h':
                _help()
            else:
                # Anything else must be a one-based document number.
                try:
                    idx = int(opt)
                except ValueError:
                    print("Error: Invalid input!")
                else:
                    # Cap at the real result count: k + 10 alone can exceed
                    # it when fewer results exist than the window covers.
                    upper = min(k + 10, len(documents))
                    if 0 < idx <= upper:
                        print(documents[idx - 1].fancy_str())
                    else:
                        print("Error: Document number out of bounds!")