Code Example #1
    def run(self):

        self.tagList.clear()
        self.labelList.clear()

        self.status.config(text='Processing...')
        tag_str = self.tagsEntry.get()
        nb_str = self.nbEntry.get()

        # RETRIEVE THE INPUT VALUES
        self.tagList = tag_str.split(',')
        self.nb_images_wished = int(nb_str)

        # DEBUG TO CHECK IF THE PROGRAM GOT THE RIGHT DATA
        print('You asked for:', self.nb_images_wished, 'images')
        print('With the tags:')
        for i, tag in enumerate(self.tagList):
            print('\t-> ' + tag)
            # Label appears to be a project-specific wrapper widget (cf. the
            # commented-out Space below), not the stock tkinter.Label
            self.labelList.append(
                Label(self.tagsLabelFrame, tag,
                      self.progression_color))
            self.labelList[i].pack(side='top')
            self.progressBarList.append(
                ttk.Progressbar(self.progressionFrame,
                                length=300,
                                value=0,
                                maximum=100))  # the ttk option is 'maximum', not 'max'
            self.progressBarList[i].pack(side='top')
            # if i != (len(self.tagList) - 1):
            #     Space(self.progressionFrame, self.progression_color, 640, 10).pack(side='top')

        Retriever(self.tagList, self.nb_images_wished, self.progressBarList,
                  self.labelList).start()
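
The Retriever object is launched with .start(), which suggests it subclasses threading.Thread so the downloads run off the UI thread. A minimal sketch of what such a class could look like; the internals are not shown in the snippet, so everything below is an assumption (note also that touching Tk widgets from a worker thread is not officially thread-safe):

import threading

class Retriever(threading.Thread):
    # hypothetical skeleton: fetch images per tag and report progress
    def __init__(self, tags, nb_images, progress_bars, labels):
        threading.Thread.__init__(self)
        self.tags = tags
        self.nb_images = nb_images
        self.progress_bars = progress_bars
        self.labels = labels

    def run(self):
        for i, tag in enumerate(self.tags):
            for n in range(self.nb_images):
                # ... download image n for this tag here ...
                # advance the matching bar towards its maximum of 100
                self.progress_bars[i]['value'] = 100.0 * (n + 1) / self.nb_images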
Code Example #2
def task3b(model):
    output_directory = os.path.join(os.getcwd(), "output")
    stemmed_corpus = get_stemmed_corpus()
    with open('cacm_stem.query.txt', 'r') as f:
        stemmed_queries = f.readlines()

    I = Indexer.InvertedIndexer('')
    I.stemmed_indexer(stemmed_corpus)
    r = Retriever.Retriever('', I, os.getcwd())
    file_name = os.path.join(output_directory,
                             'task3b_' + model + '_stemmed.txt')
    f = open(file_name, 'w')
    # query numbers of the seven stemmed CACM queries, in file order
    query_numbers = [12, 13, 19, 23, 24, 25, 50]
    for query_no, each_query in zip(query_numbers, stemmed_queries):
        r.process_query(each_query)
        docs_and_scores = r.get_scores_for_docs(model, query_no)

        # save the top 100 results in TREC run-file format
        docs, scores = docs_and_scores
        for i in range(100):
            f.write(str(query_no)
                    + " Q0 "
                    + str(docs[i]) + ' '
                    + str(i + 1) + " "
                    + str(scores[i]) + " "
                    + model + "\n")
    f.close()
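
Each line the loop writes follows the standard TREC run-file layout: query ID, the literal Q0, document ID, rank, score, and a run tag (here the model name). An illustrative line, with a made-up document ID and score:

12 Q0 CACM-1410 1 18.237 bm25

Evaluation tools such as trec_eval consume exactly this six-column format.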
Code Example #3
File: server.py  Project: safirshahbaaz/Search_Engine
def my_form_post():

    query = request.form['text'].encode('utf-8')  # the query submitted via the form

    results = Retriever.runner(query)

    # keep the result as a single string, fine for a small number of results;
    # concatenate multiple results with <br> so the HTML shows them on new lines
    return render_template("results.html", name=results)
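
The snippet omits the Flask plumbing around the handler. A minimal sketch of the surrounding app, assuming a root URL rule and a POST form (both are assumptions, as is the results.html template):

from flask import Flask, request, render_template
import Retriever

app = Flask(__name__)

@app.route('/', methods=['POST'])  # the URL rule is an assumption
def my_form_post():
    query = request.form['text'].encode('utf-8')
    results = Retriever.runner(query)
    return render_template("results.html", name=results)

if __name__ == '__main__':
    app.run()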
Code Example #4
def task3a(model, raw_corpus_directory):
    project_directory = os.getcwd()
    p = Indexer.Parser()
    corpus_directory = p.build_corpus(raw_corpus_directory, stopped=True)
    output_directory = os.path.join(project_directory, "output")

    I = Indexer.InvertedIndexer(corpus_directory)
    I.ngram_indexer(1)  # builds a unigram index over the corpus
    r = Retriever.Retriever(
        corpus_directory, I, project_directory
    )  # create a Retriever, which bundles the different retrieval models

    os.chdir(raw_corpus_directory)
    os.chdir(os.pardir)
    f = open('cacm.query.txt', 'r')
    soup = BeautifulSoup(f.read(), 'html.parser')
    f.close()

    f_stop_words = open('common_words.txt', 'r')
    stop_words_list = f_stop_words.readlines()
    stop_words = [i.strip() for i in stop_words_list]
    f_stop_words.close()
    file_name = os.path.join(output_directory, 'task3a_' + model + '.txt')
    f = open(file_name, 'w')  # open file for writing results
    for _ in range(64):  # cacm.query.txt contains 64 queries
        query_no = (soup.find('docno')).text.encode(
            'utf-8')  # extract query number and query
        (soup.find('docno')).decompose()
        query = (soup.find('doc')).text.encode('utf-8')
        (soup.find('doc')).decompose()

        r.process_query(query, stopped=True,
                        stopwords=stop_words)  # parse the query
        # r.clean_content(query)
        docs_and_scores = r.get_scores_for_docs(
            model, int(query_no))  # retrieve relevant documents

        # save results into appropriate file
        docs = docs_and_scores[0]
        scores = docs_and_scores[1]
        for i in range(100):
            f.write(str(query_no)
                    + " Q0 "
                    + str(docs[i]) + ' '
                    + str(i + 1) + " "
                    + str(scores[i]) + " "
                    + model + "\n")
    f.close()
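
The query loop relies on a destructive-read pattern: find() returns the first remaining tag, and decompose() removes it so the next find() advances to the following query. A small self-contained illustration of the pattern:

from bs4 import BeautifulSoup

html = ("<doc><docno>1</docno> first query </doc>"
        "<doc><docno>2</docno> second query </doc>")
soup = BeautifulSoup(html, 'html.parser')
for _ in range(2):
    docno = soup.find('docno').text
    soup.find('docno').decompose()   # remove it so find() moves on
    text = soup.find('doc').text.strip()
    soup.find('doc').decompose()
    print(docno + ' -> ' + text)     # 1 -> first query, then 2 -> second query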
Code Example #5
# store argument into variables
query_file = ''
result_path = ''
for opt, val in optlist:
    if opt == '--query':
        query_file = val
    else:
        result_path = val

# extract feature from query image
query_img = cv2.imread(query_file)
feature_descriptor = FeatureDescriptor((8, 12, 3))
query_features = feature_descriptor.describe(query_img)

# perform retrieval against the index file (searches the stored feature database)
retriever = Retriever()
retrieval_results = retriever.search(query_features, limit=10)

# display query image
cv2.imshow("Query", query_img)

# loop over the top-`limit` retrieved images
for (image_id, distance) in retrieval_results:
    result_file = result_path + '/' + image_id + '.png'
    result = cv2.imread(result_file)
    print result       # debug: the raw pixel array
    print result_file  # debug: path of the file being shown
    cv2.imshow("Result", result)
    cv2.waitKey(0)
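
FeatureDescriptor((8, 12, 3)) suggests a 3D HSV color histogram with 8 hue, 12 saturation, and 3 value bins, a common descriptor for this kind of image retrieval. The class is not shown in the snippet, so the sketch below is an assumption about what describe() might do:

import cv2

class FeatureDescriptor(object):
    # hypothetical sketch: a flattened 3D HSV color histogram
    def __init__(self, bins):
        self.bins = bins  # e.g. (8, 12, 3): hue, saturation, value bins

    def describe(self, image):
        hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
        hist = cv2.calcHist([hsv], [0, 1, 2], None, self.bins,
                            [0, 180, 0, 256, 0, 256])
        cv2.normalize(hist, hist)  # normalize so image size does not matter
        return hist.flatten()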
Code Example #6
        else:
            word_count[each_term] = k
    top_terms = sorted(word_count.items(),
                       key=operator.itemgetter(1),
                       reverse=True)
    expanded_query_list = [i for i, j in top_terms][:len(query_terms) + n]
    expanded_query = " ".join(expanded_query_list)
    return expanded_query


current_directory = os.getcwd()

corpus_directory = os.path.join(current_directory, "processed_corpus")
I = Indexer.InvertedIndexer(corpus_directory)
I.ngram_indexer(1)
r = Retriever.Retriever(corpus_directory, I)

model = 'bm25'
# get the results from the previous runs (bm25 and tfidf)
file_name = 'task1_' + model + '.txt'
results_file_dir = os.path.join(current_directory, "task1")
results_file_dir = os.path.join(results_file_dir, file_name)

f = open(results_file_dir, 'r')
data = f.readlines()
f.close()
# parsed lines from the task1 output file
list_of_lines = [each_line.split() for each_line in data]
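
The fragment at the top of this example is the tail of a query-expansion routine: it counts term occurrences in the top-ranked documents, sorts them by frequency, and keeps len(query_terms) + n terms, i.e. pseudo-relevance feedback. A minimal self-contained sketch of that selection step (the function name and parameters are illustrative, not the project's API):

import operator

def expand_query(query_terms, feedback_terms, n=5):
    # count how often each term occurs in the feedback documents
    word_count = {}
    for term in feedback_terms:
        word_count[term] = word_count.get(term, 0) + 1
    top_terms = sorted(word_count.items(),
                       key=operator.itemgetter(1), reverse=True)
    # keep the original query length plus n expansion terms
    expanded = [term for term, _ in top_terms][:len(query_terms) + n]
    return " ".join(expanded)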
Code Example #7
File: task1.py  Project: lkodali16/Search-Engine
raw_corpus_directory = raw_input(
    "Enter the raw corpus directory (html files): ")
project_directory = os.getcwd()
output_directory = os.path.join(project_directory, "output")

# Parser (to process the raw corpus (no stopping))
p = Indexer.Parser()
corpus_directory = p.build_corpus(raw_corpus_directory)

# Indexer - Builds the inverted indexes for the processed corpus
I = Indexer.InvertedIndexer(corpus_directory)
I.ngram_indexer(1)  # builds a unigram index over the corpus

# Retriever - based on the model specified, this object can be
#             used to get the results.
r = Retriever.Retriever(corpus_directory, I, project_directory)

# Get the queries from the given file
query_dic = {}  # stores the queries; key = query ID, value = query text
os.chdir(project_directory)
f = open('cacm.query.txt', 'r')
soup = BeautifulSoup(f.read(), 'html.parser')
f.close()
for _ in range(64):  # cacm.query.txt contains 64 queries
    query_no = (soup.find('docno')).text.encode(
        'utf-8')  # extract query number and query
    (soup.find('docno')).decompose()
    query = (soup.find('doc')).text.encode('utf-8')
    (soup.find('doc')).decompose()
    query_dic[int(query_no)] = query
Code Example #8
def run_task(task, model, raw_corpus_directory):
    project_directory = os.getcwd()
    output_directory = os.path.join(project_directory, "output")

    # Parser (to process the raw corpus (no stopping))
    p = Indexer.Parser()
    corpus_directory = p.build_corpus(raw_corpus_directory)

    # Indexer - Builds the inverted indexes for the processed corpus
    I = Indexer.InvertedIndexer(corpus_directory)
    I.ngram_indexer(1)  # builds a unigram index over the corpus

    # Retriever - based on the model specified, this object can be
    #             used to get the results.
    r = Retriever.Retriever(corpus_directory, I, project_directory)

    # Get the queries from the given file
    query_dic = {}  # stores the queries; key = query ID, value = query text
    os.chdir(project_directory)
    f = open('cacm.query.txt', 'r')
    soup = BeautifulSoup(f.read(), 'html.parser')
    f.close()
    for _ in range(64):  # cacm.query.txt contains 64 queries
        query_no = (soup.find('docno')).text.encode(
            'utf-8')  # extract query number and query
        (soup.find('docno')).decompose()
        query = (soup.find('doc')).text.encode('utf-8')
        (soup.find('doc')).decompose()
        query_dic[int(query_no)] = query

    # task 1
    if task == 1:
        os.chdir(project_directory)
        if not os.path.exists(output_directory):
            os.mkdir(output_directory, 0755)
        os.chdir(output_directory)

        f = open('task1_' + model + '.txt', 'w')
        for query_no in sorted(query_dic):
            r.process_query(query_dic[query_no])  # parse the query
            docs_and_scores = r.get_scores_for_docs(
                model, query_no)  # retrieve relevant documents

            # save the top 100 results in TREC run-file format
            docs = docs_and_scores[0]
            scores = docs_and_scores[1]
            for i in range(100):
                f.write(str(query_no)
                        + " Q0 "
                        + str(docs[i]) + ' '
                        + str(i + 1) + " "
                        + str(scores[i]) + " "
                        + model + "\n")
        f.close()

    # task 2
    if task == 2:
        # read output files from task 1
        file_name = 'task1_' + model + '.txt'
        try:
            f = open(os.path.join(output_directory, file_name), 'r')
        except IOError:
            print "Run Task 1 before Task 2"
            exit()
        data = f.readlines()
        f.close()
        # parsed lines from the task1 output file
        list_of_lines = [each_line.split() for each_line in data]

        # task 1 results; key = query ID, value = list of retrieved doc IDs
        task1_output = {}
        for each_line in list_of_lines:
            task1_output.setdefault(int(each_line[0]), []).append(each_line[2])

        # get stopwords
        f_stop_words = open('common_words.txt', 'r')
        stop_words_list = f_stop_words.readlines()
        stop_words = [i.strip() for i in stop_words_list]
        f_stop_words.close()

        # get corpus
        os.chdir(corpus_directory)
        files_list = glob.glob('*.html')
        corpus = {}
        for each_file in files_list:
            doc_name = each_file[:-5]  # strip the '.html' extension
            text = open(each_file).read()
            corpus[doc_name] = text.split()

        file_name = 'expanded_queries_' + model + '.txt'
        f = open(os.path.join(output_directory, file_name), 'w')
        expanded_query_dic = {}
        for query_no, query in query_dic.viewitems():
            processed_query = r.process_query(query, True)
            expanded_query_dic[query_no] = Query_Expander(
                processed_query,
                task1_output[query_no],
                corpus,
                stopwords=stop_words)
            f.write(str(query_no) + " " + expanded_query_dic[query_no] + "\n")
        f.close()

        file_name = 'task2_' + model + '.txt'
        f = open(os.path.join(output_directory, file_name), 'w')
        for query_no, query in expanded_query_dic.viewitems():
            r.process_query(query)  # parse the query
            # r.clean_content(query)
            docs_and_scores = r.get_scores_for_docs(
                model, query_no)  # retrieve relevant documents

            # save the top 100 results in TREC run-file format
            docs = docs_and_scores[0]
            scores = docs_and_scores[1]
            for i in range(100):
                f.write(str(query_no)
                        + " Q0 "
                        + str(docs[i]) + ' '
                        + str(i + 1) + " "
                        + str(scores[i]) + " "
                        + model + "\n")
        f.close()
        print "Results stored in " + output_directory + " directory"
Code Example #9
                                                                 1]:
                index = length - i
                stemmed_corpus[doc_id] = total_stemmed_corpus[doc_id][:index]
                break

    return stemmed_corpus


stemmed_corpus = get_stemmed_corpus()
f = open('cacm_stem.query.txt', 'r')
stemmed_queries = f.readlines()
f.close()

I = Indexer.InvertedIndexer('')
I.stemmed_indexer(stemmed_corpus)
r = Retriever.Retriever('', I, os.getcwd())

f = open('task3b_' + model + '_stemmed.txt', 'w')
query_no = [12, 13, 19, 23, 24, 25, 50]
q_iter = 0
for each_query in stemmed_queries:
    r.process_query(each_query)
    docs_and_scores = r.get_scores_for_docs(model, query_no[q_iter])

    # save the top 100 results in TREC run-file format
    docs = docs_and_scores[0]
    scores = docs_and_scores[1]
    for i in range(100):
        f.write(str(query_no[q_iter])
                + " Q0 "
                + str(docs[i]) + ' '