Ejemplo n.º 1
0
    def reader(self, query_file_name):
        """Run every query in ``query_file_name`` through the searcher.

        For each line (one query per line) the query is parsed, scored via
        ``self.calculate`` (which fills ``self.result_docs`` with
        docID -> score), and the top-100 documents are appended in TREC
        format to ``stopped_likelihood_results.txt``.  All per-query result
        sets are finally dumped to a JSON baseline file.

        :param query_file_name: path to a UTF-8 text file, one query per line
        """
        query_counter = 1

        # key: query number (1-based); value: OrderedDict of docID -> score
        results = dict()

        with open(query_file_name, 'r', encoding='utf-8') as q_file:
            for line in q_file:
                # keep the raw text (incl. trailing newline) for the report header
                raw_query = line
                # parse the input
                query = parse_query(raw_query)
                # send it to the searcher; scores land in self.result_docs
                self.calculate(query)

                # rank documents by score, highest first
                result_set = sorted(self.result_docs.items(),
                                    key=itemgetter(1),
                                    reverse=True)

                out_path = path.join(path.dirname(path.abspath(__file__)),
                                     "stopped_likelihood_results.txt")

                top_result_set = OrderedDict(result_set[:100])

                with open(out_path, 'a', encoding='utf-8') as out_file:
                    out_file.write("\nResults for Query: " + raw_query + "\n\n")
                    out_file.write(
                        "{0:5} {1:<3} {2:<10} {3:<5} {4:<10} {5}\n".format(
                            "Query", "Q0", "Doc_ID", "Rank", "Score",
                            "System Name"))

                    rank_counter = 1
                    for doc_id, score in top_result_set.items():
                        out_file.write(
                            "{0:>5} {1:<3} {2:<10} {3:<5} {4:<10} {5}\n".
                            format(
                                query_counter, "Q0", doc_id, rank_counter,
                                round(score, 3),
                                "Smoothed_Query_Likelihood_Model_Stopped_with_1Grams"
                            ))
                        rank_counter += 1

                # reset per-query state before scoring the next query
                self.result_docs.clear()
                print("Output generated for Query-" + str(query_counter))
                results[query_counter] = top_result_set
                query_counter += 1

        with open("sq_likelihood_stopped_baseline_results.json",
                  "w+",
                  encoding='utf-8') as json_file:
            json.dump(results, json_file)
def main():
    """Generate the HTML snippet report: for every query, score and print
    the sentences of each of its result documents."""
    global sentences, sentence_score, stopwords, queries
    queries = genQueries()
    stopwords = getStopWords()

    for q_num, query_text in enumerate(queries, start=1):
        docs = results_dict[str(q_num)]
        print("Finding results for query " + str(q_num))
        # header: query number and query text
        file1.write("<br><h1>" + str(q_num) + ". " + str(query_text) +
                    "</h1><br>")

        for doc_rank, d in enumerate(docs, start=1):
            documentname = str(
                path.join(path.pardir,
                          path.join("Material/cacm.tar", d + ".html")))

            # write doc rank and doc name to the html file
            file1.write("<i>" + str(doc_rank) + "." + documentname + "</i>")

            sentences = breakDocIntoSentences(documentname)
            query_terms = parse_query(query_text)
            important_terms, stop_terms = classifyQueryTerms(query_terms)

            sentence_score, stopword_score = scoreSentences(
                sentences, important_terms, stop_terms)
            sortAndPrint(sentence_score, stopword_score, important_terms,
                         stop_terms)
Ejemplo n.º 3
0
def generate_query_vector(query):
    """Build a TF-IDF weighted vector for *query*.

    Sets the module-level ``query_vector`` (term -> weight) and
    ``magnitude_query_vector`` (Euclidean norm of that vector).
    Terms absent from the unigram index ``uni_index`` get weight 0;
    an empty query yields an empty vector with magnitude 0.0.
    """
    global query_vector
    global magnitude_query_vector
    query_vector = {}
    q_terms = parse_query(query)
    num_terms = len(q_terms)

    # count of every term in the query (dict.get beats the if/else dance)
    q_terms_count = {}
    for term in q_terms:
        q_terms_count[term] = q_terms_count.get(term, 0) + 1

    # weight every unique term: normalized term frequency * IDF
    for term, count in q_terms_count.items():
        if term in uni_index:
            query_vector[term] = (count / num_terms) * (IDF[term])
        else:
            query_vector[term] = 0

    # Euclidean norm of the weight vector
    magnitude_query_vector = sum(w ** 2 for w in query_vector.values()) ** 0.5
Ejemplo n.º 4
0
def process(query):
    """Add character-level noise to *query* and append the noisy variant
    to the module-level ``new_queries`` list.

    Roughly 40% of the query terms are chosen at random; the random
    indices are drawn only from the first half of the terms once sorted
    longest-first, so longer terms are more likely to be perturbed.
    Each chosen term is replaced (first occurrence only) by a shuffled
    version produced by ``shuffle_token``.
    """
    global new_queries

    q_term_length = {}
    chosen = []          # indices (into the length-sorted term list) to perturb
    replace_terms = {}   # original term -> shuffled replacement

    # get individual terms in the query
    query_terms = parse_query(query)

    # get length of the query string
    query_length = len(query_terms)

    # get length of individual query terms
    for q in query_terms:
        q_term_length[q] = len(q)

    # sort the query terms according to their lengths in non-increasing order
    sorted_q_terms = OrderedDict(
        sorted(q_term_length.items(), key=operator.itemgetter(1),
               reverse=True))

    # number of terms to perturb: 40% of the term count, rounded down
    affect_words_num = math.floor(0.4 * query_length)

    while len(chosen) != affect_words_num:
        # the random integer is generated in the range between 0 to query length because
        # when the query terms are sorted in the decreasing order of their lengths, we assume that longer terms appear
        # in the first half
        num = random.randint(0, math.ceil(query_length / 2) - 1)

        if num not in chosen:
            chosen.append(num)

    items = list(sorted_q_terms.items())

    # Calculate affect range based on the
    # query term size.
    q_list = query.split()
    avg = sum(map(len, q_list)) / len(q_list)

    for index in chosen:
        token = items[index][0]

        # the token should be replaced by the shuffled word in the query
        replace_terms[token] = shuffle_token(token, avg)

    qy = query
    for term in replace_terms:
        # replace the query term with the noisy term
        qy = qy.replace(term, replace_terms[term], 1)

    new_queries.append(qy)
Ejemplo n.º 5
0
    def search(self, query):
        """Parse *query*, score all documents, and return (docID, score)
        pairs sorted by descending score."""
        # parse the input and expose the terms to the scorer
        self.query = parse_query(query)
        self.terms = self.query

        # fresh score table for this query
        self.results_dict = {}

        # compute the per-document scores
        self.process()

        # highest-scoring documents first
        return sorted(self.results_dict.items(),
                      key=operator.itemgetter(1),
                      reverse=True)
Ejemplo n.º 6
0
def process(q):
    """Spell-correct every term of query *q* and return the corrected query.

    Terms of three characters or fewer are kept as-is; longer terms are
    passed through ``correction``.
    """
    corrected_terms = [
        correction(term) if len(term) > 3 else term
        for term in parse_query(q)
    ]

    # form a new query
    return ' '.join(corrected_terms)
def breakDocIntoSentences(file_name):
    """Read the HTML document *file_name* and split its visible text into
    pseudo-sentences of at most ``sentence_size`` words.

    :param file_name: path to an HTML file
    :return: list of word lists; an empty document yields an empty list
    """
    # close the file handle when done (the original leaked it)
    with codecs.open(file_name, 'r', encoding="utf8") as doc_file:
        # BeautifulSoup parses the whole stream eagerly at construction
        soup = BeautifulSoup(doc_file, "html.parser")

    # strip the markup and drop non-ASCII characters
    text = str(soup.getText()).encode('ascii', 'ignore')
    text = text.decode('utf-8', 'ignore')

    # tokenize, then group the words into fixed-size chunks
    words = parse_query(text)

    sentences = []
    sentence = []
    for word in words:
        if len(sentence) < sentence_size:
            sentence.append(word)
        else:
            sentences.append(sentence)
            sentence = [word]

    # keep the trailing partial sentence, but never append an empty one
    # (the original returned [[]] for an empty document)
    if sentence:
        sentences.append(sentence)

    return sentences
Ejemplo n.º 8
0
def index(request):
    '''
    Project's Main View

    It handles three cases:
    1. Landing page
    2. Preliminary results (top 10 places according to CheckIns)
    3. Optimized Itinerary
    '''

    cities = City.objects.order_by('city_name')

    # distinct user-facing categories (with counts) for the form dropdown
    categories_to_display = Category.objects.values(
        'user_category', 'user_cat_id').annotate(
            dcount=Count('user_category')).order_by('user_category')

    user_query = ''

    context = {
        'cities': cities,
        'categories': categories_to_display,
        'user_query': user_query
    }

    #T1: user accesses application for the first time
    if request.method == 'GET' and 'query_id_from_form' not in request.GET:
        return render(request, 'itinerary/main_form.html', context)

    #T2: user sends Main Form to search for places
    if request.method == 'POST':

        initial_form_data = request.POST

        #show error page if form is empty
        if initial_form_data.get('query_city') == '':
            return render(request, 'itinerary/input_error1.html', context)

        proper_city = parse_city(initial_form_data.get('query_city'))

        # PEP 8: comparisons to None use identity, not equality
        if proper_city is None:
            return render(request, 'itinerary/input_error2.html', context)

        proper_query_obj, user_categories = parse_query(
            initial_form_data, proper_city)

        places_list_to_user = places_from_foursquare(proper_query_obj,
                                                     user_categories,
                                                     multi=True)

        context['lista'] = places_list_to_user
        context['query_id'] = proper_query_obj.id
        context['user_query'] = proper_query_obj

        return render(request, 'itinerary/preliminary_list.html', context)

    #T3: user sends Second Form to get optimized places
    if request.method == 'GET' and 'query_id_from_form' in request.GET:
        #receive 2nd form and process logans code
        second_form_data = request.GET

        query_id = second_form_data.get('query_id_from_form')

        user_query = UserQuery.objects.get(id=query_id)

        # id_str -> [Place, user preference]; form fields are named 'ur_<id>'
        places_preferences = {}

        for key in second_form_data:
            if 'ur_' in key:
                id_place = key[3:]
                places_preferences[id_place] = [
                    Place.objects.get(id_str=id_place), second_form_data[key]
                ]

        optimal_places_order, transit_exceptions = optimize(
            user_query, places_preferences)

        # attach the scheduled time window (and any transit warning) to
        # each Place before handing the list to the template
        final_places_list = []
        for id_place, begin_time, end_time in optimal_places_order:
            place_aux = Place.objects.get(id_str=id_place)
            place_aux.begin_time = begin_time
            place_aux.end_time = end_time
            if id_place in transit_exceptions:
                place_aux.exception = transit_exceptions[id_place]
            final_places_list.append(place_aux)

        context['final_places_list'] = final_places_list
        context['transit_exceptions'] = transit_exceptions
        context['query'] = user_query

        return render(request, 'itinerary/final_results.html', context)