def create_freqDict_list(doc_info_list, w1, w2, w3):
    """Build per-document term-frequency and term-weight dictionaries.

    Question tokens are weighted w1, option tokens w2 and answer tokens w3;
    when a token occurs in more than one section, the weight of the section
    processed first (answer, then option, then question) is kept.
    """
    freq_dict_list = []
    for doc in doc_info_list:
        freq_dict = {}
        weight_dict = {}
        # Process the sections in priority order: answer, option, question.
        sections = [(tokenizer(doc['answer']), w3),
                    (tokenizer(doc['option']), w2),
                    (tokenizer(doc['question']), w1)]
        for tokens, weight in sections:
            for token in tokens:
                if token in freq_dict:
                    freq_dict[token] += 1
                else:
                    weight_dict[token] = weight
                    freq_dict[token] = 1

        freq_dict_list.append({'doc_id': doc['doc_id'],
                               'freq_list': freq_dict,
                               'weight_list': weight_dict,
                               'count': doc['count']})
    return freq_dict_list
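A minimal usage sketch, assuming a simple whitespace `tokenizer` as a stand-in (the real module supplies its own) and document dicts carrying the `doc_id`, `question`, `option`, `answer`, and `count` keys the function expects:

# Hypothetical stand-in; the real project defines its own tokenizer.
def tokenizer(text):
    return text.lower().split()

docs = [{'doc_id': 1,
         'question': 'what is tf idf',
         'option': 'term frequency inverse document frequency',
         'answer': 'a weighting scheme',
         'count': 9}]

# Answer tokens get w3, option tokens w2, question tokens w1.
result = create_freqDict_list(docs, w1=1, w2=2, w3=3)
print(result[0]['freq_list'])    # per-token counts across all three fields
print(result[0]['weight_list'])  # per-token weight of the first field it was seen in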
def get_scentences(data_dict):
    """Clean and tokenize every value list in data_dict, returning one token list per key."""
    scentences = []
    for key, values in data_dict.items():
        content = ''
        for value in values:
            clean_data = cleaning_data(value)
            content += clean_data + ' '
        lines_token = tokenizer(content)
        scentences.append(lines_token)
    return scentences
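A small usage sketch, with hypothetical stand-ins for `cleaning_data` (lowercase, strip punctuation) and `tokenizer` (whitespace split), and a `data_dict` mapping each key to a list of raw strings:

import string

def cleaning_data(text):   # hypothetical stand-in for the project's cleaner
    return text.lower().translate(str.maketrans('', '', string.punctuation))

def tokenizer(text):       # hypothetical stand-in for the project's tokenizer
    return text.split()

data_dict = {'doc1': ['Hello, World!', 'TF-IDF ranks documents.']}
print(get_scentences(data_dict))
# [['hello', 'world', 'tfidf', 'ranks', 'documents']]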
Example #3
from pprint import pprint

def searching(query_input, tf_idf):
    """Clean and tokenize the query, build its vector-space representation, and return the documents ranked by cosine similarity."""
    query_clean = cleaning_data(query_input)
    query_token = tokenizer(query_clean)
    # Build the query vector and the document vector space from the tf-idf table.
    query_vector, vsm = VSM(query_token, tf_idf)
    pprint(query_vector)
    sim = cosine_similarity(query_vector, vsm)
    ranked_doc = ranking(sim)
    pprint(ranked_doc)
    return ranked_doc
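The `VSM`, `cosine_similarity`, and `ranking` helpers come from elsewhere in this project, so the snippet above is not runnable on its own. Below is a self-contained sketch of the cosine-similarity-and-rank step it relies on, assuming the query and documents are represented as plain lists of term weights (the project's own vector representation may differ):

import math

def cosine_sim(vec_a, vec_b):
    # Dot product over shared dimensions divided by the product of the norms.
    dot = sum(a * b for a, b in zip(vec_a, vec_b))
    norm_a = math.sqrt(sum(a * a for a in vec_a))
    norm_b = math.sqrt(sum(b * b for b in vec_b))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)

query_vec = [0.0, 1.2, 0.8]
doc_vectors = {'doc1': [0.5, 1.0, 0.0], 'doc2': [0.0, 0.9, 0.9]}
similarities = {doc_id: cosine_sim(query_vec, vec) for doc_id, vec in doc_vectors.items()}
ranked = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
print(ranked)  # most similar document first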
def word_count(text):
    """Return the number of tokens produced by the module-level tokenizer."""
    tokens = tokenizer(text)
    return len(tokens)
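A quick usage sketch for this first variant, with a hypothetical whitespace `tokenizer` standing in for the project's own (the variant below differs only in resolving the tokenizer through the `processingdata` module):

def tokenizer(text):  # hypothetical stand-in
    return text.split()

print(word_count('term frequency inverse document frequency'))  # 5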
def word_count(text):
    """Return the number of tokens produced by processingdata.tokenizer."""
    tokens = processingdata.tokenizer(text)
    return len(tokens)