def run4(querylist):
    documents = []
    namelist = []
    docdir = "./documents/"
    # read every file under docdir; keep the base filename as the document ID
    for root, dirs, files in os.walk(docdir):
        for filename in files:
            with open(os.path.join(root, filename), 'r') as f:
                namelist.append(os.path.splitext(filename)[0])
                documents.append(f.read())

    vectorSpace = VectorSpace()
    result = vectorSpace.searchtfjab(querylist, documents)
    order = sorted(result, reverse=True)
    print("TF-IDF Weighting + Jaccard Similarity:\n")
    print("DocID    Score")
    for i in range(5):
        print("%s   %.6f" % (namelist[result.index(order[i])], order[i]))
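A minimal sketch of calling run4, assuming the documents/ folder sits next to the script; the query string below is illustrative and is tokenized into a term list before the call:

if __name__ == '__main__':
    # split a free-text query into individual terms for run4
    run4("drill wood sharp".split())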
Example #2
def main(argv):
    doc_list = ''
    query = ''
    relevanceType = ''
    weightType = ''
    feedback = False
    try:
        opts, args = getopt.getopt(argv, "hd:w:r:q:f",
                                   ["doc=", "weight=", "relevance=", "query=", "feedback"])
    except getopt.GetoptError:
        print(
            'main.py -d <doc_list_folder> -w <weight_type> -r <relevance_type> -q \'<querylist>\''
        )
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(
                'main.py -d <doc_list_folder> -w <weight_type> -r <relevance_type> -q \'<querylist>\''
            )
            sys.exit()
        elif opt in ("-d", "--doc"):
            doc_list = arg
        elif opt in ('-q', '--query'):
            query = arg
        elif opt in ('-w', '--weight'):
            weightType = arg
        elif opt in ('-r', '--relevance'):
            relevanceType = arg
        elif opt in ('-f', '--feedback'):
            feedback = True
    query = query.split(' ')
    documents, doc_name = docConverter(doc_list)
    v = VectorSpace(documents, query, weightType)
    scores = v.search(relevanceType)
    if feedback:
        first_doc = doc_name[scores.index(max(scores))]
        f = open(os.path.join(doc_list, f'{first_doc}.txt'), 'r')
        ret = f.read()
        fq = v.feedback(ret)
        f.close()
        weightType = 'Feedback Queries + ' + weightType
        showResults(fq, doc_name, weightType, relevanceType)
    else:
        showResults(scores, doc_name, weightType, relevanceType)
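A hedged usage sketch for this command line, assuming the script is named main.py and that weight and relevance identifiers such as tfidf and cosine are ones this VectorSpace class accepts (the folder name and query below are illustrative):

# rank the documents in ./documents against a query
#   python main.py -d ./documents -w tfidf -r cosine -q 'drill wood sharp'
# add -f to expand the query with feedback from the top-ranked document
#   python main.py -d ./documents -w tfidf -r cosine -q 'drill wood sharp' -f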
Example #3
def load_data(file_dir):
    files = os.listdir(file_dir)

    documents = []

    # update documents
    for f in files:
        with open("{}/{}".format(file_dir, f)) as file:
            documents.append(file.read())

    vectorSpace = VectorSpace(documents)

    return vectorSpace, files
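A short usage sketch, assuming a ./documents folder of plain-text files as in the other examples here; it only builds the model and reports how many files were read:

vectorSpace, files = load_data("./documents")
print("loaded {} documents".format(len(files)))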
Example #4
def get_vsm(all_content, all_id):
    assert len(all_content) >= 1, "The collection should contain at least one document"
    try:
        print('Try to load model......')
        with open(MODEL_PATH, 'rb') as reader:
            vector_space = pickle.load(reader)
        print("Model Found !")
    except FileNotFoundError:
        print('Model not found......')
        vector_space = VectorSpace(all_content, all_id)   
        with open(MODEL_PATH, 'wb') as writer:
            pickle.dump(vector_space, writer)
        print('Save built model......')

    return vector_space
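A minimal calling sketch for get_vsm; MODEL_PATH is assumed to be a module-level constant (its value below is illustrative), and the two arguments are parallel lists of document texts and ids:

MODEL_PATH = 'vsm_model.pkl'   # assumed constant read inside get_vsm
all_content = ['text of the first news item', 'text of the second news item']
all_id = ['News1.txt', 'News2.txt']
vector_space = get_vsm(all_content, all_id)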
def form(result):
    print('NewsID           Score')
    print('------------------------')
    for row in result:
        print(row)


if __name__ == '__main__':
    query = return_query().query.split(" ")
    print(query)
    document = {}

    directory_path = os.path.join(os.getcwd(), "EnglishNews")
    for txt in files_path:
        txt_path = os.path.join(directory_path, txt)
        with open(txt_path, "r") as f:
            document[txt] = f.read()

    vectorspace = VectorSpace(document)
    form(vectorspace.search(query))
    form(vectorspace.search_eul(query))
    form(vectorspace.search_tfidf(query))
    form(vectorspace.search_tfidf_eul(query))

    subqueryID = vectorspace.search_tfidf(query)[0][0]
    subqueryVector = vectorspace.buildSubVector(subqueryID)
    finalVector = np.array(vectorspace.buildQueryVector(
        query, method="1")) + subqueryVector
    form(vectorspace.search_nltk(finalVector))
Example #6
def main(query):

    #create vector space model instance
    vectorSpace = VectorSpace(documents)

    # calculate the different weighting/similarity combinations
    tf_cos = vectorSpace.TF_Cosine(query)
    tf_euclidean = vectorSpace.TF_Euclidean(query)
    tfidf_cos = vectorSpace.TFIDF_Cosine(query)
    tfidf_euclidean = vectorSpace.TFIDF_Euclidean(query)

    # sort and keep the top five scores (descending for similarities, ascending for distances)
    top5_tf_cos = sorted(list(zip(indexList, tf_cos)),
                         reverse=True,
                         key=lambda x: x[1])[:5]
    top5_tf_euclidean = sorted(list(zip(indexList, tf_euclidean)),
                               reverse=False,
                               key=lambda x: x[1])[:5]
    top5_tfidf_cos = sorted(list(zip(indexList, tfidf_cos)),
                            reverse=True,
                            key=lambda x: x[1])[:5]
    top5_tfidf_euclidean = sorted(list(zip(indexList, tfidf_euclidean)),
                                  reverse=False,
                                  key=lambda x: x[1])[:5]

    #print out the output
    print('Term Frequency Weighting + Cosine Similarity:')
    print_top(top5_tf_cos)

    print('Term Frequency Weighting + Euclidean Distance:')
    print_top(top5_tf_euclidean)

    print('TF-IDF Weighting + Cosine Similarity:')
    print_top(top5_tfidf_cos)

    print('TF-IDF Weighting + Euclidean Distance:')
    print_top(top5_tfidf_euclidean)

    #Relevance Feedback

    # fetch the top-ranked document from TF-IDF + cosine similarity for the given query
    indx_fb = indexList.index(top5_tfidf_cos[0][0])
    fb = documents[indx_fb]

    #the new query term weighting scheme is [1 * original query + 0.5 * feedback query]
    feedback_vector = vectorSpace.makeFeedbackVector(fb)
    query_vector = np.array(vectorSpace.makeTfIdfVector(query))
    rf_vector = query_vector + feedback_vector

    # evaluate the relevance vector with each document by tfidf + cosine similarity
    rf_tfidf_cos = []
    for documentTFIDFVector in vectorSpace.documentTFIDFVectors:
        rf_tfidf_cos.append(util.cosine(rf_vector, documentTFIDFVector))

    top5_rf_tfidf_cos = sorted(list(zip(indexList, rf_tfidf_cos)),
                               reverse=True,
                               key=lambda x: x[1])[:5]

    #print out the output
    print('Relevance Feedback + TF-IDF Weighting + Cosine Similarity:')
    print_top(top5_rf_tfidf_cos)
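The feedback step above follows the weighting scheme stated in the comment, new query = 1 * original query + 0.5 * feedback query, which is a Rocchio-style update without a negative term. A standalone sketch of that arithmetic on plain numpy vectors (the toy vectors are illustrative, not the class's internals):

import numpy as np

# toy tf-idf vectors over a four-term vocabulary (illustrative values)
query_vector = np.array([1.0, 0.0, 0.5, 0.0])
feedback_doc_vector = np.array([0.8, 0.4, 0.0, 0.2])

# new query = 1 * original query + 0.5 * feedback document
rf_vector = query_vector + 0.5 * feedback_doc_vector
print(rf_vector)  # [1.4 0.2 0.5 0.1]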
Example #7
def load_files_in_dir():
    path = os.path.join(os.getcwd(), 'EnglishNews')
    all_files = os.listdir(path)
    
    return all_files

def read_documents(file_list):
    documents = {}
    news_path = os.path.join(os.getcwd(), 'EnglishNews')
    for file in file_list:
        file_path = os.path.join(news_path, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            documents[file[:-4]] = f.read()
    
    return documents

def result(sorted_dic_10):
    print('\nNews ID                Score')
    print('------------           ------------')
    for key, value in sorted_dic_10.items():
        print(key, '           ', round(value, 7))

if __name__ == '__main__':
    
    query = get_query().query
    queryList = list(query.split(' '))
    print("my query is ", queryList)
    engNewsList = load_files_in_dir()
    documents = read_documents(engNewsList)
    vectorSpace_tf = VectorSpace(documents)
    
Example #8
def result(sorted_dic_10):
    print('\nNews ID                Score')
    print('------------           ------------')
    for key, value in sorted_dic_10.items():
        print(key, '           ', round(value, 7))


if __name__ == '__main__':

    query = get_query().query
    queryList = list(query.split(' '))
    print("my query is ", queryList)
    engNewsList = load_files_in_dir()
    documents = read_documents(engNewsList)
    vectorSpace_tf = VectorSpace(documents)

    #1-1
    print('')
    print('WSM Project 1: Ranking by Vector Space Models\n')
    print('1-1')
    print('-------------------------------------')
    print('Term Frequency Weighting + Cosine Similarity')
    sorted_ratings_1 = vectorSpace_tf.search(queryList)
    top_10_tf_cos = dict(list(sorted_ratings_1.items())[:10])
    result(top_10_tf_cos)

    #1-2
    print('')
    print('1-2')
    print('-------------------------------------')
Example #9
import os

from VectorSpace import VectorSpace
import pandas as pd
import numpy as np
import nltk
import util

file_path = "../documents"
files = os.listdir(file_path)

documents = []

for f in files:
    with open("../documents/{}".format(f)) as file:
        documents.append(file.read())

vectorSpace = VectorSpace(documents)

# tfidf + cos
print("TF-IDF Weighting + Cosine Similarity")

# calculate
scores = vectorSpace.search(["drill wood sharp"], "cos", idf=True)
# Indices of N largest elements in list
indices = np.argpartition(scores, -5)[-5:]

# save as (index, value)
d = {}
for i in indices:
    d[i] = scores[i]

# sort dict by value instead of key
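The snippet is cut off at this point; a plausible completion (an assumption, not the original code) sorts the (index, score) pairs by score in descending order and prints the matching filenames:

for idx, score in sorted(d.items(), key=lambda item: item[1], reverse=True):
    print(files[idx], "  %.6f" % score)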