class QuerySearch(object):
    """Rank documents for prefix queries using summed tf-idf scores.

    Queries have the form ``TAG:stem*`` or just ``stem*``.  Everything after
    the first ``*`` is ignored, and the stem is matched as a *prefix* of the
    keys in ``self.TfidfObj.data``.  A tag restricts the match to keys that
    carry one of the prefixes listed in ``_TAG_PREFIXES``.

    ``self.TfidfObj`` must expose ``data`` (a mapping from string key to an
    iterable of document ids) and ``tfidf(key, docid)`` (a number).
    """

    # Query tag -> key prefixes searched for that tag.  'N' covers all three.
    _TAG_PREFIXES = {
        'P': ('person_',),
        'O': ('org_',),
        'L': ('loc_',),
        'N': ('person_', 'loc_', 'org_'),
    }

    def __init__(self, posting_list_path, data_path):
        """Store both paths and build the ``Tfidf`` helper from them."""
        super(QuerySearch, self).__init__()
        self.posting_list_path = posting_list_path
        self.data_path = data_path
        self.TfidfObj = Tfidf(self.posting_list_path, self.data_path)

    @staticmethod
    def _split_query(query):
        """Return ``(tag, stem)``; ``tag`` is None when the query has no ':'."""
        parts = query.split(':')
        # Only the text before the first '*' of the last ':'-field is used.
        stem = parts[-1].split('*')[0]
        tag = parts[0] if len(parts) > 1 else None
        return tag, stem

    def _matching_postings(self, prefixes):
        """Return the entries of ``TfidfObj.data`` whose key starts with any prefix.

        ``prefixes`` is a tuple; an empty tuple matches nothing.
        """
        return {
            key: val for key, val in self.TfidfObj.data.items()
            if key.startswith(prefixes)
        }

    def find_best_doc(self, data_dict, num_docs_retrieval=2):
        """Return the highest-scoring documents for the given postings.

        Args:
            data_dict: mapping ``key -> iterable of docids``.
            num_docs_retrieval: number of results to keep (used as a slice
                bound on the sorted result list).

        Returns:
            A list of ``(docid, score)`` tuples sorted by descending score,
            where ``score`` is the sum of ``TfidfObj.tfidf(key, docid)`` over
            every key that lists the document.  Documents with equal scores
            keep the order in which they were first seen.
        """
        scores = {}
        for key in data_dict:
            for docid in data_dict[key]:
                value = self.TfidfObj.tfidf(key, docid)
                # Explicit membership test instead of a bare ``except`` so
                # that real errors raised by ``tfidf`` are not swallowed.
                if docid in scores:
                    scores[docid] += value
                else:
                    scores[docid] = value

        # ``reverse=True`` keeps the sort stable, so ties stay in first-seen order.
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        return ranked[:num_docs_retrieval]

    def search_query(self, query, num_docs_retrieval=2):
        """Rank documents for a single query such as ``N:new*`` or ``new*``.

        A query without ':' is always an untagged prefix search, even when
        its text happens to equal a tag letter (e.g. ``"P"``).  A query with
        an unrecognised tag falls back to an untagged search on its stem.

        Args:
            query: the raw query string.
            num_docs_retrieval: number of results to return (default 2).

        Returns:
            A list of ``(docid, score)`` tuples, best first.
        """
        tag, stem = self._split_query(query)
        # Unknown or missing tag -> match the bare stem.
        heads = self._TAG_PREFIXES.get(tag, ('',))
        prefixes = tuple(head + stem for head in heads)
        return self.find_best_doc(
            self._matching_postings(prefixes),
            num_docs_retrieval=num_docs_retrieval,
        )

    def search_queries(self, queries, num_docs_retrieval=2):
        """Rank documents for the union of several queries.

        Each query is parsed like in ``search_query`` with one difference
        that is kept for backward compatibility: a query that contains ':'
        but uses an unrecognised tag contributes no keys at all.

        Args:
            queries: iterable of raw query strings.
            num_docs_retrieval: number of results to return (default 2).

        Returns:
            A list of ``(docid, score)`` tuples, best first.
        """
        merged = {}
        for query in queries:
            tag, stem = self._split_query(query)
            if tag is None:
                heads = ('',)
            else:
                heads = self._TAG_PREFIXES.get(tag, ())
            matches = self._matching_postings(
                tuple(head + stem for head in heads))
            # Keys of the newest query come first and earlier results win on
            # duplicates; this fixes the iteration (and tie-break) order.
            matches.update(merged)
            merged = matches
        return self.find_best_doc(merged, num_docs_retrieval=num_docs_retrieval)
                exit()
            else:
                print "language error!"
                exit()
            tfidfText.append(text1)
            vsText.append(text2)
    return tfidfText, vsText, np.array(polarities), categories
    
# Script entry point: read a train and a test corpus, build tf-idf and
# "VS" polarity features for both, combine them and run feature selection.
# NOTE(review): uses Python 2 print statements.
if __name__ == "__main__":
    # Three command-line arguments are required; otherwise print usage and quit.
    if len(sys.argv) < 4:
        print "sys.argv[1]: input train corpus"
        print "sys.argv[2]: input test corpus"
        print "sys.argv[3]: corpus language, 'en' for English and 'zh-CN' for Chinese"
        exit()
    
    tfidfInstance = Tfidf()
    sentimentInstance = Sentiment()
    
    # readData is called with (corpus path, language) and unpacked into four
    # values; the *Category results are not used in the lines visible here.
    trCorpus1, trCorpus2, trPolarity, trCategory = readData(sys.argv[1], sys.argv[3])
    teCorpus1, teCorpus2, tePolarity, teCategory = readData(sys.argv[2], sys.argv[3])

    # The first corpus of each pair feeds tf-idf (train and test in one call),
    # the second feeds VSPolarity (one call per corpus).
    trainTfidf, testTfidf = tfidfInstance.tfidf(trCorpus1, teCorpus1)
    trainVS = sentimentInstance.VSPolarity(trCorpus2)
    testVS = sentimentInstance.VSPolarity(teCorpus2)
    # Merge both feature sets per split; presumably a column-wise stack —
    # TODO confirm against combineFeature.
    trainMatrix = combineFeature(trainTfidf, trainVS)
    testMatrix = combineFeature(testTfidf, testVS)
    
    # Echo the test corpus path.
    print sys.argv[2]
    # Feature selection receives the polarities of both splits and returns the
    # two reduced matrices.
    trainMatrix, testMatrix = featureSelection(trainMatrix, trPolarity, testMatrix, tePolarity)

Exemple #3
0
                exit()
            tfidfText.append(text1)
            vsText.append(text2)
    return tfidfText, vsText, np.array(polarities), categories


# Script entry point: read a train and a test corpus, build tf-idf and
# "VS" polarity features for both, combine them and run feature selection.
# NOTE(review): uses Python 2 print statements; the script may continue past
# the lines visible here.
if __name__ == "__main__":
    # Three command-line arguments are required; otherwise print usage and quit.
    if len(sys.argv) < 4:
        print "sys.argv[1]: input train corpus"
        print "sys.argv[2]: input test corpus"
        print "sys.argv[3]: corpus language, 'en' for English and 'zh-CN' for Chinese"
        exit()

    tfidfInstance = Tfidf()
    sentimentInstance = Sentiment()

    # readData is called with (corpus path, language) and unpacked into four
    # values; the *Category results are not used in the lines visible here.
    trCorpus1, trCorpus2, trPolarity, trCategory = readData(
        sys.argv[1], sys.argv[3])
    teCorpus1, teCorpus2, tePolarity, teCategory = readData(
        sys.argv[2], sys.argv[3])

    # The first corpus of each pair feeds tf-idf (train and test in one call),
    # the second feeds VSPolarity (one call per corpus).
    trainTfidf, testTfidf = tfidfInstance.tfidf(trCorpus1, teCorpus1)
    trainVS = sentimentInstance.VSPolarity(trCorpus2)
    testVS = sentimentInstance.VSPolarity(teCorpus2)
    # Merge both feature sets per split; presumably a column-wise stack —
    # TODO confirm against combineFeature.
    trainMatrix = combineFeature(trainTfidf, trainVS)
    testMatrix = combineFeature(testTfidf, testVS)

    # Echo the test corpus path.
    print sys.argv[2]
    # Feature selection receives the polarities of both splits and returns the
    # two reduced matrices.
    trainMatrix, testMatrix = featureSelection(trainMatrix, trPolarity,
                                               testMatrix, tePolarity)