def fuzzyQuery(query, index):
    """Find the words in index that are similar to query.

    One bit array is generated per word with ``Unigram.bitArrayGen`` (the
    query first, then every keyword of ``index`` in order). The arrays are
    stacked, transposed so that each word is a column, and handed to
    ``min_hash.minHash``. Every keyword that shares at least one hash
    bucket with the query column is reported as similar.

    Args:
        query: the word to look up.
        index: sequence of candidate keywords; it is iterated once and then
            indexed by position.

    Returns:
        A list of the keywords from ``index`` that share a bucket with the
        query, ordered by their position in ``index``. The list is empty
        when no keyword collides with the query.
    """
    # Row 0 is the query; row k (k >= 1) corresponds to index[k - 1].
    mat = [Unigram.bitArrayGen(query)]
    mat.extend(Unigram.bitArrayGen(keyword) for keyword in index)

    # Transpose so that every word becomes one column of the input matrix.
    mat = np.array(mat).T
    # NOTE(review): 20 and 5 are passed straight through to minHash;
    # presumably signature length and band count -- confirm against min_hash.
    hashBucket = min_hash.minHash(mat, 20, 5)

    # Collect the column numbers of every bucket that contains the query
    # (column 0).
    simKeywordsSeq = set()
    for i in hashBucket:
        if 0 in hashBucket[i]:
            simKeywordsSeq.update(hashBucket[i])

    # Drop the query's own column. discard() instead of remove(): if the
    # query did not land in any bucket the set is empty, and remove(0) would
    # raise KeyError instead of reporting "no similar keywords".
    simKeywordsSeq.discard(0)

    # Column k maps back to index[k - 1]; sort so the result order is
    # deterministic rather than dependent on set iteration order.
    return [index[i - 1] for i in sorted(simKeywordsSeq)]
import sys import question as q import unigram as u import Unigram as U import entropy as e udic = U.getDictNew("trainB") uentropy=e.entropy(udic) print uentropy result=[] text = [line.strip() for line in open("../data/trainB.txt")] question=q.getDic() for i in range(1, 505): print "question "+str(i) #data=[] dic1={} dic2={} count=0 c1=0 c2=0 if i in question and i!=146: for line in text: word = line.split(" ") sent=word[:-1] w=word[-1] #for j in range(len(word)): count+=1 #data.append((sent, word[i])) if q.askQuestion(i, question[i], sent): c1+=1 if w in dic1:
import sys import question as q import unigram as u import Unigram as U import entropy as e udic = U.getDictNew("trainA") uentropy=e.entropy(udic) print uentropy result=[] text = [line.strip() for line in open("../data/trainA.txt")] question=q.getDic() print "*********************" print uentropy print "*********************" for i in range(1, 505): print "question "+str(i) #data=[] dic1={} dic2={} count=0 c1=0 c2=0 if i in question: for line in text: word = line.split(" ") sent=word[:-1] w=word[-1] #for j in range(len(word)): count+=1 #data.append((sent, word[i]))