import numpy as np

import Unigram   # project-local module providing bitArrayGen
import min_hash  # project-local module providing minHash


def fuzzyQuery(query, index):
    """Find the keywords in index that are similar to query, using MinHash."""

    # column indices of keywords hashed into the same bucket as the query
    simKeywordsSeq = set()

    # build the input matrix: one bit array per word, with the query first
    mat = [Unigram.bitArrayGen(query)]
    for keyword in index:
        mat.append(Unigram.bitArrayGen(keyword))

    # transpose so each column represents one word
    mat = np.array(mat).T
    hashBucket = min_hash.minHash(mat, 20, 5)

    # column 0 is the query; collect every bucket that contains it
    for i in hashBucket:
        if 0 in hashBucket[i]:
            simKeywordsSeq.update(hashBucket[i])

    # drop the query's own column index
    simKeywordsSeq.remove(0)

    # map the remaining column indices back to keywords (offset by the query column)
    return [index[i - 1] for i in simKeywordsSeq]
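Unigram.bitArrayGen and min_hash.minHash are project-local helpers whose implementations are not shown here. The sketch below only illustrates the MinHash idea they appear to rely on (hashing each word's character-unigram set with several seeded hash functions and comparing signatures); the names char_unigrams, minhash_signature, and the seed handling are hypothetical, not the project's API.

import random

def char_unigrams(word):
    # hypothetical stand-in for Unigram.bitArrayGen: a word is represented
    # by the set of characters (unigrams) it contains
    return set(word)

def minhash_signature(items, seeds):
    # one MinHash value per seed: the minimum hash over the set's elements
    return [min(hash((seed, x)) for x in items) for seed in seeds]

def similarity(sig_a, sig_b):
    # fraction of matching signature positions approximates Jaccard similarity
    return sum(a == b for a, b in zip(sig_a, sig_b)) / float(len(sig_a))

seeds = [random.randrange(1 << 30) for _ in range(20)]
sig_q = minhash_signature(char_unigrams("querry"), seeds)
sig_k = minhash_signature(char_unigrams("query"), seeds)
print(similarity(sig_q, sig_k))   # close to 1.0 for similar spellings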
Example #2
import sys

import question as q
import unigram as u
import Unigram as U
import entropy as e

# unigram counts over the trainB corpus and their entropy
udic = U.getDictNew("trainB")
uentropy = e.entropy(udic)
print uentropy

result = []
text = [line.strip() for line in open("../data/trainB.txt")]
question = q.getDic()

for i in range(1, 505):
	print "question " + str(i)
	dic1 = {}   # last-word counts for lines where the question answers yes
	dic2 = {}   # last-word counts for lines where the question answers no
	count = 0
	c1 = 0
	c2 = 0
	if i in question and i != 146:   # question 146 is explicitly skipped
		for line in text:
			word = line.split(" ")
			sent = word[:-1]   # sentence context
			w = word[-1]       # word to predict
			count += 1
			if q.askQuestion(i, question[i], sent):
				c1 += 1
				if w in dic1:
					dic1[w] += 1   # truncated in the original; standard count update
				else:
					dic1[w] = 1
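e.entropy(udic) above comes from a project-local entropy module that is not shown. Assuming udic maps words to counts (as U.getDictNew suggests), a minimal Shannon-entropy sketch over such a count dictionary could look like the following; the function name and signature are assumptions, not the module's actual API.

import math

def shannon_entropy(counts):
    # H = -sum p * log2(p) over the distribution obtained by normalizing counts
    total = float(sum(counts.values()))
    return -sum((c / total) * math.log(c / total, 2)
                for c in counts.values() if c > 0)

print(shannon_entropy({"the": 5, "cat": 3, "sat": 2}))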
Example #3
import sys

import question as q
import unigram as u
import Unigram as U
import entropy as e

# unigram counts over the trainA corpus and their entropy
udic = U.getDictNew("trainA")
uentropy = e.entropy(udic)
print uentropy

result = []
text = [line.strip() for line in open("../data/trainA.txt")]
question = q.getDic()

print "*********************"
print uentropy
print "*********************"

for i in range(1, 505):
	print "question " + str(i)
	dic1 = {}   # last-word counts for lines where the question answers yes
	dic2 = {}   # last-word counts for lines where the question answers no
	count = 0
	c1 = 0
	c2 = 0
	if i in question:
		for line in text:
			word = line.split(" ")
			sent = word[:-1]   # sentence context
			w = word[-1]       # word to predict
			count += 1
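Both examples above count last-word frequencies separately for lines where q.askQuestion answers yes (dic1, c1) and no (dic2, c2). The snippets are truncated before those counts are used, but that split is the usual setup for comparing the unigram entropy against a conditional entropy per question. The sketch below shows that computation under this assumption, with hypothetical inputs; it is not the code the truncated examples actually contain.

import math

def shannon_entropy(counts):
    total = float(sum(counts.values()))
    return -sum((c / total) * math.log(c / total, 2)
                for c in counts.values() if c > 0)

def conditional_entropy(dic1, c1, dic2, c2):
    # weighted average of the entropies of the "yes" and "no" partitions
    total = float(c1 + c2)
    h = 0.0
    if c1:
        h += (c1 / total) * shannon_entropy(dic1)
    if c2:
        h += (c2 / total) * shannon_entropy(dic2)
    return h

# information gain of a question = H(word) - H(word | question's answer)
yes_counts = {"cat": 4, "dog": 1}
no_counts = {"cat": 1, "dog": 4}
print(conditional_entropy(yes_counts, 5, no_counts, 5))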