def transAndSearch(query): idx, query = query.split(query[2],1) query = cleanContent(query) sys.stderr.write(query+'\n') transQuery = [] for q in query.split(): res = build_model.transliterate(q) transQuery.append(res[0]) transQuery = ' '.join(transQuery) sys.stderr.write(transQuery+'\n') res = queryDocs(transQuery)[:30] resDict = [] resDict.append(transQuery) fireFinal = [] rank = 1 maxScore = 0 for doc, score in res: resDict.append({'score': score, 'content': doc}) fireFinal.append({'score': score, 'doc': doc, 'rank': rank}) maxScore = max(maxScore, score) # print idx, 'Q0', doc, rank, score rank += 1 for x in fireFinal: print idx, 'Q0', x['doc'], x['rank'], x['score']/float(maxScore) sys.stdout.flush()
def POST(self): query = web.input().query query = cleanContent(query) print query transQuery = [] for q in query.split(): res = build_model.transliterate(q) transQuery.append(res[0]) # res = build_model.transliterateAll(q)[:3] # prescore = 0 # finalWord = res[0][0] # for w, _ in res: # print w # print globalDict.query(w) # for _foo in globalDict.query(w): # dicword, score = _foo # if score > prescore: # prescore = score # finalWord = dicword[0] # print finalWord # transQuery.append(finalWord) transQuery = ' '.join(transQuery) print transQuery res = queryDocs(transQuery)[:10] resDict = [] resDict.append(transQuery) for doc, score in res: resDict.append({'score': score, 'content': getDocumentFromIdx(doc)}) return json.dumps(resDict)
def POST(self):
    """Transliterate each token of the posted query and return an HTML
    snippet: original query, transliterations, and their scores, separated
    by <br /> tags."""
    query = web.input().query
    words = []
    scores = []
    for token in query.split():
        # NOTE: here transliterate() is unpacked as a (result, score) pair.
        translit, score = build_model.transliterate(token)
        words.append(translit.decode('utf-8'))
        scores.append(str(score))
    return u'%s<br />%s<br />%s' % (query, ' '.join(words), ' '.join(scores))
def processLine(line):
    """Tokenize one line of text.

    Each whitespace token is passed through filterWord(); tokens that are
    neither Hindi nor purely digits are lowercased and transliterated
    (falling back to the lowercased form when transliteration yields an
    empty string).  Tokens that end up empty are dropped.
    """
    contentWords = []
    for raw in line.split():
        word = filterWord(raw)
        if not (isHindi(word) or isDigits(word)):
            lowered = word.lower()
            translit = toUnicode(build_model.transliterate(lowered)[0])
            # Keep the lowercased original when transliteration produced
            # nothing.
            word = translit if len(translit) != 0 else lowered
        if word == u'':
            continue
        contentWords.append(word)
    return contentWords
from collections import defaultdict # documents = load_obj('../documents.dat') # graph = load_obj('graph.dat') globalDict = load_obj('../data/global_dict.dat') globalDictTitle = globalDict['title'] globalDict = globalDict['content'] idf = load_obj('../data/fire_docs/idf_fire.dat') # graph = graph['docGraph'] graphPath = '../data/fire_graph/graph' idxToFile = load_obj('../data/idx_to_file.dat') candidateTermsNum = 3 similarityThreshold = 0.7 intentWords = set() for intentW in [u'lyric', u'lyrics', u'review', u'reviews', u'movie', u'movies', u'song', u'songs']: intentWords.add(build_model.transliterate(intentW)[0].decode('utf-8')) # print 'Initialized..' documents = [] for i in range(65): sys.stderr.write('loading document %d\n' % i) documents.append(load_obj('../data/fire_docs/documents_fire.dat.%d'%i)) def getTitle(idx): docid = idx // 1000 docid += 1 return set(documents[docid][idx%1000]['title']) class dictDisk: def __init__(self, alt=""): self.cacheDict = {} self.cacheSize = 10000