def linkStatQuery(self, linktype, links):
    """Build the packed statistics request for a list of links.

    The message starts with the command byte ``Q``, carries one ``T``
    record per link (packed ``linktype`` followed by the packed link
    title) and is terminated by ``N``.

    Args:
        linktype: Feature type sent with every link record.
        links: Iterable of objects exposing a ``title`` attribute.

    Returns:
        The request as a ``bytearray``.
    """
    # The initial buffer must be a bytes literal: on Python 3
    # bytearray("Q") raises TypeError (string argument without an encoding).
    statquery = bytearray(b"Q")
    for link in links:
        statquery.extend(b'T')
        statquery.extend(strusMessage.packString(linktype))
        statquery.extend(strusMessage.packString(link.title))
    statquery.extend(b'N')
    return statquery
def termStatQuery(self, terms):
    """Build the packed statistics request for a list of query terms.

    The message starts with the command byte ``Q``, carries one ``T``
    record per term (packed term type followed by the packed term value)
    and is terminated by ``N``.

    Args:
        terms: Iterable of objects exposing ``type`` and ``value``.

    Returns:
        The request as a ``bytearray``.
    """
    request = bytearray(b"Q")
    for entry in terms:
        request.extend(b'T')
        # Type first, then value: the order is part of the wire format.
        for field in (entry.type, entry.value):
            request.extend(strusMessage.packString(field))
    request.extend(b'N')
    return request
def evaluateQuery(self, scheme, querystruct, firstrank, nofranks, restrictdn,
                  with_debuginfo):
    """Evaluate a query on all storage servers and merge their results.

    Generator-style coroutine: it yields the futures returned by
    ``self.queryStatserver`` and ``self.issueQueries`` and delivers its
    result by raising ``tornado.gen.Return``.

    Args:
        scheme: Name of the query evaluation scheme, packed after ``M``.
        querystruct: Object exposing ``terms`` (each with ``type``,
            ``value``, ``length``) and ``links`` (each with ``title``,
            ``weight``).
        firstrank: Index of the first rank wanted by the caller.
        nofranks: Number of ranks wanted by the caller.
        restrictdn: Document number to restrict the query to; 0 means no
            restriction is sent.
        with_debuginfo: When true, the ``B`` flag is added to the request.

    Returns (via ``tornado.gen.Return``):
        ``[[], []]`` for a query without terms, otherwise whatever
        ``self.mergeQueryResults`` returns. On any exception the result is
        ``([], [message])``.
    """
    outcome = None
    try:
        # Every server is asked for the first (firstrank + nofranks)
        # results; the requested window is cut out when merging.
        result_limit = firstrank + nofranks
        query_terms = querystruct.terms
        if query_terms:
            # Get the global statistics:
            dflist, collectionsize, error = yield self.queryStatserver(
                self.termStatQuery(query_terms))
            if error is not None:
                raise Exception(error)

            # Assemble the query:
            request = bytearray(b"Q")
            request.extend(b"M")
            request.extend(strusMessage.packString(scheme))
            request.extend(b"S")
            request.extend(struct.pack(">q", collectionsize))
            request.extend(b"I")
            request.extend(struct.pack(">H", 0))
            request.extend(b"N")
            request.extend(struct.pack(">H", result_limit))
            if with_debuginfo:
                request.extend(b"B")
            if restrictdn != 0:
                request.extend(b"D")
                request.extend(struct.pack(">I", restrictdn))
            for position, term in enumerate(query_terms):
                request.extend(b"T")
                request.extend(strusMessage.packString(term.type))
                request.extend(strusMessage.packString(term.value))
                # A missing or zero length is sent as 1.
                request.extend(
                    struct.pack(">Hqd", term.length or 1, dflist[position],
                                1.0))
            for link in querystruct.links:
                request.extend(b"L")
                request.extend(strusMessage.packString("vectfeat"))
                request.extend(strusMessage.packString(link.title))
                request.extend(struct.pack(">d", link.weight))

            # Query all storage servers:
            replies = yield self.issueQueries(storageservers, scheme, request)
            outcome = self.mergeQueryResults(replies, firstrank, nofranks)
        else:
            # Return empty result for empty query:
            outcome = [[], []]
    except Exception as err:
        outcome = ([], ["error evaluation query: %s" % err])
    raise tornado.gen.Return(outcome)
def processCommand(message):
    """Evaluate one packed query request and pack the reply.

    The request must start with the command byte ``Q`` followed by tagged
    parameters: ``I`` first rank, ``N`` number of ranks, ``D`` document
    restriction, ``M`` scheme name, ``S`` collection size, ``T`` term,
    ``L`` link, ``B`` debug info flag.

    The reply is delivered by raising ``tornado.gen.Return``. On success it
    is ``Y``, then ``Z`` with the server number, then one ``_`` record per
    result. On any failure it is ``E`` followed by the error text.

    Args:
        message: The request as a bytes-like object.
    """
    rt = bytearray(b"Y")
    try:
        if not message:
            raise ValueError("empty request string")
        if message[0] != ord('Q'):
            raise Exception("unknown protocol command '%c'" % (message[0]))

        # QUERY:
        Term = collections.namedtuple(
            'Term', ['type', 'value', 'length', 'df', 'weight'])
        nofranks = 20
        restrictdn = 0
        collectionsize = 0
        firstrank = 0
        scheme = "BM25"
        terms = []
        links = []
        with_debuginfo = False

        # Build query to evaluate from the request:
        messagesize = len(message)
        messageofs = 1
        while messageofs < messagesize:
            tag = message[messageofs]
            if tag == ord('I'):
                (firstrank, ) = struct.unpack_from(">H", message,
                                                   messageofs + 1)
                messageofs += struct.calcsize(">H") + 1
            elif tag == ord('N'):
                (nofranks, ) = struct.unpack_from(">H", message,
                                                  messageofs + 1)
                messageofs += struct.calcsize(">H") + 1
            elif tag == ord('D'):
                (restrictdn, ) = struct.unpack_from(">I", message,
                                                    messageofs + 1)
                messageofs += struct.calcsize(">I") + 1
            elif tag == ord('M'):
                (scheme, messageofs) = strusMessage.unpackString(
                    message, messageofs + 1)
            elif tag == ord('S'):
                (collectionsize, ) = struct.unpack_from(">q", message,
                                                        messageofs + 1)
                messageofs += struct.calcsize(">q") + 1
            elif tag == ord('T'):
                (termtype, messageofs) = strusMessage.unpackString(
                    message, messageofs + 1)
                (value, messageofs) = strusMessage.unpackString(
                    message, messageofs)
                (length, df, weight) = struct.unpack_from(">Hqd", message,
                                                          messageofs)
                messageofs += struct.calcsize(">Hqd")
                terms.append(Term(termtype, value, length, df, weight))
            elif tag == ord('L'):
                (linktype, messageofs) = strusMessage.unpackString(
                    message, messageofs + 1)
                (value, messageofs) = strusMessage.unpackString(
                    message, messageofs)
                (weight, ) = struct.unpack_from(">d", message, messageofs)
                messageofs += struct.calcsize(">d")
                links.append(Term(linktype, value, 1, 0, weight))
            elif tag == ord('B'):
                messageofs += 1
                with_debuginfo = True
            else:
                # Must be an ordinary exception: tornado.gen.Return is itself
                # an Exception subclass, so raising it here would be caught
                # by the handler below and its payload wrapped a second time.
                raise ValueError("unknown parameter")

        def pack_weighted(tag, pairs):
            # Append one <tag><packed id><weight> entry per (id, weight) pair.
            for ident, pairweight in pairs:
                rt.extend(tag)
                rt.extend(strusMessage.packString(ident))
                rt.extend(struct.pack(">d", pairweight))

        trace_enabled = bool(with_debuginfo or debugtrace)
        if trace_enabled:
            backend.enableDebugTrace()
        try:
            doTitleSelect = isStopWordsOnlyQuery(terms, collectionsize)
            # ... if we have a query containing only stopwords, we reduce our
            # search space to the documents containing some query terms in
            # the title and the most referenced documents in the collection.

            # Evaluate query:
            restrictset = [] if restrictdn == 0 else [restrictdn]
            results = backend.evaluateQuery(scheme, doTitleSelect, terms,
                                            links, collectionsize, firstrank,
                                            nofranks, restrictset, debugtrace,
                                            with_debuginfo)

            # Build the result and pack it into the reply message for the
            # client:
            rt.extend(b'Z')
            rt.extend(struct.pack(">H", serverno))
            links_only = scheme in ("NBLNK", "TILNK", "VCLNK")
            for result in results:
                rt.extend(b'_')
                rt.extend(b'D')
                rt.extend(struct.pack(">I", result.docno))
                rt.extend(b'W')
                rt.extend(struct.pack(">d", result.weight))
                if links_only:
                    pack_weighted(b'L', result.links)
                elif scheme == "STDLNK":
                    pack_weighted(b'L', result.links)
                    pack_weighted(b'T', result.titles)
                    pack_weighted(b'F', result.features)
                else:
                    rt.extend(b'T')
                    rt.extend(strusMessage.packString(result.title))
                    if result.paratitle:
                        rt.extend(b'P')
                        rt.extend(strusMessage.packString(result.paratitle))
                    if result.debuginfo:
                        rt.extend(b'B')
                        rt.extend(strusMessage.packString(result.debuginfo))
                    rt.extend(b'A')
                    rt.extend(strusMessage.packString(result.abstract))
            if trace_enabled:
                backend.printDebugTrace()
        finally:
            # Switch tracing off on failure too, so one failing request does
            # not leave it enabled for later ones.
            if trace_enabled:
                backend.disableDebugTrace()
    except Exception as e:
        raise tornado.gen.Return(bytearray("E%s" % e, 'utf-8'))
    raise tornado.gen.Return(rt)
def analyzeQuery(self, scheme, querystr, nofranks):
    """Analyze a query string via the query analyze server.

    Generator-style coroutine: it yields the futures returned by
    ``msgclient.connect`` and ``msgclient.issueRequest`` and delivers its
    result by raising ``tornado.gen.Return``.

    The request is ``Q``, ``X`` + packed query string, ``N`` + number of
    ranks. The reply must start with ``Y`` and contain ``T`` records (query
    terms) and ``R`` records (related terms), each ended by ``_``.

    If the request or the parsing fails, the error is recorded and the
    terms come from the local ``analyzer`` instead.

    Args:
        scheme: Not used by this method.
        querystr: The query string to analyze.
        nofranks: Number of related terms requested from the server.

    Returns (via ``tornado.gen.Return``):
        ``QueryStruct(terms, [], relatedterms, errors)``.
    """
    format_error = "query analyze server result format error"
    terms = []
    relatedterms = []
    errors = []
    conn = None
    try:
        query = bytearray(b"Q")
        query.extend(b'X')
        query.extend(strusMessage.packString(querystr))
        query.extend(b'N')
        query.extend(struct.pack(">H", nofranks))

        ri = qryserver.rindex(':')
        host, port = qryserver[:ri], int(qryserver[ri + 1:])
        conn = yield msgclient.connect(host, port)
        reply = yield msgclient.issueRequest(conn, query)
        if not reply:
            raise Exception("protocol error in query analyze server query")
        if reply[0] == ord('E'):
            # Decode the payload so the message shows the server's text
            # rather than the repr of a bytes object.
            raise Exception("failed to query analyze server: %s"
                            % bytes(reply[1:]).decode('utf-8', 'replace'))
        elif reply[0] != ord('Y'):
            raise Exception("protocol error in query analyze server query")

        replyofs = 1
        replylen = len(reply)
        while replyofs < replylen:
            if reply[replyofs] == ord('T'):
                # Term record: T<type> V<value> L<length>, closed by '_'.
                replyofs += 1
                termtype = None
                value = None
                length = 1
                while replyofs < replylen:
                    tag = reply[replyofs]
                    if tag == ord('T'):
                        (termtype, replyofs) = strusMessage.unpackString(
                            reply, replyofs + 1)
                    elif tag == ord('V'):
                        (value, replyofs) = strusMessage.unpackString(
                            reply, replyofs + 1)
                    elif tag == ord('L'):
                        (length, ) = struct.unpack_from(">I", reply,
                                                        replyofs + 1)
                        replyofs += struct.calcsize(">I") + 1
                    elif tag == ord('_'):
                        replyofs += 1
                        break
                    else:
                        # An unknown tag never advances the offset, so
                        # without this the loop would spin forever.
                        raise Exception(format_error)
                terms.append(QueryTerm(termtype, value, length, 1.0))
            elif reply[replyofs] == ord('R'):
                # Related term record: V<value> I<index> W<weight>, closed
                # by '_'.
                replyofs += 1
                value = None
                index = -1
                weight = 0.0
                while replyofs < replylen:
                    tag = reply[replyofs]
                    if tag == ord('V'):
                        (value, replyofs) = strusMessage.unpackString(
                            reply, replyofs + 1)
                    elif tag == ord('I'):
                        (index, ) = struct.unpack_from(">I", reply,
                                                       replyofs + 1)
                        replyofs += struct.calcsize(">I") + 1
                    elif tag == ord('W'):
                        (weight, ) = struct.unpack_from(">d", reply,
                                                        replyofs + 1)
                        replyofs += struct.calcsize(">d") + 1
                    elif tag == ord('_'):
                        replyofs += 1
                        break
                    else:
                        raise Exception(format_error)
                if value is None:
                    raise Exception(format_error)
                valuestr = value.replace('_', ' ')
                # Skip a related term that only repeats the query itself.
                if valuestr.lower() != querystr.lower():
                    encvalue = urllib.parse.quote(valuestr)
                    relatedterms.append(
                        RelatedTerm(valuestr, encvalue, index, weight))
            else:
                break
        if replyofs != replylen:
            raise Exception(format_error)
        conn.close()
    except Exception as e:
        errors.append("query analyze server request failed: %s" % e)
        if conn:
            conn.close()
        # Drop terms parsed before the failure: the local analysis covers
        # the whole query, so keeping them would duplicate terms.
        alt_terms = analyzer.analyzeTermExpression(["text", querystr])
        terms = [
            QueryTerm(term.type, term.value, term.length, 1.0)
            for term in alt_terms
        ]
    raise tornado.gen.Return(QueryStruct(terms, [], relatedterms, errors))
def processCommand(message):
    """Analyze one packed query request and pack the reply.

    The request must start with the command byte ``Q`` followed by tagged
    parameters: ``X`` + packed query string and ``N`` + number of related
    terms wanted.

    The reply is delivered by raising ``tornado.gen.Return``. On success it
    is ``Y``, then one ``T`` record per term (``T`` type, ``V`` value,
    optional ``L`` length, ``_``) and one ``R`` record per related term
    (``V`` value, ``I`` index, ``W`` weight, ``_``). On any failure it is
    ``E`` followed by the error text.

    Args:
        message: The request as a bytes-like object.
    """
    rt = bytearray(b"Y")
    try:
        if debugtrace:
            strusctx.enableDebugTrace("analyzer")
        # Errors are raised as ordinary exceptions: tornado.gen.Return is
        # itself an Exception subclass, so raising it inside this try block
        # would be caught below and its payload wrapped a second time.
        if not message:
            raise ValueError("empty request string")
        if message[0] != ord('Q'):
            raise Exception("unknown protocol command '%c'" % (message[0]))

        # Build query to evaluate from the request:
        querystr = None
        nofranks = None
        messagesize = len(message)
        messageofs = 1
        while messageofs < messagesize:
            tag = message[messageofs]
            if tag == ord('N'):
                (nofranks, ) = struct.unpack_from(">H", message,
                                                  messageofs + 1)
                messageofs += struct.calcsize(">H") + 1
            elif tag == ord('X'):
                (querystr, messageofs) = strusMessage.unpackString(
                    message, messageofs + 1)
            else:
                raise ValueError("unknown parameter")
        if querystr is None:
            raise ValueError("missing query string")

        # Analyze query:
        relatedlist = []
        terms = analyzer.analyzeTermExpression([["text", querystr],
                                                ["seltext", querystr]])

        # Extract vectors referenced:
        # Slicing instead of indexing keeps an empty value from raising.
        f_indices = [
            int(term.value[1:]) for term in terms if term.value[:1] == 'F'
        ]

        # Build real list of features for retrieval in the searchindex:
        pos2term = {}
        pos = 0
        for term in terms:
            if term.type != "selstem":
                if term.length and term.length > 1:
                    pos2term[pos] = AnalyzerTerm(term.type, term.value,
                                                 term.length)
                    pos += term.length
                elif term.type == "stem":
                    pos2term[pos] = AnalyzerTerm(term.type, term.value, 1)
                    pos += 1
        pos = 0
        for term in terms:
            if term.type == "selstem":
                # Fill only the positions the first pass left empty.
                if pos not in pos2term:
                    pos2term[pos] = AnalyzerTerm("stem", term.value, 1)
                pos += 1
        # NOTE(review): this keeps the dict's insertion order, not ascending
        # position order - confirm which order the search index expects.
        terms = list(pos2term.values())

        # Calculate nearest neighbours of vectors extracted:
        if f_indices:
            if nofranks is None:
                raise ValueError("missing number of ranks")
            vec = vecstorage.featureVector(f_indices[0])
            if len(f_indices) > 1:
                # Several features: search around the sum of their vectors.
                for nextidx in f_indices[1:]:
                    vec = [
                        v + i for v, i in zip(
                            vec, vecstorage.featureVector(nextidx))
                    ]
                neighbour_ranklist = vecsearcher.findSimilar(vec, nofranks)
            else:
                # Single feature: search only among the features that share
                # a concept with it.
                neighbour_set = set()
                for concept in vecstorage.featureConcepts("", f_indices[0]):
                    neighbour_set.update(
                        vecstorage.conceptFeatures("", concept))
                neighbour_ranklist = vecsearcher.findSimilarFromSelection(
                    list(neighbour_set), vec, nofranks)
            for neighbour in neighbour_ranklist:
                fname = vecstorage.featureName(neighbour.featidx)
                relatedlist.append(
                    RelatedTerm(fname, neighbour.featidx, neighbour.weight))

        # Build the result and pack it into the reply message for the client:
        for term in terms:
            rt.extend(b'T')
            rt.extend(b'T')
            rt.extend(strusMessage.packString(term.type))
            rt.extend(b'V')
            rt.extend(strusMessage.packString(term.value))
            if term.length:
                rt.extend(b'L')
                rt.extend(struct.pack(">I", term.length))
            rt.extend(b'_')
        for related in relatedlist:
            rt.extend(b'R')
            rt.extend(b'V')
            rt.extend(strusMessage.packString(related.value))
            rt.extend(b'I')
            rt.extend(struct.pack(">I", related.index))
            rt.extend(b'W')
            rt.extend(struct.pack(">d", related.weight))
            rt.extend(b'_')
    except Exception as e:
        # Single place where tracing is switched off on the error path.
        if debugtrace:
            strusctx.disableDebugTrace("analyzer")
        raise tornado.gen.Return(bytearray("E%s" % e, 'utf-8'))
    if debugtrace:
        dumpDebugTrace(strusctx.fetchDebugTrace(), "", 5)
        strusctx.disableDebugTrace("analyzer")
    raise tornado.gen.Return(rt)