def query(self, qs="", method="", method_arg="", limit="", sort_order=None, raw_results=False, **metadata):
    """Query the PhiloLogic database.

    Args:
        qs: word/phrase query string; empty means a metadata-only (or whole-corpus) query.
        method: search method name; an empty string defaults to "proxy".
        method_arg: method argument; strings are coerced to int, falling back to a
            per-method default (6 for "cooc"/"sentence", else 0) when non-numeric.
        limit: maximum hit count; strings are coerced to int, falling back to 10000000.
        sort_order: list of sort fields; None means the default ["rowid"] ordering.
        raw_results: passed through to HitList as ``raw``.
        **metadata: metadata constraints, each value a string or list of strings.

    Returns:
        A hitlist object (MetadataQuery/HitList/Query result), cached on disk
        under ``<path>/hitlists/<sha1>.hitlist`` keyed by the query parameters.
    """
    method = method or "proxy"
    if isinstance(method_arg, str):
        try:
            method_arg = int(method_arg)
        except ValueError:  # non-numeric: use the per-method default width
            if method == "cooc" or method == "sentence":
                method_arg = 6
            else:
                method_arg = 0
    if isinstance(limit, str):
        try:
            limit = int(limit)
        except ValueError:  # non-numeric: effectively unlimited
            limit = 10000000
    if sort_order is None:  # avoid a mutable default argument
        sort_order = ["rowid"]
    # The sha1 of path + metadata (+ query params below) names the on-disk cache file.
    hasher = hashlib.sha1()
    hasher.update(self.path.encode('utf8'))
    has_metadata = False
    corpus_file = None
    for key, value in list(metadata.items()):
        if isinstance(value, str):
            if value == "":
                pass
            else:
                value = [value]
        metadata[key] = value
        value = [v for v in value if v]
        if value:
            has_metadata = True
            key_value = "%s=%s" % (key, "|".join(value))
            hasher.update(key_value.encode('utf8'))
    if has_metadata:
        corpus_hash = hasher.hexdigest()
        corpus_file = self.path + "/hitlists/" + corpus_hash + ".hitlist"
        if not os.path.isfile(corpus_file):
            # Before we query, figure out what hierarchy level each parameter
            # belongs to, and sort them into one dict per level.
            metadata_dicts = [{} for level in self.locals["metadata_hierarchy"]]
            for k, v in list(metadata.items()):
                for i, params in enumerate(self.locals["metadata_hierarchy"]):
                    if v and (k in params):
                        metadata_dicts[i][k] = v
                        if k in self.locals["metadata_types"]:
                            this_type = self.locals["metadata_types"][k]
                            if this_type == "div":
                                # "div" fields can live at any of the div1/2/3 levels
                                metadata_dicts[i]["philo_type"] = ['"div"|"div1"|"div2"|"div3"']
                            else:
                                metadata_dicts[i]["philo_type"] = ['"%s"' % self.locals["metadata_types"][k]]
            metadata_dicts = [d for d in metadata_dicts if d]
            if "philo_id" in metadata:
                if metadata_dicts:
                    metadata_dicts[-1]["philo_id"] = metadata["philo_id"]
                else:
                    metadata_dicts.append({"philo_id": metadata["philo_id"]})
            corpus = MetadataQuery.metadata_query(self, corpus_file, metadata_dicts, sort_order)
        else:
            # Cache hit: load the existing hitlist from disk.
            if sort_order == ["rowid"]:
                sort_order = None
            corpus = HitList.HitList(corpus_file, 0, self, sort_order=sort_order, raw=raw_results)
            corpus.finish()
        if len(corpus) == 0:
            return corpus
    else:
        corpus = None
    if qs:
        hasher.update(qs.encode('utf8'))
        hasher.update(method.encode('utf8'))
        hasher.update(str(method_arg).encode('utf8'))
        hasher.update(str(limit).encode('utf8'))
        search_hash = hasher.hexdigest()
        search_file = self.path + "/hitlists/" + search_hash + ".hitlist"
        if sort_order == ["rowid"]:
            sort_order = None
        if not os.path.isfile(search_file):
            return Query.query(self, qs, corpus_file, self.width, method, method_arg, limit,
                               filename=search_file, sort_order=sort_order, raw_results=raw_results)
        else:
            # Cached word search: recompute words_per_hit from the parsed query
            # so the stored hitlist can be decoded correctly.
            parsed = QuerySyntax.parse_query(qs)
            grouped = QuerySyntax.group_terms(parsed)
            split = Query.split_terms(grouped)
            words_per_hit = len(split)
            return HitList.HitList(search_file, words_per_hit, self, sort_order=sort_order, raw=raw_results)
    else:
        if corpus:
            return corpus
        else:
            # No query and no metadata: return every object at the default level.
            return self.get_all(self.locals["default_object_level"], sort_order)
def query(self, qs="", method="", method_arg="", limit="", sort_order=None, **metadata):
    """Query the PhiloLogic database.

    Args:
        qs: word/phrase query string; empty means a metadata-only (or whole-corpus) query.
        method: search method name; an empty string defaults to "proxy".
        method_arg: method argument; strings are coerced to int, falling back to a
            per-method default (6 for "cooc"/"sentence", else 0) when non-numeric.
        limit: maximum hit count; strings are coerced to int, falling back to 10000000.
        sort_order: list of sort fields; None means the default ["rowid"] ordering.
        **metadata: metadata constraints, each value a string or list of strings.

    Returns:
        A hitlist object (MetadataQuery/HitList/Query result), cached on disk
        under ``<path>/hitlists/<sha1>.hitlist`` keyed by the query parameters.
    """
    method = method or "proxy"
    if isinstance(method_arg, str):
        try:
            method_arg = int(method_arg)
        except ValueError:  # non-numeric: use the per-method default width
            if method == "cooc" or method == "sentence":
                method_arg = 6
            else:
                method_arg = 0
    if isinstance(limit, str):
        try:
            limit = int(limit)
        except ValueError:  # non-numeric: effectively unlimited
            limit = 10000000
    if sort_order is None:  # avoid a mutable default argument
        sort_order = ["rowid"]
    # The sha1 of path + metadata (+ query params below) names the on-disk cache
    # file. sha1.update requires bytes on Python 3, so encode explicitly.
    hasher = hashlib.sha1()
    hasher.update(self.path.encode('utf8'))
    has_metadata = False
    corpus_file = None
    for key, value in list(metadata.items()):
        if isinstance(value, str):
            if value == "":
                pass
            else:
                value = [value]
        metadata[key] = value
        value = [v for v in value if v]
        if value:
            has_metadata = True
            hasher.update(("%s=%s" % (key, "|".join(value))).encode('utf8'))
    if has_metadata:
        corpus_hash = hasher.hexdigest()
        corpus_file = self.path + "/hitlists/" + corpus_hash + ".hitlist"
        if not os.path.isfile(corpus_file):
            # Before we query, figure out what hierarchy level each parameter
            # belongs to, and sort them into one dict per level.
            metadata_dicts = [{} for level in self.locals["metadata_hierarchy"]]
            for k, v in list(metadata.items()):
                for i, params in enumerate(self.locals["metadata_hierarchy"]):
                    if v and (k in params):
                        metadata_dicts[i][k] = v
                        if k in self.locals["metadata_types"]:
                            this_type = self.locals["metadata_types"][k]
                            if this_type == "div":
                                # "div" fields can live at any of the div1/2/3 levels
                                metadata_dicts[i]["philo_type"] = ['"div"|"div1"|"div2"|"div3"']
                            else:
                                metadata_dicts[i]["philo_type"] = ['"%s"' % self.locals["metadata_types"][k]]
            metadata_dicts = [d for d in metadata_dicts if d]
            if "philo_id" in metadata:
                if metadata_dicts:
                    metadata_dicts[-1]["philo_id"] = metadata["philo_id"]
                else:
                    metadata_dicts.append({"philo_id": metadata["philo_id"]})
            corpus = MetadataQuery.metadata_query(self, corpus_file, metadata_dicts, sort_order)
        else:
            # Cache hit: load the existing hitlist from disk.
            if sort_order == ["rowid"]:
                sort_order = None
            corpus = HitList.HitList(corpus_file, 0, self, sort_order=sort_order)
            corpus.finish()
        if len(corpus) == 0:
            return corpus
    else:
        corpus = None
    if qs:
        hasher.update(qs.encode('utf8'))
        hasher.update(method.encode('utf8'))
        hasher.update(str(method_arg).encode('utf8'))
        hasher.update(str(limit).encode('utf8'))
        search_hash = hasher.hexdigest()
        search_file = self.path + "/hitlists/" + search_hash + ".hitlist"
        if sort_order == ["rowid"]:
            sort_order = None
        if not os.path.isfile(search_file):
            return Query.query(self, qs, corpus_file, self.width, method, method_arg, limit,
                               filename=search_file, sort_order=sort_order)
        else:
            # Cached word search: recompute words_per_hit from the parsed query
            # so the stored hitlist can be decoded correctly.
            parsed = QuerySyntax.parse_query(qs)
            grouped = QuerySyntax.group_terms(parsed)
            split = Query.split_terms(grouped)
            words_per_hit = len(split)
            return HitList.HitList(search_file, words_per_hit, self, sort_order=sort_order)
    else:
        if corpus:
            return corpus
        else:
            # No query and no metadata: return every document-level object.
            return self.get_all("doc", sort_order)
def query(self, qs="", method="", method_arg=0, limit=10000000, **metadata):
    """Query the PhiloLogic database.

    Args:
        qs: word/phrase query string; empty means a metadata-only (or whole-corpus) query.
        method: search method name (hashed into the cache key; may be empty).
        method_arg: method argument, already numeric.
        limit: maximum hit count.
        **metadata: metadata constraints, each value a string or list of strings.

    Returns:
        A hitlist object (MetadataQuery/HitList/Query result), cached on disk
        under ``<path>/hitlists/<sha1>.hitlist`` keyed by the query parameters.
    """
    # The sha1 of path + metadata (+ query params below) names the on-disk cache
    # file. sha1.update requires bytes on Python 3, so encode explicitly.
    hasher = hashlib.sha1()
    hasher.update(self.path.encode('utf8'))
    has_metadata = False
    corpus_file = None
    for key, value in list(metadata.items()):
        if isinstance(value, str):
            if value == "":
                pass
            else:
                value = [value]
        metadata[key] = value
        value = [v for v in value if v]
        if value:
            has_metadata = True
            hasher.update(("%s=%s" % (key, "|".join(value))).encode('utf8'))
    if has_metadata:
        corpus_hash = hasher.hexdigest()
        corpus_file = self.path + "/hitlists/" + corpus_hash + ".hitlist"
        if not os.path.isfile(corpus_file):
            # Before we query, figure out what hierarchy level each parameter
            # belongs to, and sort them into one dict per level.
            metadata_dicts = [{} for level in self.locals["metadata_hierarchy"]]
            for k, v in list(metadata.items()):
                for i, params in enumerate(self.locals["metadata_hierarchy"]):
                    if v and (k in params):
                        metadata_dicts[i][k] = v
                        if k in self.locals["metadata_types"]:
                            this_type = self.locals["metadata_types"][k]
                            if this_type == "div":
                                # "div" fields can live at any of the div1/2/3 levels
                                metadata_dicts[i]["philo_type"] = ['"div"|"div1"|"div2"|"div3"']
                            else:
                                metadata_dicts[i]["philo_type"] = ['"%s"' % self.locals["metadata_types"][k]]
            metadata_dicts = [d for d in metadata_dicts if d]
            if "philo_id" in metadata:
                if metadata_dicts:
                    metadata_dicts[-1]["philo_id"] = metadata["philo_id"]
                else:
                    metadata_dicts.append({"philo_id": metadata["philo_id"]})
            corpus = MetadataQuery.metadata_query(self, corpus_file, metadata_dicts)
        else:
            # Cache hit: load the existing hitlist from disk.
            corpus = HitList.HitList(corpus_file, 0, self)
            corpus.finish()
        if len(corpus) == 0:
            return corpus
    else:
        corpus = None
    if qs:
        hasher.update(qs.encode('utf8'))
        hasher.update(method.encode('utf8'))
        hasher.update(str(method_arg).encode('utf8'))
        hasher.update(str(limit).encode('utf8'))
        search_hash = hasher.hexdigest()
        search_file = self.path + "/hitlists/" + search_hash + ".hitlist"
        if not os.path.isfile(search_file):
            return Query.query(self, qs, corpus_file, self.width, method, method_arg, limit,
                               filename=search_file)
        else:
            # Cached word search: recompute words_per_hit from the parsed query
            # so the stored hitlist can be decoded correctly.
            parsed = QuerySyntax.parse_query(qs)
            grouped = QuerySyntax.group_terms(parsed)
            split = Query.split_terms(grouped)
            words_per_hit = len(split)
            return HitList.HitList(search_file, words_per_hit, self)
    else:
        if corpus:
            return corpus
        else:
            # No query and no metadata: return every document-level object.
            return self.get_all("doc")