Beispiel #1
0
    def word_search(self,qs,*metadata_dicts,**options):
        """Run a word search for ``qs``, optionally limited by metadata, with caching.

        A SHA-1 digest of the database path, the query string, the ``method``,
        ``method_arg`` and ``limit`` options and every metadata key/value pair
        names the cache files under /var/lib/philologic/hitlists/.

        Args:
            qs: query string; its number of space-separated terms is used as
                the hit width when a cached hitlist is reopened.
            *metadata_dicts: dicts of metadata criteria, forwarded unchanged
                to ``self.toms.compound_query``.
            **options: forwarded to ``Query.query``; ``method``,
                ``method_arg`` and ``limit`` also take part in the cache key.

        Returns:
            A ``HitList.HitList`` when the search is already cached, an empty
            list when the metadata criteria match no object, otherwise
            whatever ``Query.query`` returns.
        """
        def _as_bytes(value):
            # hashlib only digests byte strings. Non-string options (e.g. an
            # integer method_arg or limit) are stringified first instead of
            # raising TypeError; byte strings pass through untouched so keys
            # computed for plain string input do not change.
            if not isinstance(value, (bytes, type(u""))):
                value = str(value)
            if isinstance(value, bytes):
                return value
            return value.encode("utf8")

        hitlist_dir = "/var/lib/philologic/hitlists/"

        digest = hashlib.sha1()
        key_parts = (
            self.path,
            qs,
            options.get("method",""),
            options.get("method_arg",""),
            options.get("limit",""),
        )
        for part in key_parts:
            digest.update(_as_bytes(part))
        for metadata_level in metadata_dicts:
            for k,v in metadata_level.items():
                digest.update(_as_bytes(k))
                digest.update(_as_bytes(v))
        hex_hash = digest.hexdigest()

        hfile = hitlist_dir + hex_hash + ".hitlist"
        words_per_hit = len(qs.split(" "))
        if os.path.isfile(hfile):
            # Same key already searched: reopen the stored hitlist.
            return HitList.HitList(hfile,words_per_hit)

        corpus_file = None
        if metadata_dicts:
            corpus_file = hitlist_dir + hex_hash + ".corpus"
            corpus_count = 0
            # "with" closes the file even if the metadata query or the
            # packing of a malformed philo_id raises.
            with open(corpus_file,"wb") as corpus_fh:
                for c_obj in self.toms.compound_query(*metadata_dicts):
                    # "=7i" needs exactly seven integers per philo_id.
                    c_id = [int(x) for x in c_obj["philo_id"].split(" ")]
                    corpus_fh.write(struct.pack("=7i",*c_id))
                    corpus_count += 1
            if corpus_count == 0:
                return []
        return Query.query(self.path,qs,corpus_file,self.width,filename=hfile,**options)
Beispiel #2
0
 def query(self,qs,method=None,method_arg=0,**metadata):
     """Search for ``qs``, optionally limited by metadata, with on-disk caching.

     A SHA-1 digest of the database path, ``qs``, ``method``, ``method_arg``
     and the metadata key/value pairs names the cache files under
     /var/lib/philologic/hitlists/. Progress messages go to stderr.

     Args:
         qs: query string; its number of space-separated terms is used as
             the hit width when a cached hitlist is reopened.
         method: search method forwarded to ``Query.query``.
         method_arg: argument for ``method``, forwarded to ``Query.query``.
         **metadata: metadata criteria forwarded to ``self.toms.query``.

     Returns:
         A ``HitList.HitList`` when the search is already cached, an empty
         list when the metadata criteria match no object, otherwise whatever
         ``Query.query`` returns.
     """
     def _as_bytes(value):
         # hashlib only digests byte strings. A non-zero integer method_arg
         # used to raise TypeError here; it is now stringified. Byte strings
         # pass through untouched, so existing cache keys are unchanged.
         if not isinstance(value, (bytes, type(u""))):
             value = str(value)
         if isinstance(value, bytes):
             return value
         return value.encode("utf8")

     def _log(message):
         sys.stderr.write(message + "\n")

     hitlist_dir = "/var/lib/philologic/hitlists/"
     hashable = (qs,method,method_arg,tuple(metadata.items()))

     digest = hashlib.sha1()
     for part in (self.path, qs, method or "", method_arg or ""):
         digest.update(_as_bytes(part))
     for key,value in metadata.items():
         digest.update(_as_bytes(key))
         digest.update(_as_bytes(value))
     hex_hash = digest.hexdigest()
     _log("%s hashes to %s" % (hashable,hex_hash))

     # Check whether this exact query has been cached already.
     hfile = hitlist_dir + hex_hash + ".hitlist"
     words_per_hit = len(qs.split(" "))
     if os.path.isfile(hfile):
         _log("%s cached already" % (hashable,))
         return HitList.HitList(hfile,words_per_hit)

     corpus_file = None
     corpus_count = 0
     _log("metadata = %s" % repr(metadata))
     if metadata:
         corpus_file = hitlist_dir + hex_hash + ".corpus"
         # "with" closes the file even if the metadata query or the packing
         # of a malformed philo_id raises.
         with open(corpus_file,"wb") as corpus_fh:
             for c_obj in self.toms.query(**metadata):
                 # "=7i" needs exactly seven integers per philo_id.
                 c_id = [int(x) for x in c_obj["philo_id"].split(" ")]
                 corpus_fh.write(struct.pack("=7i",*c_id))
                 corpus_count += 1
         if corpus_count == 0:
             return []
     _log("%d metadata objects" % corpus_count)
     return Query.query(self.path,qs,corpus_file,self.width,method,method_arg,filename=hfile)
Beispiel #3
0
    def query(self,
              qs="",
              method="",
              method_arg=0,
              limit=10000000,
              **metadata):
        """Query the PhiloLogic database, reusing cached hitlists when present.

        The non-empty metadata criteria are hashed together with ``self.path``
        to name a corpus hitlist; when ``qs`` is given, ``qs``, ``method``,
        ``method_arg`` and ``limit`` are added to the same digest to name the
        search hitlist. Both live under /var/lib/philologic/hitlists/.

        Args:
            qs: query string; empty for a metadata-only query.
            method: search method forwarded to ``Query.query``.
            method_arg: argument for ``method``.
            limit: hit limit forwarded to ``Query.query``.
            **metadata: field name -> string or list of strings. A non-empty
                string is wrapped in a list (and stored back into
                ``metadata``); empty values are ignored.

        Returns:
            The empty corpus when the metadata criteria match nothing; the
            result of ``Query.query`` or a cached ``HitList.HitList`` when
            ``qs`` is given; the corpus for a metadata-only query; otherwise
            ``self.get_all("doc")``.
        """
        # Matches ``str`` and, on Python 2, ``unicode`` as well, so a unicode
        # value is wrapped in a list instead of being split into characters.
        text_types = (str, type(u""))

        def _as_bytes(text):
            # hashlib digests bytes only; byte strings pass through untouched
            # so cache keys for plain string input do not change.
            return text if isinstance(text, bytes) else text.encode("utf8")

        hitlist_dir = "/var/lib/philologic/hitlists/"
        digest = hashlib.sha1()
        digest.update(_as_bytes(self.path))
        has_metadata = False
        corpus_file = None
        corpus = None

        for key, value in list(metadata.items()):
            if isinstance(value, text_types) and value != "":
                value = [value]
                metadata[key] = value
            value = [v for v in value if v]
            if value:
                has_metadata = True
                digest.update(_as_bytes("%s=%s" % (key, "|".join(value))))

        if has_metadata:
            corpus_file = hitlist_dir + digest.hexdigest() + ".hitlist"
            if os.path.isfile(corpus_file):
                corpus = HitList.HitList(corpus_file, 0, self)
                corpus.finish()
            else:
                # Before we query, figure out which level of the hierarchy
                # each parameter belongs to and sort the parameters into one
                # dictionary per level.
                hierarchy = self.locals["metadata_hierarchy"]
                metadata_dicts = [{} for _ in hierarchy]
                for k, v in metadata.items():
                    if not v:
                        continue
                    for i, params in enumerate(hierarchy):
                        if k not in params:
                            continue
                        metadata_dicts[i][k] = v
                        if k in self.locals["metadata_types"]:
                            this_type = self.locals["metadata_types"][k]
                            if this_type == "div":
                                philo_type = '"div"|"div1"|"div2"|"div3"'
                            else:
                                philo_type = '"%s"' % this_type
                            metadata_dicts[i]["philo_type"] = [philo_type]
                metadata_dicts = [d for d in metadata_dicts if d]
                corpus = MetadataQuery.metadata_query(self, corpus_file,
                                                      metadata_dicts)
            sys.stderr.write("corpus file of length %d\n" % len(corpus))
            if len(corpus) == 0:
                return corpus

        if not qs:
            if corpus:
                return corpus
            return self.get_all("doc")

        words_per_hit = len(qs.split(" "))
        # The digest already holds the metadata criteria, so the search key
        # depends on both the corpus and the search parameters.
        for part in (qs, method, str(method_arg), str(limit)):
            digest.update(_as_bytes(part))
        search_file = hitlist_dir + digest.hexdigest() + ".hitlist"
        if os.path.isfile(search_file):
            return HitList.HitList(search_file, words_per_hit, self)
        return Query.query(self,
                           qs,
                           corpus_file,
                           self.width,
                           method,
                           method_arg,
                           limit,
                           filename=search_file)
Beispiel #4
0
    def query(self, qs="", method="", method_arg="", limit="", sort_order=["rowid"], **metadata):
        """Query the PhiloLogic database, reusing cached hitlists when present.

        The non-empty metadata criteria are hashed together with ``self.path``
        to name a corpus hitlist; when ``qs`` is given, ``qs``, ``method``,
        ``method_arg`` and ``limit`` are added to the same digest to name the
        search hitlist. Both live under ``self.path + "/hitlists/"``.

        Args:
            qs: query string; empty for a metadata-only query.
            method: search method; an empty value means "proxy".
            method_arg: argument for ``method``. A string that is not an
                integer becomes 6 for "cooc"/"sentence" and 0 otherwise.
            limit: hit limit. A string that is not an integer becomes
                10000000.
            sort_order: sort keys. ``["rowid"]`` is replaced by ``None``
                before it is handed to ``HitList.HitList`` / ``Query.query``.
                The default list is only compared and rebound, never mutated.
            **metadata: field name -> string or list of strings. A non-empty
                string is wrapped in a list (and stored back into
                ``metadata``); empty values are ignored.

        Returns:
            The empty corpus when the metadata criteria match nothing; the
            result of ``Query.query`` or a cached ``HitList.HitList`` when
            ``qs`` is given; the corpus for a metadata-only query; otherwise
            ``self.get_all("doc", sort_order)``.
        """
        # Matches ``str`` and, on Python 2, ``unicode`` as well.
        text_types = (str, type(u""))

        def _as_bytes(text):
            # hashlib digests bytes only; byte strings pass through untouched
            # so cache keys for plain string input do not change.
            return text if isinstance(text, bytes) else text.encode("utf8")

        method = method or "proxy"
        if isinstance(method_arg, text_types):
            try:
                method_arg = int(method_arg)
            except ValueError:
                # Only a failed conversion selects the default; anything else
                # (e.g. KeyboardInterrupt) must propagate.
                method_arg = 6 if method in ("cooc", "sentence") else 0

        if isinstance(limit, text_types):
            try:
                limit = int(limit)
            except ValueError:
                limit = 10000000

        hitlist_dir = self.path + "/hitlists/"
        digest = hashlib.sha1()
        digest.update(_as_bytes(self.path))
        has_metadata = False
        corpus_file = None
        corpus = None

        for key, value in list(metadata.items()):
            if isinstance(value, text_types) and value != "":
                value = [value]
                metadata[key] = value
            value = [v for v in value if v]
            if value:
                has_metadata = True
                digest.update(_as_bytes("%s=%s" % (key, "|".join(value))))

        if has_metadata:
            corpus_file = hitlist_dir + digest.hexdigest() + ".hitlist"
            if os.path.isfile(corpus_file):
                if sort_order == ["rowid"]:
                    sort_order = None
                corpus = HitList.HitList(corpus_file, 0, self, sort_order=sort_order)
                corpus.finish()
            else:
                # Before we query, figure out which level of the hierarchy
                # each parameter belongs to and sort the parameters into one
                # dictionary per level.
                hierarchy = self.locals["metadata_hierarchy"]
                metadata_dicts = [{} for _ in hierarchy]
                for k, v in metadata.items():
                    if not v:
                        continue
                    for i, params in enumerate(hierarchy):
                        if k not in params:
                            continue
                        metadata_dicts[i][k] = v
                        if k in self.locals["metadata_types"]:
                            this_type = self.locals["metadata_types"][k]
                            if this_type == "div":
                                philo_type = '"div"|"div1"|"div2"|"div3"'
                            else:
                                philo_type = '"%s"' % this_type
                            metadata_dicts[i]["philo_type"] = [philo_type]
                metadata_dicts = [d for d in metadata_dicts if d]
                if "philo_id" in metadata:
                    # philo_id joins the last populated level, or forms a
                    # level of its own when no other criterion was placed.
                    if metadata_dicts:
                        metadata_dicts[-1]["philo_id"] = metadata["philo_id"]
                    else:
                        metadata_dicts.append({"philo_id": metadata["philo_id"]})
                corpus = MetadataQuery.metadata_query(self, corpus_file, metadata_dicts, sort_order)
            if len(corpus) == 0:
                return corpus

        if not qs:
            if corpus:
                return corpus
            return self.get_all("doc", sort_order)

        # The digest already holds the metadata criteria, so the search key
        # depends on both the corpus and the search parameters.
        for part in (qs, method, str(method_arg), str(limit)):
            digest.update(_as_bytes(part))
        search_file = hitlist_dir + digest.hexdigest() + ".hitlist"
        if sort_order == ["rowid"]:
            sort_order = None
        if not os.path.isfile(search_file):
            return Query.query(self, qs, corpus_file, self.width, method, method_arg, limit, filename=search_file, sort_order=sort_order)
        # Cached search: the hit width is the number of split query terms.
        parsed = QuerySyntax.parse_query(qs)
        grouped = QuerySyntax.group_terms(parsed)
        split = Query.split_terms(grouped)
        words_per_hit = len(split)
        return HitList.HitList(search_file, words_per_hit, self, sort_order=sort_order)
Beispiel #5
0
    def query(self,qs="",method="",method_arg=0,limit=10000000,**metadata):
        """query the PhiloLogic database"""
        hash = hashlib.sha1()
        hash.update(self.path)
        has_metadata = False
        corpus_file = None

        for key,value in metadata.items():
            if isinstance(value,str):
                if value == "":
                    pass
                else:
                    value = [value]
                    metadata[key] = value
            value = [v for v in value if v]
            if value:
                has_metadata = True
                hash.update("%s=%s" % (key,"|".join(value)))

        if has_metadata:
            corpus_hash = hash.hexdigest()
            corpus_file = self.path + "/hitlists/" + corpus_hash + ".hitlist"
            corpus_width = 7

            if not os.path.isfile(corpus_file):
                # before we query, we need to figure out what type each parameter belongs to,
                # and sort them into a list of dictionaries, one for each type.
                metadata_dicts = [{} for level in self.locals["metadata_hierarchy"]]
#                print >> sys.stderr, "querying %s" % repr(metadata.items())
                for k,v in metadata.items():
                    for i, params in enumerate(self.locals["metadata_hierarchy"]):
                        if v and (k in params):
                            metadata_dicts[i][k] = v
                            if k in self.locals["metadata_types"]:
                                this_type = self.locals["metadata_types"][k]
                                if this_type == "div":
                                    metadata_dicts[i]["philo_type"] = ['"div"|"div1"|"div2"|"div3"']
                                else:
                                    metadata_dicts[i]["philo_type"] = ['"%s"' % self.locals["metadata_types"][k]]
                metadata_dicts = [d for d in metadata_dicts if d]
                corpus = MetadataQuery.metadata_query(self,corpus_file,metadata_dicts)
            else:
#                print >> sys.stderr, "cached @ %s" % corpus_file
                corpus = HitList.HitList(corpus_file,0,self)
                corpus.finish()             
            #print >> sys.stderr, "corpus file of length %d" % len(corpus)
            if len(corpus) == 0:
                return corpus
        else:
            corpus = None
        if qs:
            words_per_hit = len(qs.split(" "))
            hash.update(qs)
            hash.update(method)
            hash.update(str(method_arg))
            hash.update(str(limit))
            search_hash = hash.hexdigest()
            search_file = self.path + "/hitlists/" + search_hash + ".hitlist"
            if not os.path.isfile(search_file):
                return Query.query(self,qs,corpus_file,self.width,method,method_arg,limit,filename=search_file)
            else:
                return HitList.HitList(search_file,words_per_hit,self)
        else:
            if corpus:
                return corpus
            else:
                return self.get_all("doc")
Beispiel #6
0
    def query(self, qs="", method="", method_arg="", limit="", sort_order=["rowid"], raw_results=False, **metadata):
        """Query the PhiloLogic database, reusing cached hitlists when present.

        The non-empty metadata criteria are hashed together with ``self.path``
        to name a corpus hitlist; when ``qs`` is given, ``qs``, ``method``,
        ``method_arg`` and ``limit`` are added to the same digest to name the
        search hitlist. Both live under ``self.path + "/hitlists/"``.

        Args:
            qs: query string; empty for a metadata-only query.
            method: search method; an empty value means "proxy".
            method_arg: argument for ``method``. A string that is not an
                integer becomes 6 for "cooc"/"sentence" and 0 otherwise.
            limit: hit limit. A string that is not an integer becomes
                10000000.
            sort_order: sort keys. ``["rowid"]`` is replaced by ``None``
                before it is handed to ``HitList.HitList`` / ``Query.query``.
                The default list is only compared and rebound, never mutated.
            raw_results: forwarded as ``raw`` to ``HitList.HitList`` and as
                ``raw_results`` to ``Query.query``.
            **metadata: field name -> string or list of strings. A non-empty
                string is wrapped in a list (and stored back into
                ``metadata``); empty values are ignored.

        Returns:
            The empty corpus when the metadata criteria match nothing; the
            result of ``Query.query`` or a cached ``HitList.HitList`` when
            ``qs`` is given; the corpus for a metadata-only query; otherwise
            ``self.get_all(self.locals["default_object_level"], sort_order)``.
        """
        method = method or "proxy"
        if isinstance(method_arg, str):
            try:
                method_arg = int(method_arg)
            except ValueError:
                # Only a failed conversion selects the default; anything else
                # (e.g. KeyboardInterrupt) must propagate.
                method_arg = 6 if method in ("cooc", "sentence") else 0

        if isinstance(limit, str):
            try:
                limit = int(limit)
            except ValueError:
                limit = 10000000

        hitlist_dir = self.path + "/hitlists/"
        digest = hashlib.sha1()
        digest.update(self.path.encode('utf8'))
        has_metadata = False
        corpus_file = None
        corpus = None

        for key, value in list(metadata.items()):
            if isinstance(value, str) and value != "":
                value = [value]
                metadata[key] = value
            value = [v for v in value if v]
            if value:
                has_metadata = True
                key_value = "%s=%s" % (key, "|".join(value))
                digest.update(key_value.encode('utf8'))

        if has_metadata:
            corpus_file = hitlist_dir + digest.hexdigest() + ".hitlist"
            if os.path.isfile(corpus_file):
                if sort_order == ["rowid"]:
                    sort_order = None
                corpus = HitList.HitList(corpus_file, 0, self, sort_order=sort_order, raw=raw_results)
                corpus.finish()
            else:
                # Before we query, figure out which level of the hierarchy
                # each parameter belongs to and sort the parameters into one
                # dictionary per level.
                hierarchy = self.locals["metadata_hierarchy"]
                metadata_dicts = [{} for _ in hierarchy]
                # metadata is only read here, so no defensive copy is needed.
                for k, v in metadata.items():
                    if not v:
                        continue
                    for i, params in enumerate(hierarchy):
                        if k not in params:
                            continue
                        metadata_dicts[i][k] = v
                        if k in self.locals["metadata_types"]:
                            this_type = self.locals["metadata_types"][k]
                            if this_type == "div":
                                philo_type = '"div"|"div1"|"div2"|"div3"'
                            else:
                                philo_type = '"%s"' % this_type
                            metadata_dicts[i]["philo_type"] = [philo_type]
                metadata_dicts = [d for d in metadata_dicts if d]
                if "philo_id" in metadata:
                    # philo_id joins the last populated level, or forms a
                    # level of its own when no other criterion was placed.
                    if metadata_dicts:
                        metadata_dicts[-1]["philo_id"] = metadata["philo_id"]
                    else:
                        metadata_dicts.append({"philo_id": metadata["philo_id"]})
                corpus = MetadataQuery.metadata_query(self, corpus_file, metadata_dicts, sort_order)
            if len(corpus) == 0:
                return corpus

        if not qs:
            if corpus:
                return corpus
            return self.get_all(self.locals["default_object_level"], sort_order)

        # The digest already holds the metadata criteria, so the search key
        # depends on both the corpus and the search parameters.
        for part in (qs, method, str(method_arg), str(limit)):
            digest.update(part.encode('utf8'))
        search_file = hitlist_dir + digest.hexdigest() + ".hitlist"
        if sort_order == ["rowid"]:
            sort_order = None
        if not os.path.isfile(search_file):
            return Query.query(self,
                               qs,
                               corpus_file,
                               self.width,
                               method,
                               method_arg,
                               limit,
                               filename=search_file,
                               sort_order=sort_order,
                               raw_results=raw_results)
        # Cached search: the hit width is the number of split query terms.
        parsed = QuerySyntax.parse_query(qs)
        grouped = QuerySyntax.group_terms(parsed)
        split = Query.split_terms(grouped)
        words_per_hit = len(split)
        return HitList.HitList(search_file, words_per_hit, self, sort_order=sort_order, raw=raw_results)
Beispiel #7
0
    def query(self,
              qs="",
              method="",
              method_arg=0,
              limit=10000000,
              **metadata):
        """query the PhiloLogic database"""
        hash = hashlib.sha1()
        hash.update(self.path)
        has_metadata = False
        corpus_file = None

        for key, value in metadata.items():
            if isinstance(value, str):
                if value == "":
                    pass
                else:
                    value = [value]
                    metadata[key] = value
            value = [v for v in value if v]
            if value:
                has_metadata = True
                hash.update("%s=%s" % (key, "|".join(value)))

        if has_metadata:
            corpus_hash = hash.hexdigest()
            corpus_file = self.path + "/hitlists/" + corpus_hash + ".hitlist"
            corpus_width = 7

            if not os.path.isfile(corpus_file):
                # before we query, we need to figure out what type each parameter belongs to,
                # and sort them into a list of dictionaries, one for each type.
                metadata_dicts = [
                    {} for level in self.locals["metadata_hierarchy"]
                ]
                #                print >> sys.stderr, "querying %s" % repr(metadata.items())
                for k, v in metadata.items():
                    for i, params in enumerate(
                            self.locals["metadata_hierarchy"]):
                        if v and (k in params):
                            metadata_dicts[i][k] = v
                            if k in self.locals["metadata_types"]:
                                this_type = self.locals["metadata_types"][k]
                                if this_type == "div":
                                    metadata_dicts[i]["philo_type"] = [
                                        '"div"|"div1"|"div2"|"div3"'
                                    ]
                                else:
                                    metadata_dicts[i]["philo_type"] = [
                                        '"%s"' %
                                        self.locals["metadata_types"][k]
                                    ]
                metadata_dicts = [d for d in metadata_dicts if d]
                if "philo_id" in metadata:
                    if metadata_dicts:
                        metadata_dicts[-1]["philo_id"] = metadata["philo_id"]
                    else:
                        metadata_dicts.append(
                            {"philo_id": metadata["philo_id"]})
                corpus = MetadataQuery.metadata_query(self, corpus_file,
                                                      metadata_dicts)
            else:
                #                print >> sys.stderr, "cached @ %s" % corpus_file
                corpus = HitList.HitList(corpus_file, 0, self)
                corpus.finish()
            #print >> sys.stderr, "corpus file of length %d" % len(corpus)
            if len(corpus) == 0:
                return corpus
        else:
            corpus = None
        if qs:
            #            words_per_hit = len(qs.split(" "))
            #            words_per_hit = len(qs.split("\n\n"))
            hash.update(qs)
            hash.update(method)
            hash.update(str(method_arg))
            hash.update(str(limit))
            search_hash = hash.hexdigest()
            search_file = self.path + "/hitlists/" + search_hash + ".hitlist"
            if not os.path.isfile(search_file):
                return Query.query(self,
                                   qs,
                                   corpus_file,
                                   self.width,
                                   method,
                                   method_arg,
                                   limit,
                                   filename=search_file)
            else:
                parsed = QuerySyntax.parse_query(qs)
                grouped = QuerySyntax.group_terms(parsed)
                split = Query.split_terms(grouped)
                words_per_hit = len(split)
                #                parsed = QuerySyntax.parse_query(qs)
                #                parsed_split = []
                #                for label,token in parsed:
                #                    l,t = label,token
                #                    if l == "QUOTE":
                #                        subtokens = t[1:-1].split(" ")
                #                        parsed_split += [("QUOTE_S",sub_t) for sub_t in subtokens if sub_t]
                #                    else:
                #                        parsed_split += [(l,t)]
                #                command = Query.format_parsed_query(parsed_split,self)
                #                words_per_hit = len(command.split("\n\n"))

                return HitList.HitList(search_file, words_per_hit, self)
        else:
            if corpus:
                return corpus
            else:
                return self.get_all("doc")