import csv

from tqdm import tqdm
from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import RamStorage

# read_file_as_dict is a helper from the surrounding project.


def __init__(self, doc_path, stopwords=None):
    st = RamStorage()
    st.create()
    schema = Schema(entity1_name=TEXT(stored=True),
                    fieldname=TEXT(stored=True),
                    entity2_name=TEXT(stored=True))
    self.ix = st.create_index(schema)
    writer = self.ix.writer()
    self.remove_stopwords_while_indexing = False
    if stopwords:
        self.remove_stopwords_while_indexing = True
        self.stopwords_dict = read_file_as_dict(stopwords)
    with open(doc_path, 'r') as graph_file:
        reader = csv.DictReader(graph_file, delimiter="\t",
                                fieldnames=['e1_relation', 'e2'])
        for row in tqdm(reader):
            entity_relation, e2 = row['e1_relation'], row['e2']
            tokens = entity_relation.split()
            # tokens[0] is skipped: the first column is assumed to hold
            # "<prefix> <entity1> <relation>".
            e1 = tokens[1]
            relation = tokens[2]
            writer.add_document(entity1_name=e1, fieldname=relation,
                                entity2_name=e2)
    writer.commit()
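# Usage sketch (not from the source): querying the triple index built by the
# constructor above. `kb` stands for the indexing object; everything except
# the schema field names is an assumption for illustration.
from whoosh.qparser import QueryParser

def tail_entities(kb, e1, relation):
    """Return all entity2 values stored for a head entity and relation."""
    with kb.ix.searcher() as searcher:
        query = QueryParser("entity1_name", schema=kb.ix.schema).parse(e1)
        return [hit['entity2_name']
                for hit in searcher.search(query, limit=None)
                if hit['fieldname'] == relation]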
from whoosh import writing
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import Schema, ID, TEXT
from whoosh.filedb.filestore import RamStorage


def index_product_info(product_dict):
    schema = Schema(path=ID(stored=True, analyzer=StemmingAnalyzer()),
                    content=TEXT(stored=True, analyzer=StemmingAnalyzer()))
    st = RamStorage()
    st.create()
    ix = st.create_index(schema)
    writer = ix.writer()
    for key in product_dict.keys():
        writer.add_document(path=key, content=product_dict[key])
    # CLEAR replaces the whole segment set on commit instead of merging.
    writer.commit(mergetype=writing.CLEAR)
    return ix
# Python 2 variant of index_product_info: byte strings are decoded to
# unicode before indexing, since Whoosh only accepts unicode text.
from whoosh import writing
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import Schema, ID, TEXT
from whoosh.filedb.filestore import RamStorage


def index_product_info(product_dict):
    schema = Schema(path=ID(stored=True, analyzer=StemmingAnalyzer()),
                    content=TEXT(stored=True, analyzer=StemmingAnalyzer()))
    st = RamStorage()
    st.create()
    ix = st.create_index(schema)
    writer = ix.writer()
    for key in product_dict.keys():
        writer.add_document(path=unicode(key, "utf-8"),
                            content=unicode(product_dict[key], "utf-8"))
    writer.commit(mergetype=writing.CLEAR)
    return ix
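# Usage sketch (assumption, not part of the source): searching the in-memory
# index returned by either index_product_info variant above.
from whoosh.qparser import QueryParser

def search_products(ix, text, limit=10):
    with ix.searcher() as searcher:
        query = QueryParser("content", schema=ix.schema).parse(text)
        return [hit['path'] for hit in searcher.search(query, limit=limit)]

# e.g.: search_products(index_product_info({"p1": "wireless optical mouse"}), "mouse")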
import codecs

from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import RamStorage

# read_file_as_dict, clean_line and PIPE come from the surrounding project.


def __init__(self, doc_path, stopwords=None):
    st = RamStorage()
    st.create()
    schema = Schema(entity_name=TEXT(stored=True),
                    fieldname=TEXT(stored=True),
                    content=TEXT())
    self.ix = st.create_index(schema)
    writer = self.ix.writer()
    self.remove_stopwords_while_indexing = False
    if stopwords:
        self.remove_stopwords_while_indexing = True
        self.stopwords_dict = read_file_as_dict(stopwords)
    with codecs.open(doc_path, 'r', "utf-8") as doc_file:
        for line in doc_file:
            line = clean_line(line)
            entity_name, fieldname, content = line.split(PIPE)
            writer.add_document(
                entity_name=entity_name,
                fieldname=fieldname,
                content=self.remove_stopwords_from_text(content))
    writer.commit()
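# Minimal sketch (assumption) of the remove_stopwords_from_text helper the
# constructor above calls; the project's real implementation may differ.
def remove_stopwords_from_text(self, text):
    if not self.remove_stopwords_while_indexing:
        return text
    return " ".join(token for token in text.split()
                    if token.lower() not in self.stopwords_dict)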
import re

import six
from whoosh import writing
from whoosh.fields import Schema, ID, KEYWORD, TEXT
from whoosh.filedb.filestore import RamStorage

# SAMLStoreBase, ATTRS, ATTRS_INV, NS, entity_info, object_id, root and
# iter_entities are helpers from the surrounding project. This is the
# Python 2 era version of the store.


class WhooshStore(SAMLStoreBase):

    def __init__(self):
        self.schema = Schema(scopes=KEYWORD(),
                             descr=TEXT(),
                             service_name=TEXT(),
                             service_descr=TEXT(),
                             keywords=KEYWORD())
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        for a in ATTRS.keys():
            self.schema.add(a, KEYWORD())
        self._collections = set()
        self.storage = RamStorage()
        self.storage.create()
        self.index = self.storage.create_index(self.schema)
        self.objects = dict()
        self.infos = dict()

    def dump(self):
        ix = self.storage.open_index()
        print(ix.schema)
        from whoosh.query import Every
        with ix.searcher() as searcher:
            for result in searcher.search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        if 'entity_attributes' in info:
            for a, v in info.pop('entity_attributes').items():
                info[a] = v
        for a, v in info.items():
            if type(v) is not list and type(v) is not tuple:
                info[a] = [info.pop(a)]
            if a in ATTRS_INV:
                info[ATTRS_INV[a]] = info.pop(a)
        for a in info.keys():  # safe on Python 2 only: keys() is a list there
            if a not in self.schema.names():
                del info[a]
        for a, v in info.items():
            info[a] = [six.text_type(vv) for vv in v]

    def _index(self, e, tid=None):
        info = entity_info(e)
        if tid is not None:
            info['collection_id'] = tid
        self._index_prep(info)
        id = six.text_type(object_id(e))  # mix in tid here
        self.infos[id] = info
        self.objects[id] = e
        ix = self.storage.open_index()
        with ix.writer() as writer:
            writer.add_document(object_id=id, **info)
            writer.mergetype = writing.CLEAR

    def update(self, t, tid=None, ts=None, merge_strategy=None):
        relt = root(t)
        assert (relt is not None)
        ne = 0
        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            self._index(relt)
            ne += 1
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            self._collections.add(tid)
            for e in iter_entities(t):
                self._index(e, tid=tid)
                ne += 1
        return ne

    def collections(self):
        return self._collections

    def reset(self):
        self.__init__()

    def size(self, a=None, v=None):
        if a is None:
            return len(self.objects.keys())
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield ATTRS[n]

    def attributes(self):
        return list(self._attributes())

    def attribute(self, a):
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return list(searcher.lexicon(n))
        else:
            return []

    def lookup(self, key, raw=True, field="entity_id"):
        if key == 'entities' or key is None:
            if raw:
                return self.objects.values()
            else:
                return self.infos.values()

        from whoosh.qparser import QueryParser
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        for uri, a in ATTRS_INV.items():
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub(r"([^=]+)=(\S+)", r"\1:\2", key)
        key = re.sub(r"{([^}]+)}(\S+)", r"\1:\2", key)
        key = key.strip()

        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                if raw:
                    lst.add(self.objects[result['object_id']])
                else:
                    lst.add(self.infos[result['object_id']])
        return list(lst)
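# Illustration (not from the source) of the key-rewriting step inside
# lookup() above: '+' becomes AND, and 'attr=value' pairs are turned into
# Whoosh 'field:value' syntax before parsing.
import re

key = " scope=example.org "
key = re.sub(r"([^=]+)=(\S+)", r"\1:\2", key)
print(key.strip())  # -> "scope:example.org"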
def temp_storage(self, name=None):
    temp_store = RamStorage()
    return temp_store.create()  # create() returns the storage itself
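# Usage sketch (assumption): temp_storage() above hands back a fresh,
# already-created RAM store that callers can use immediately as scratch space.
from whoosh.filedb.filestore import RamStorage

scratch = RamStorage().temp_storage()
spool = scratch.create_file("spool.bin")  # behaves like any Whoosh Storage
spool.write(b"temporary data")
spool.close()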
import re

import six
from whoosh import writing
from whoosh.fields import Schema, ID, KEYWORD, TEXT
from whoosh.filedb.filestore import RamStorage

# Python 3 port of the store above. b2u (bytes-to-unicode), SAMLStoreBase,
# ATTRS, ATTRS_INV, NS, entity_info, object_id, root and iter_entities are
# helpers from the surrounding project.


class WhooshStore(SAMLStoreBase):

    def __init__(self):
        self.schema = Schema(scopes=KEYWORD(),
                             descr=TEXT(),
                             service_name=TEXT(),
                             service_descr=TEXT(),
                             keywords=KEYWORD())
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        for a in list(ATTRS.keys()):
            self.schema.add(a, KEYWORD())
        self._collections = set()
        self.storage = RamStorage()
        self.storage.create()
        self.index = self.storage.create_index(self.schema)
        self.objects = dict()
        self.infos = dict()

    def dump(self):
        ix = self.storage.open_index()
        print(ix.schema)
        from whoosh.query import Every
        with ix.searcher() as searcher:
            for result in searcher.search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        if 'entity_attributes' in info:
            for a, v in list(info.pop('entity_attributes').items()):
                info[a] = v
        for a, v in list(info.items()):
            if type(v) is not list and type(v) is not tuple:
                info[a] = [info.pop(a)]
            if a in ATTRS_INV:
                info[ATTRS_INV[a]] = info.pop(a)
        for a in list(info.keys()):
            if a not in self.schema.names():
                del info[a]
        for a, v in list(info.items()):
            info[a] = [six.text_type(vv) for vv in v]

    def _index(self, e, tid=None):
        info = entity_info(e)
        if tid is not None:
            info['collection_id'] = tid
        self._index_prep(info)
        id = six.text_type(object_id(e))  # mix in tid here
        self.infos[id] = info
        self.objects[id] = e
        ix = self.storage.open_index()
        with ix.writer() as writer:
            writer.add_document(object_id=id, **info)
            writer.mergetype = writing.CLEAR

    def update(self, t, tid=None, ts=None, merge_strategy=None):
        relt = root(t)
        assert (relt is not None)
        ne = 0
        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            self._index(relt)
            ne += 1
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            self._collections.add(tid)
            for e in iter_entities(t):
                self._index(e, tid=tid)
                ne += 1
        return ne

    def collections(self):
        return b2u(self._collections)

    def reset(self):
        self.__init__()

    def size(self, a=None, v=None):
        if a is None:
            return len(self.objects)
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield b2u(ATTRS[n])

    def attributes(self):
        return b2u(list(self._attributes()))

    def attribute(self, a):
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return b2u(list(searcher.lexicon(n)))
        else:
            return []

    def lookup(self, key, raw=True, field="entity_id"):
        if key == 'entities' or key is None:
            if raw:
                return b2u(list(self.objects.values()))
            else:
                return b2u(list(self.infos.values()))

        from whoosh.qparser import QueryParser
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        for uri, a in list(ATTRS_INV.items()):
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub(r"([^=]+)=(\S+)", r"\1:\2", key)
        key = re.sub(r"{([^}]+)}(\S+)", r"\1:\2", key)
        key = key.strip()

        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                if raw:
                    lst.add(self.objects[result['object_id']])
                else:
                    lst.add(self.infos[result['object_id']])
        return b2u(list(lst))
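# Usage sketch (assumption, not from the source): exercising the store above.
# parse_metadata is a hypothetical loader returning the parsed SAML metadata
# tree that update() expects.
store = WhooshStore()
n = store.update(parse_metadata("metadata.xml"))  # hypothetical helper
print(n, store.collections(), store.attributes())
entities = store.lookup("entities")  # all indexed entities, per lookup() above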
import logging

from whoosh import qparser
from whoosh.analysis import NgramWordAnalyzer
from whoosh.collectors import TimeLimitCollector
from whoosh.fields import Schema, TEXT, ID
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import MultifieldParser
from whoosh.searching import TimeLimit

# Artist, Album, Track and id_from_tag are helpers from the surrounding
# project; the code targets Python 2 (note the unicode() calls).


class MusicLibrary:

    def __init__(self):
        self.__artists = dict()
        self.__albums = dict()
        self.__tracks = list()
        analyzer = NgramWordAnalyzer(minsize=3)
        schema = Schema(
            title=TEXT(analyzer=analyzer, phrase=False),
            artist=TEXT(analyzer=analyzer, phrase=False),
            album=TEXT(analyzer=analyzer, phrase=False),
            id=ID(stored=True),
        )
        self.ram_storage = RamStorage()
        self.ram_storage.create()
        self.ix = self.ram_storage.create_index(schema)

    def add_track_internal(self, track_info, writer):
        artist_id = id_from_tag(track_info.tag.artist)
        if artist_id not in self.__artists:
            artist = Artist(self, track_info.tag.artist)
            self.add_artist(artist)
        else:
            artist = self.__artists[artist_id]
        album_id = id_from_tag(track_info.tag.album)
        if album_id not in [a.id for a in artist.get_albums()]:
            album = Album(self, artist_id, track_info.tag.album,
                          track_info.tag.year)
            if album.id not in self.__albums:
                self.__albums[album.id] = [album]
            else:
                self.__albums[album.id].append(album)
        track_id = id_from_tag(track_info.tag.title)
        known_ids = [track.id for track in
                     self.get_artist(artist_id).get_album(album_id).get_tracks()]
        if track_id not in known_ids:
            track = Track(self, track_info)
            self.__tracks.append(track)
            track_id = len(self.__tracks) - 1
            writer.add_document(
                title=unicode(track.title),
                artist=unicode(track.artist.name),
                album=unicode(track.album.name),
                id=unicode(track_id),
            )

    def add_track(self, track_info):
        writer = self.ix.writer()
        self.add_track_internal(track_info, writer)
        writer.commit()

    def add_artist(self, artist):
        self.__artists[artist.id] = artist

    def get_artist(self, artist):
        """
        Parameters: string artist - artist name.
        Returns the artist with that name (the name is first normalized
        via id_from_tag).
        """
        try:
            return self.__artists[id_from_tag(artist)]
        except KeyError:
            return None

    def search_for_track(self, querystring):
        if len(querystring) >= 3:
            with self.ix.searcher() as searcher:
                collector = searcher.collector(limit=20)
                tlc = TimeLimitCollector(collector, timelimit=1.4,
                                         use_alarm=False)
                parser = MultifieldParser(["artist", "album", "title"],
                                          self.ix.schema)
                parser.add_plugin(qparser.FuzzyTermPlugin())
                myquery = parser.parse(querystring)
                try:
                    searcher.search_with_collector(myquery, tlc)
                    if len(tlc.results()) == 0:
                        # Retry with per-word fuzzy matching (edit distance 2).
                        myquery = parser.parse(" ".join(
                            word + "~2" for word in querystring.split()))
                        searcher.search_with_collector(myquery, tlc)
                except TimeLimit:
                    logging.info("Time limit for query reached!")
                logging.debug("query time: %s", collector.runtime)
                return [self.__tracks[int(result["id"])]
                        for result in tlc.results()]
        else:
            return []

    def get_track(self, track):
        """
        Parameters: string track - track name.
        Returns the track with that name (the name is first normalized
        via id_from_tag).
        """
        try:
            return self.__tracks[id_from_tag(track)]
        except KeyError:
            return None

    def get_track_by_filename(self, filename):
        for track in self.__tracks:
            if track.file == filename:
                return track
        logging.error("file %s not found in library", filename)

    def get_album(self, album):
        """
        Parameters: string album - album name.
        Returns the list of albums with that name (the name is first
        normalized via id_from_tag).
        """
        try:
            return self.__albums[id_from_tag(album)]
        except KeyError:
            return None

    def get_artists(self):
        return dict(self.__artists)

    def get_albums(self):
        return dict(self.__albums)

    def get_tracks(self):
        # __tracks is a list, so copy it as a list (dict() would fail here).
        return list(self.__tracks)
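# Usage sketch (assumption): fuzzy track search over the n-gram index above.
# `track_info` stands for whatever tag-bearing object (e.g. an eyeD3 audio
# file exposing .tag.artist/.tag.album/.tag.year/.tag.title) the surrounding
# project feeds into add_track.
library = MusicLibrary()
library.add_track(track_info)  # hypothetical tag-bearing object
for track in library.search_for_track("beatles"):
    print(track.title, track.artist.name, track.album.name)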