Example #1
    def __init__(self, doc_path, stopwords=None):
        st = RamStorage()
        st.create()
        schema = Schema(entity1_name=TEXT(stored=True),
                        fieldname=TEXT(stored=True),
                        entity2_name=TEXT(stored=True))
        self.ix = st.create_index(schema)
        writer = self.ix.writer()
        self.remove_stopwords_while_indexing = False
        if stopwords:
            self.remove_stopwords_while_indexing = True
            self.stopwords_dict = read_file_as_dict(stopwords)

        with open(doc_path, 'r') as graph_file:
            reader = csv.DictReader(graph_file,
                                    delimiter="\t",
                                    fieldnames=['e1_relation', 'e2'])
            for row in tqdm(reader):
                entity_relation, e2 = row['e1_relation'], row['e2']
                # the first column packs several space-separated tokens;
                # tokens[0] is skipped, so the entity and relation are taken
                # from positions 1 and 2
                tokens = entity_relation.split()
                e1 = tokens[1]
                relation = tokens[2]
                writer.add_document(entity1_name=e1,
                                    fieldname=relation,
                                    entity2_name=e2)
        writer.commit()
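A minimal usage sketch for the index this constructor builds. The owning class name, file name, and query term are invented; only the self.ix handle and the stored field names come from the snippet above:

from whoosh.qparser import QueryParser

kb = TripleIndex("graph.tsv")  # hypothetical class and path
with kb.ix.searcher() as searcher:
    query = QueryParser("entity1_name", kb.ix.schema).parse(u"barack_obama")
    for hit in searcher.search(query, limit=10):
        print(hit["entity1_name"], hit["fieldname"], hit["entity2_name"])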
Example #2
from whoosh import writing
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import Schema, ID, TEXT
from whoosh.filedb.filestore import RamStorage


def index_product_info(product_dict):
    schema = Schema(path=ID(stored=True, analyzer=StemmingAnalyzer()),
                    content=TEXT(stored=True, analyzer=StemmingAnalyzer()))
    st = RamStorage()
    st.create()
    ix = st.create_index(schema)
    writer = ix.writer()
    for key, value in product_dict.items():
        writer.add_document(path=key, content=value)
    writer.commit(mergetype=writing.CLEAR)
    return ix
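A sketch of how the returned in-memory index might be queried; the product data here is invented. Note how the StemmingAnalyzer lets "bottles" match "bottle":

from whoosh.qparser import QueryParser

ix = index_product_info({u"sku-1": u"stainless steel water bottle",
                         u"sku-2": u"insulated travel mug"})
with ix.searcher() as searcher:
    q = QueryParser("content", ix.schema).parse(u"bottles")
    for hit in searcher.search(q):
        print(hit["path"], hit["content"])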
Example #3
def index_product_info(product_dict):
    # Python 2 variant of the function above: decodes byte strings to unicode
    schema = Schema(path=ID(stored=True, analyzer=StemmingAnalyzer()),
                    content=TEXT(stored=True, analyzer=StemmingAnalyzer()))
    st = RamStorage()
    st.create()
    ix = st.create_index(schema)
    writer = ix.writer()
    for key, value in product_dict.items():
        writer.add_document(path=unicode(key, "utf-8"),
                            content=unicode(value, "utf-8"))
    writer.commit(mergetype=writing.CLEAR)
    return ix
Example #4
    def __init__(self, doc_path, stopwords=None):
        st = RamStorage()
        st.create()
        schema = Schema(entity_name=TEXT(stored=True),
                        fieldname=TEXT(stored=True),
                        content=TEXT())
        self.ix = st.create_index(schema)
        writer = self.ix.writer()
        self.remove_stopwords_while_indexing = False
        if stopwords:
            self.remove_stopwords_while_indexing = True
            self.stopwords_dict = read_file_as_dict(stopwords)

        with codecs.open(doc_path, 'r', "utf-8") as doc_file:
            for line in doc_file:
                line = clean_line(line)
                entity_name, fieldname, content = line.split(PIPE)
                writer.add_document(
                    entity_name=entity_name,
                    fieldname=fieldname,
                    content=self.remove_stopwords_from_text(content))
        writer.commit()
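Both constructor snippets above call helpers that are not shown (read_file_as_dict, clean_line) and a PIPE constant. A rough sketch of plausible definitions, inferred only from how they are used; the bodies are guesses:

import codecs

PIPE = u"|"  # separator assumed; only its use in line.split(PIPE) is visible

def read_file_as_dict(path):
    # one stopword per line -> dict keys for O(1) membership tests (format assumed)
    with codecs.open(path, "r", "utf-8") as f:
        return dict((line.strip(), True) for line in f)

def clean_line(line):
    # trim the trailing newline before splitting on PIPE (assumed)
    return line.strip()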
Example #5
class WhooshStore(SAMLStoreBase):
    def __init__(self):
        self.schema = Schema(scopes=KEYWORD(),
                             descr=TEXT(),
                             service_name=TEXT(),
                             service_descr=TEXT(),
                             keywords=KEYWORD())
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        for a in ATTRS.keys():
            self.schema.add(a, KEYWORD())
        self._collections = set()
        from whoosh.filedb.filestore import RamStorage, FileStorage
        self.storage = RamStorage()
        self.storage.create()
        self.index = self.storage.create_index(self.schema)
        self.objects = dict()
        self.infos = dict()

    def dump(self):
        ix = self.storage.open_index()
        print(ix.schema)
        from whoosh.query import Every
        with ix.searcher() as searcher:
            for result in searcher.search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        if 'entity_attributes' in info:
            for a, v in info.pop('entity_attributes').items():
                info[a] = v
        for a, v in info.items():
            if type(v) is not list and type(v) is not tuple:
                info[a] = [info.pop(a)]

            if a in ATTRS_INV:
                info[ATTRS_INV[a]] = info.pop(a)

        for a in info.keys():
            if a not in self.schema.names():
                del info[a]

        for a, v in info.items():
            info[a] = [six.text_type(vv) for vv in v]

    def _index(self, e, tid=None):
        info = entity_info(e)
        if tid is not None:
            info['collection_id'] = tid
        self._index_prep(info)
        id = six.text_type(object_id(e))
        # mix in tid here
        self.infos[id] = info
        self.objects[id] = e
        ix = self.storage.open_index()
        with ix.writer() as writer:
            writer.add_document(object_id=id, **info)
            writer.mergetype = writing.CLEAR

    def update(self, t, tid=None, ts=None, merge_strategy=None):
        relt = root(t)
        assert (relt is not None)
        ne = 0

        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            self._index(relt)
            ne += 1
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            self._collections.add(tid)
            for e in iter_entities(t):
                self._index(e, tid=tid)
                ne += 1

        return ne

    def collections(self):
        return self._collections

    def reset(self):
        self.__init__()

    def size(self, a=None, v=None):
        if a is None:
            return len(self.objects.keys())
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield ATTRS[n]

    def attributes(self):
        return list(self._attributes())

    def attribute(self, a):
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return list(searcher.lexicon(n))
        else:
            return []

    def lookup(self, key, raw=True, field="entity_id"):
        if key == 'entities' or key is None:
            if raw:
                return self.objects.values()
            else:
                return self.infos.values()

        from whoosh.qparser import QueryParser
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        for uri, a in ATTRS_INV.items():
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub(r"([^=]+)=(\S+)", r"\1:\2", key)
        key = re.sub(r"{([^}]+)}(\S+)", r"\1:\2", key)
        key = key.strip()

        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                if raw:
                    lst.add(self.objects[result['object_id']])
                else:
                    lst.add(self.infos[result['object_id']])

        return list(lst)
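The string surgery in lookup is easier to follow in isolation. A minimal sketch of the same rewrite (the ATTRS_INV alias substitution is omitted, and the input key is invented):

import re

def rewrite(key):
    # the transformations lookup() applies before handing the key to QueryParser
    key = key.strip('+').replace('+', ' AND ')
    key = " {!s} ".format(key)
    key = re.sub(r"([^=]+)=(\S+)", r"\1:\2", key)
    key = re.sub(r"{([^}]+)}(\S+)", r"\1:\2", key)
    return key.strip()

print(rewrite("entity_category=http://example.com/cat"))
# -> entity_category:http://example.com/cat  (a field:value Whoosh query)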
Example #6
    def temp_storage(self, name=None):
        temp_store = RamStorage()
        return temp_store.create()
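Worth noting: Storage.create() returns the storage instance itself, which is why the method above can return temp_store.create() directly. A minimal sketch (field name invented):

from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import RamStorage

store = RamStorage().create()  # create() returns the storage itself
ix = store.create_index(Schema(body=TEXT(stored=True)))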
Example #7
class WhooshStore(SAMLStoreBase):

    def __init__(self):
        self.schema = Schema(scopes=KEYWORD(),
                             descr=TEXT(),
                             service_name=TEXT(),
                             service_descr=TEXT(),
                             keywords=KEYWORD())
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        for a in list(ATTRS.keys()):
            self.schema.add(a, KEYWORD())
        self._collections = set()
        from whoosh.filedb.filestore import RamStorage, FileStorage
        self.storage = RamStorage()
        self.storage.create()
        self.index = self.storage.create_index(self.schema)
        self.objects = dict()
        self.infos = dict()

    def dump(self):
        ix = self.storage.open_index()
        print(ix.schema)
        from whoosh.query import Every
        with ix.searcher() as searcher:
            for result in searcher.search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        if 'entity_attributes' in info:
            for a, v in list(info.pop('entity_attributes').items()):
                info[a] = v
        for a, v in list(info.items()):
            if type(v) is not list and type(v) is not tuple:
                info[a] = [info.pop(a)]

            if a in ATTRS_INV:
                info[ATTRS_INV[a]] = info.pop(a)

        for a in list(info.keys()):
            if a not in self.schema.names():
                del info[a]

        for a, v in list(info.items()):
            info[a] = [six.text_type(vv) for vv in v]

    def _index(self, e, tid=None):
        info = entity_info(e)
        if tid is not None:
            info['collection_id'] = tid
        self._index_prep(info)
        id = six.text_type(object_id(e))
        # mix in tid here
        self.infos[id] = info
        self.objects[id] = e
        ix = self.storage.open_index()
        with ix.writer() as writer:
            writer.add_document(object_id=id, **info)
            writer.mergetype = writing.CLEAR

    def update(self, t, tid=None, ts=None, merge_strategy=None):
        relt = root(t)
        assert (relt is not None)
        ne = 0

        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            self._index(relt)
            ne += 1
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            self._collections.add(tid)
            for e in iter_entities(t):
                self._index(e, tid=tid)
                ne += 1

        return ne

    def collections(self):
        return b2u(self._collections)

    def reset(self):
        self.__init__()

    def size(self, a=None, v=None):
        if a is None:
            return len(list(self.objects.keys()))
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield b2u(ATTRS[n])

    def attributes(self):
        return b2u(list(self._attributes()))

    def attribute(self, a):
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return b2u(list(searcher.lexicon(n)))
        else:
            return []

    def lookup(self, key, raw=True, field="entity_id"):
        if key == 'entities' or key is None:
            if raw:
                return b2u(list(self.objects.values()))
            else:
                return b2u(list(self.infos.values()))

        from whoosh.qparser import QueryParser
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        for uri, a in list(ATTRS_INV.items()):
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub(r"([^=]+)=(\S+)", r"\1:\2", key)
        key = re.sub(r"{([^}]+)}(\S+)", r"\1:\2", key)
        key = key.strip()

        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                if raw:
                    lst.add(self.objects[result['object_id']])
                else:
                    lst.add(self.infos[result['object_id']])

        return b2u(list(lst))
Example #9
class MusicLibrary:
    def __init__(self):
        self.__artists = dict()
        self.__albums = dict()
        self.__tracks = list()
        analyzer = NgramWordAnalyzer(minsize=3)
        schema = Schema(
            title=TEXT(analyzer=analyzer, phrase=False),
            artist=TEXT(analyzer=analyzer, phrase=False),
            album=TEXT(analyzer=analyzer, phrase=False),
            id=ID(stored=True),
        )
        self.ram_storage = RamStorage()
        self.ram_storage.create()
        self.ix = self.ram_storage.create_index(schema)

    def add_track_internal(self, track_info, writer):

        artist_id = id_from_tag(track_info.tag.artist)

        if artist_id not in self.__artists:
            artist = Artist(self, track_info.tag.artist)
            self.add_artist(artist)
        else:
            artist = self.__artists[artist_id]

        album_id = id_from_tag(track_info.tag.album)

        if album_id not in [a.id for a in artist.get_albums()]:
            album = Album(self, artist_id, track_info.tag.album, track_info.tag.year)
            if album.id not in self.__albums:
                self.__albums[album.id] = [album]
            else:
                self.__albums[album.id].append(album)
        track_id = id_from_tag(track_info.tag.title)

        if track_id not in [track.id for track in self.get_artist(artist_id).get_album(album_id).get_tracks()]:
            track = Track(self, track_info)
            self.__tracks.append(track)
            track_id = len(self.__tracks) - 1
            writer.add_document(
                title=unicode(track.title),
                artist=unicode(track.artist.name),
                album=unicode(track.album.name),
                id=unicode(track_id),
            )

    def add_track(self, track_info):
        writer = self.ix.writer()
        self.add_track_internal(track_info, writer)
        writer.commit()

    def add_artist(self, artist):
        self.__artists[artist.id] = artist

    def get_artist(self, artist):
        """
        Parametry: string artist - nazwa artysty
        Zwraca artystę o danej nazwie (wcześniej jest castowana przez id_from_tag)
        """
        try:
            return self.__artists[id_from_tag(artist)]
        except KeyError:
            return None

    def search_for_track(self, querystring):
        if len(querystring) >= 3:
            with self.ix.searcher() as searcher:
                collector = searcher.collector(limit=20)
                tlc = TimeLimitCollector(collector, timelimit=1.4, use_alarm=False)
                parser = MultifieldParser(["artist", "album", "title"], self.ix.schema)
                parser.add_plugin(qparser.FuzzyTermPlugin())
                myquery = parser.parse(querystring)
                try:
                    searcher.search_with_collector(myquery, tlc)
                    if len(tlc.results()) == 0:
                        myquery = parser.parse(" ".join(word + "~2" for word in querystring.split()))
                        searcher.search_with_collector(myquery, tlc)
                except TimeLimit:
                    logging.info("Time Limit for query reached!")
                logging.debug("query time: %s", collector.runtime)
                ret = [self.__tracks[int(result["id"])] for result in tlc.results()]
                return ret
        else:
            return []

    def get_track(self, track):
        """
        Parametry: string artist - nazwa utworu
        Zwraca listę utworów o danej nazwie (wcześniej jest castowana przez id_from_tag)
        """
        try:
            return self.__tracks[id_from_tag(track)]
        except (KeyError, IndexError, TypeError):  # __tracks is a list, not a dict
            return None

    def get_track_by_filename(self, filename):
        for track in self.__tracks:
            if track.file == filename:
                return track
        logging.error("file %s not found in library" % filename)

    def get_album(self, album):
        """
        Parametry: string artist - nazwa utworu
        Zwraca listę albumów o danej nazwie (wcześniej jest castowana przez id_from_tag)
        """
        try:
            return self.__albums[id_from_tag(album)]
        except KeyError:
            return None

    def get_artists(self):
        return dict(self.__artists)

    def get_albums(self):
        return dict(self.__albums)

    def get_tracks(self):
        return list(self.__tracks)  # __tracks is a list; dict() would raise here
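A self-contained sketch of the ngram search pattern that search_for_track relies on; the track data is invented. Indexing with 3-grams is what lets a partially typed word match, and FuzzyTermPlugin enables the word~2 syntax used in the fallback query above:

from whoosh import qparser
from whoosh.analysis import NgramWordAnalyzer
from whoosh.fields import Schema, TEXT, ID
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import MultifieldParser

# throwaway index with the same field setup as MusicLibrary
analyzer = NgramWordAnalyzer(minsize=3)
schema = Schema(title=TEXT(analyzer=analyzer, phrase=False), id=ID(stored=True))
storage = RamStorage()
storage.create()
ix = storage.create_index(schema)
with ix.writer() as writer:
    writer.add_document(title=u"Yellow Submarine", id=u"0")

parser = MultifieldParser(["title"], ix.schema)
parser.add_plugin(qparser.FuzzyTermPlugin())  # enables the word~2 syntax
with ix.searcher() as searcher:
    # a partial word still matches, because the field is indexed as 3-grams
    results = searcher.search(parser.parse(u"submar"))
    print([hit["id"] for hit in results])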