class _DefaultSearchSchema(SchemaClass):
    """General search schema."""

    object_key = ID(stored=True, unique=True)
    id = NUMERIC(numtype=int, bits=64, signed=False, stored=True, unique=False)
    object_type = ID(stored=True, unique=False)
    creator = ID(stored=True)
    owner = ID(stored=True)

    #: security index. This lists the roles and user/group ids allowed to
    #: *see* this content
    allowed_roles_and_users = KEYWORD(stored=True)

    #: tags indexing
    tag_ids = KEYWORD(stored=True)
    tag_text = TEXT(stored=False, analyzer=accent_folder)

    # hierarchical index of id paths ('/' is the separator)
    parent_ids = FieldType(format=Existence(), analyzer=PathTokenizer(),
                           stored=True, unique=False)

    name = TEXT(stored=True, analyzer=accent_folder)
    slug = ID(stored=True)
    description = TEXT(stored=True, analyzer=accent_folder)
    text = TEXT(stored=False, analyzer=accent_folder)
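# A minimal usage sketch for the schema above (an assumption, not part of the
# original module): it shows how allowed_roles_and_users can act as a security
# filter. The index directory and principal list are illustrative.
from whoosh import index, query

def security_filtered_search(idx_dir, word, principals):
    """Search `text` for `word`, restricted to documents visible to `principals`."""
    ix = index.open_dir(idx_dir)
    with ix.searcher() as searcher:
        # a document is visible if its security index names any caller principal
        allow = query.Or([query.Term("allowed_roles_and_users", p)
                          for p in principals])
        q = query.And([query.Term("text", word), allow])
        return [hit["name"] for hit in searcher.search(q, limit=20)]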
def esquema_equipo(listaEquipos):
    # define the schema for the data
    schem = Schema(codigo=TEXT(stored=True),
                   nombre=KEYWORD(stored=True),
                   urlEquipo=TEXT(stored=True),
                   categoria=KEYWORD(stored=True))
    # if the index directory already exists, remove it
    if os.path.exists("Index"):
        shutil.rmtree("Index")
    os.mkdir("Index")
    # create the index
    ix = create_in("Index", schema=schem)
    # create a writer so we can add documents to the index
    writer = ix.writer()
    for equipo in listaEquipos:
        # add each element of the list of retrieved teams to the index
        writer.add_document(codigo=str(equipo[0]),
                            nombre=str(equipo[1]),
                            urlEquipo=str(equipo[2]),
                            categoria=str(equipo[3]))
    writer.commit()
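# A short follow-up sketch (assumed, not in the original code): once
# esquema_equipo() has built "Index", teams can be looked up by category.
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

def buscar_equipos_por_categoria(categoria):
    ix = open_dir("Index")
    with ix.searcher() as searcher:
        q = QueryParser("categoria", ix.schema).parse(str(categoria))
        # limit=None returns every matching team's stored fields
        return [hit.fields() for hit in searcher.search(q, limit=None)]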
def esquema_detalles_equipo(datos):
    # We create two schemas: one for the team data and another for the players

    # TEAM DATA SCHEMA
    schem = Schema(codEquipo=TEXT(stored=True),
                   nombre=KEYWORD(stored=True),
                   domicilio=KEYWORD(stored=True),
                   localidad=TEXT(stored=True),
                   provincia=TEXT(stored=True),
                   codPostal=TEXT(stored=True),
                   email=TEXT(stored=True),
                   key=TEXT(stored=True))
    if os.path.exists("Index_equipo"):
        shutil.rmtree("Index_equipo")
    os.mkdir("Index_equipo")
    ix = create_in("Index_equipo", schema=schem)
    writer = ix.writer()
    writer.add_document(codEquipo=str(datos[6]),
                        nombre=str(datos[0]),
                        domicilio=str(datos[1]),
                        localidad=str(datos[2]),
                        provincia=str(datos[3]),
                        codPostal=str(datos[4]),
                        email=str(datos[5]),
                        key="equipo")
    writer.commit()

    # PLAYER SCHEMA
    # The name is KEYWORD because it may be a compound name
    schemJugadores = Schema(nombre=KEYWORD(stored=True),
                            apellidos=KEYWORD(stored=True),
                            equipo=TEXT(stored=True))
    jugadores = datos[7]
    nombre = ""
    apellidos = ""
    if os.path.exists("Index_jugadores"):
        shutil.rmtree("Index_jugadores")
    os.mkdir("Index_jugadores")
    ix = create_in("Index_jugadores", schema=schemJugadores)
    writerJugadores = ix.writer()
    for jugador in jugadores:
        # entries come as "apellidos,nombre"
        jugadorApellidosNombre = jugador.split(",")
        nombre = jugadorApellidosNombre[1]
        apellidos = jugadorApellidosNombre[0]
        writerJugadores.add_document(nombre=str(nombre),
                                     apellidos=str(apellidos),
                                     equipo=str(datos[6]))
    writerJugadores.commit()
def make_schema():
    return Schema(
        paper_field=KEYWORD(stored=True, lowercase=True, scorable=True),
        title=TEXT(stored=True, analyzer=StemmingAnalyzer()),
        authors=KEYWORD(stored=True, lowercase=True),
        pdf=ID(stored=True),
        abstract=TEXT(stored=True, analyzer=StemmingAnalyzer()),
        date=DATETIME(stored=True),
    )
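# A hedged usage sketch for make_schema(): DATETIME fields take Python
# datetime objects at index time and bracketed ranges at query time. The
# directory and document values below are illustrative assumptions.
import os
from datetime import datetime
from whoosh.index import create_in
from whoosh.qparser import QueryParser

def demo_date_range(index_dir):
    os.makedirs(index_dir, exist_ok=True)
    ix = create_in(index_dir, make_schema())
    with ix.writer() as w:
        w.add_document(title=u"Neural ranking", authors=u"doe,j",
                       pdf=u"doe2020.pdf", abstract=u"ranking with nets",
                       date=datetime(2020, 5, 1))
    with ix.searcher() as s:
        # the default parser understands ranges on DATETIME fields
        q = QueryParser("title", ix.schema).parse(u"date:[2020 to 2021]")
        return [hit["title"] for hit in s.search(q)]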
def jugadoresList(request, id_equipo):
    if not request.user.is_authenticated:
        return redirect('/login')

    schemJugadores = Schema(id=NUMERIC(int, stored=True),
                            nombre=KEYWORD(stored=True),
                            apellidos=KEYWORD(stored=True),
                            posicion=KEYWORD(stored=True),
                            equipo=TEXT(stored=True))
    eq = get_object_or_404(Equipo, pk=id_equipo)
    jugadores = Jugador.objects.filter(equipo=id_equipo)

    if os.path.exists("Index_jugadores"):
        shutil.rmtree("Index_jugadores")
    os.mkdir("Index_jugadores")
    ix = create_in("Index_jugadores", schema=schemJugadores)
    writerJugadores = ix.writer()
    for jugador in jugadores:
        writerJugadores.add_document(id=int(jugador.id),
                                     nombre=str(jugador.nombre),
                                     apellidos=str(jugador.apellidos),
                                     posicion=str(jugador.posicionPrincipal),
                                     equipo=str(jugador.equipo))
    writerJugadores.commit()

    if request.method == 'POST':
        posicion = request.POST['posicion']
        ix_jugadores = open_dir("Index_jugadores")
        jugadores = []
        with ix_jugadores.searcher() as searcherJugadores:
            consulta = str(posicion) + " " + str(eq)
            query = MultifieldParser(["posicion", "equipo"],
                                     ix_jugadores.schema,
                                     group=AndGroup).parse(str(consulta))
            # limit=None so that all results are returned
            diccionariosJugadores = searcherJugadores.search(query, limit=None)
            for dicJugadores in diccionariosJugadores:
                auxJugadores = {
                    'id': dicJugadores['id'],
                    'nombre': dicJugadores['nombre'],
                    'apellidos': dicJugadores['apellidos'],
                    'posicionPrincipal': dicJugadores['posicion']
                }
                jugadores.append(auxJugadores)
    return render(request, 'principal/jugadores.html', {'jugadores': jugadores, 'eq': eq})
def getSchema():
    """Create and return a schema for the search."""
    return Schema(jornada=KEYWORD(stored=True),
                  equipos=TEXT(stored=True),
                  resultado=TEXT(stored=True),
                  fecha=KEYWORD(stored=True),
                  autor=TEXT(stored=True),
                  titular=TEXT(stored=True),
                  titulo=KEYWORD(stored=True),
                  cuerpo=TEXT())
def schemaSerie():
    schem = Schema(idSerie=ID(stored=True, unique=True),
                   titulo=TEXT(stored=True),
                   tituloOriginal=TEXT(stored=True),
                   imdb=TEXT(stored=True),
                   fechaEstreno=TEXT(stored=True),
                   poster=TEXT(stored=True),
                   temporadas=TEXT(stored=True),
                   generos=KEYWORD(stored=True, commas=True),
                   plataformas=KEYWORD(stored=True, commas=True),
                   links=KEYWORD(stored=True, commas=True))
    return schem
class CardIndexSchema(SchemaClass):
    id = NUMERIC(unique=True)
    name = KEYWORD()
    type = KEYWORD()
    layout = TEXT()
    text = TEXT  # a bare field class is also valid in a declarative schema
    colors = KEYWORD(commas=True)
    costs = NUMERIC()
    power = TEXT()
    toughness = TEXT()
    availability = NUMERIC()
    card = STORED
def _get_schema(self, language):
    lang_analyzer = LanguageAnalyzer(language)
    return Schema(
        key=ID(stored=True, unique=True),
        assignee=ID(stored=True),
        reporter=ID(stored=True),
        status=ID(stored=True),
        summary=TEXT(analyzer=lang_analyzer, field_boost=2.0),
        description=TEXT(analyzer=lang_analyzer),
        comments_str=TEXT(analyzer=lang_analyzer),
        labels=KEYWORD(stored=True, lowercase=True),
        components=KEYWORD(stored=True, lowercase=True),
    )
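# A hedged sketch of querying the boosted schema above: an equivalent match in
# summary (field_boost=2.0) should outrank one in description. The English
# analyzer, index directory, and issue keys are assumptions.
import os
from whoosh.index import create_in
from whoosh.qparser import MultifieldParser

def demo_field_boost(tracker, index_dir):
    os.makedirs(index_dir, exist_ok=True)
    ix = create_in(index_dir, tracker._get_schema("en"))
    with ix.writer() as w:
        w.add_document(key=u"A-1", summary=u"login crash", description=u"")
        w.add_document(key=u"A-2", summary=u"", description=u"login crash")
    with ix.searcher() as s:
        q = MultifieldParser(["summary", "description"], ix.schema).parse(u"crash")
        return [hit["key"] for hit in s.search(q)]  # expect A-1 first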
def get_schema():
    return Schema(id=ID(stored=True),
                  name=NGRAMWORDS(stored=True, minsize=2, maxsize=12,
                                  at='start', queryor=True),
                  display=TEXT(stored=True),
                  zvalue=NUMERIC(stored=True),
                  kind=KEYWORD(stored=True),
                  sumlevel=KEYWORD(stored=True),
                  is_stem=NUMERIC(stored=True),
                  url_name=TEXT(stored=True))
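# An assumed autocomplete sketch for get_schema(): because `name` is
# NGRAMWORDS with at='start', any prefix of two or more characters matches
# directly, which is what makes type-ahead lookups cheap.
import os
from whoosh.index import create_in
from whoosh.qparser import QueryParser

def demo_typeahead(index_dir):
    os.makedirs(index_dir, exist_ok=True)
    ix = create_in(index_dir, get_schema())
    with ix.writer() as w:
        w.add_document(id=u"1", name=u"chicago", display=u"Chicago, IL")
    with ix.searcher() as s:
        q = QueryParser("name", ix.schema).parse(u"chi")  # prefix of "chicago"
        return [hit["display"] for hit in s.search(q)]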
def __get_index_schema(self):
    """:return: organization index schema"""
    return Schema(id=NUMERIC(stored=True),
                  url=ID(stored=True),
                  external_id=ID(stored=True),
                  name=ID(stored=True),
                  domain_names=KEYWORD(stored=True, commas=True),
                  created_at=ID(stored=True),
                  details=ID(stored=True),
                  shared_tickets=BOOLEAN(stored=True),
                  tags=KEYWORD(stored=True, commas=True))
def schemaPelicula():
    schem = Schema(idPelicula=ID(stored=True, unique=True),
                   titulo=TEXT(stored=True),
                   tituloOriginal=TEXT(stored=True),
                   imdb=TEXT(stored=True),
                   fechaEstreno=TEXT(stored=True),
                   poster=TEXT(stored=True),
                   duracion=TEXT(stored=True),
                   director=TEXT(stored=True),
                   generos=KEYWORD(stored=True, commas=True),
                   plataformas=KEYWORD(stored=True, commas=True),
                   links=KEYWORD(stored=True, commas=True))
    return schem
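# A hedged search sketch shared by schemaSerie() and schemaPelicula(): with
# commas=True, multi-word genres are indexed as single tokens, so an exact
# Term query is the safest lookup. The index directory name is hypothetical.
from whoosh import query
from whoosh.index import open_dir

def peliculas_por_genero(genero):
    ix = open_dir("IndexPeliculas")  # hypothetical index location
    with ix.searcher() as s:
        q = query.Term("generos", genero)  # e.g. "Ciencia ficcion"
        return [hit["titulo"] for hit in s.search(q, limit=None)]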
def create_index(self, offering):
    """Create a document entry for the offering in the search index."""
    # Check if the index already exists to avoid overwriting it
    if not os.path.exists(self._index_path) or os.listdir(self._index_path) == []:
        # Create dir if needed
        if not os.path.exists(self._index_path):
            os.makedirs(self._index_path)

        # Create schema; decimal_places requires numtype=Decimal
        # (a Decimal value is indexed below)
        schema = Schema(id=KEYWORD(stored=True, unique=True),
                        owner=KEYWORD,
                        content=TEXT,
                        name=KEYWORD(sortable=True),
                        popularity=NUMERIC(Decimal, decimal_places=2,
                                           sortable=True, signed=False),
                        date=DATETIME(sortable=True),
                        state=KEYWORD,
                        purchaser=KEYWORD(stored=True, commas=True))

        # Create index
        index = create_in(self._index_path, schema)
    else:
        # Open the index
        index = open_dir(self._index_path)

    index_writer = index.writer()

    # Aggregate all the information included in the USDL document in a single
    # string in order to add a new document to the index
    text = self._aggregate_text(offering)
    purchasers_text = self._aggregate_purchasers(offering)

    # Add the new document (Python 2 unicode())
    index_writer.add_document(id=unicode(offering.pk),
                              owner=unicode(offering.owner_organization.pk),
                              content=unicode(text),
                              name=unicode(offering.name),
                              popularity=Decimal(offering.rating),
                              date=offering.creation_date,
                              state=unicode(offering.state),
                              purchaser=purchasers_text)
    index_writer.commit()
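# A hedged companion sketch (not from the original class): the sortable
# name/popularity/date fields above allow ordering results without scoring.
# `index_path` and the searched text are illustrative.
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

def search_sorted_by_popularity(index_path, text):
    ix = open_dir(index_path)
    with ix.searcher() as s:
        q = QueryParser("content", ix.schema).parse(text)
        # sortedby takes a sortable field name; reverse=True puts highest first
        hits = s.search(q, sortedby="popularity", reverse=True)
        return [hit["id"] for hit in hits]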
def _mail_schema(self):
    return Schema(ident=ID(stored=True, unique=True),
                  sender=ID(stored=False),
                  to=KEYWORD(stored=False, commas=True),
                  cc=KEYWORD(stored=False, commas=True),
                  bcc=KEYWORD(stored=False, commas=True),
                  subject=NGRAMWORDS(stored=False),
                  date=NUMERIC(stored=False, sortable=True, bits=64, signed=False),
                  body=NGRAMWORDS(stored=False),
                  tag=KEYWORD(stored=True, commas=True),
                  flags=KEYWORD(stored=True, commas=True),
                  raw=TEXT(stored=False))
def create_searchable_database(root, txt_files=True):
    """
    Load all files in the corpus into the search index.

    :param root (string): Directory of the corpus
    :param txt_files (bool): If True, index the .txt corpus; otherwise the JSON corpus
    """
    index_dir = os.path.join(root, "index_dir")
    if txt_files:
        corpus_dir = os.path.join(root, "txt_dir")
    else:
        corpus_dir = os.path.join(root, "json_dir")

    while True:
        inpt = input(
            'Warning! This will remove the current index_dir. Type "ok" to continue,'
            ' or "exit" to abort: \n')
        if inpt == "ok":
            break
        elif inpt == "exit":
            exit()

    if os.path.exists(index_dir):
        shutil.rmtree(index_dir)
    os.mkdir(index_dir)

    schema = Schema(title=TEXT(stored=True),
                    keywords=KEYWORD(stored=True, scorable=True, commas=True),
                    content=TEXT(stored=True))
    ix = create_in(index_dir, schema)
    writer = ix.writer()
    if txt_files:
        add_txt_documents(writer, corpus_dir)
    else:
        add_json_documents(writer, corpus_dir)
    writer.commit()
def minaceSchema():
    """
    Defines a basic schema for the index.

    Fields:
        image: path
        ccompsHead: components in the middle of a transcribed word, or
                    single-grams if they are not in ('s', 'e', 'l', 'm')
        ccompsTail: ending tokens
        ccompsHeadTrace: positional index of the tokens and their compounds
    """
    return Schema(
        image=ID(stored=True, unique=True),
        ccompsHead=KEYWORD(stored=True, sortable=True),
        ccompsTail=KEYWORD(stored=True, sortable=True),
        ccompsHeadTrace=STORED,
    )
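# A hedged retrieval sketch: ccompsHeadTrace is STORED only, so it cannot be
# searched but is returned with each document; searcher.document() fetches a
# record by its unique image id. The index directory is illustrative.
from whoosh.index import open_dir

def trace_for_image(index_dir, image_path):
    ix = open_dir(index_dir)
    with ix.searcher() as s:
        doc = s.document(image=image_path)  # exact match on the unique ID field
        return doc["ccompsHeadTrace"] if doc else None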
def get_schema(self):
    return Schema(nid=ID(unique=True, stored=True),
                  url=ID(unique=True, stored=True),
                  title=TEXT(phrase=False),
                  tags=KEYWORD(lowercase=True, commas=True, scorable=True),
                  note=TEXT(analyzer=analyzer),
                  content=TEXT(stored=True, analyzer=analyzer))
def __init__(self, config):
    self.schema = Schema(
        id=ID(unique=True),
        title=TEXT(stored=True, field_boost=3.0,
                   analyzer=StandardAnalyzer() | NgramFilter(minsize=2, maxsize=3)),
        author=TEXT(stored=True),
        creation_date=DATETIME(stored=True),
        pages=STORED,
        content=TEXT(stored=True, analyzer=StandardAnalyzer(stoplist=None)),
        lang=TEXT(stored=True),
        size=STORED,
        tags=KEYWORD(stored=True, commas=True),
    )
    self.index_path = config['WHOOSH_INDEX']
    if not os.path.exists(self.index_path):
        os.mkdir(self.index_path)
        create_in(self.index_path, self.schema)
    self.indexer = open_dir(self.index_path)
    self.parser_content = MultifieldParser(["title", "content"], schema=self.schema)
    self.parser_content.add_plugin(DateParserPlugin())
    self.date_format = {
        'last_24h': u'-24h to now',
        'last_week': u'last week',
        'last_month_to_now': u'-1mo to now',
        'last_year_to_now': u"[-2yrs to now]",
    }
def get_whoosh_index(force_create=False):
    from whoosh.index import create_in, exists_in, open_dir
    from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
    from whoosh.analysis import CharsetFilter, StemmingAnalyzer, NgramWordAnalyzer
    from whoosh.support.charset import accent_map

    analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
    ngramAnalyzer = NgramWordAnalyzer(minsize=2, maxsize=4)

    schema = Schema(
        title=TEXT(analyzer=analyzer, spelling=True, stored=True, field_boost=3.0),
        abstract=TEXT(analyzer=analyzer, stored=True, field_boost=2.0),
        path=ID(unique=True, stored=True),
        authors=TEXT(analyzer=analyzer, sortable=True, field_boost=1.5),
        content=TEXT(analyzer=analyzer, stored=True),
        tags=KEYWORD(sortable=True, commas=True, field_boost=1.5, lowercase=True),
        status=KEYWORD,
        classname=KEYWORD,
        typeahead=TEXT(spelling=True, stored=True, phrase=False),
    )

    if not os.path.exists(settings.WHOOSH_ROOT):
        os.mkdir(settings.WHOOSH_ROOT)

    if not exists_in(settings.WHOOSH_ROOT) or force_create:
        index = create_in(settings.WHOOSH_ROOT, schema)
    else:
        index = open_dir(settings.WHOOSH_ROOT)
    return index
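# A hedged companion for get_whoosh_index(): fields declared with
# spelling=True keep the word graph needed for "did you mean" suggestions.
def suggest_titles(word, limit=3):
    ix = get_whoosh_index()
    with ix.searcher() as searcher:
        corrector = searcher.corrector("title")
        return corrector.suggest(word, limit=limit)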
def get_schema_temas():
    return Schema(titulo=TEXT(stored=True),
                  link_tema=ID(unique=True, stored=True),
                  autor=KEYWORD(stored=True),
                  fecha=DATETIME(stored=True),
                  n_respuestas=STORED,
                  n_visitas=STORED)
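# An assumed query sketch for the forum-topic schema above: adding
# DateParserPlugin lets the fecha DATETIME field accept relative dates such
# as 'last week'. The index directory is an assumption.
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
from whoosh.qparser.dateparse import DateParserPlugin

def temas_recientes(index_dir, texto):
    ix = open_dir(index_dir)
    qp = QueryParser("titulo", ix.schema)
    qp.add_plugin(DateParserPlugin())
    q = qp.parse(u"%s fecha:'last week'" % texto)
    with ix.searcher() as s:
        return [hit["link_tema"] for hit in s.search(q)]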
def indexar_datos_ttl():
    # isbn kept as text because it exceeds Whoosh's numeric range
    esquema = Schema(id=NUMERIC(stored=True),
                     isbn=TEXT,
                     titulo=TEXT,
                     autor=TEXT,
                     genero=KEYWORD(commas=True),
                     descripcion=TEXT,
                     fechapublicacion=DATETIME,
                     precio=NUMERIC(numtype=float))
    if os.path.exists("IndexTtl"):
        shutil.rmtree("IndexTtl")
    os.mkdir("IndexTtl")
    ix = create_in("IndexTtl", schema=esquema)
    writer = ix.writer()
    it = 0
    for libro in TodosTusLibros.objects.all():
        writer.add_document(id=libro.pk,
                            isbn=str(libro.isbn),
                            titulo=libro.titulo,  # was missing; assumes the model exposes `titulo`
                            autor=libro.autor,
                            genero=libro.categorias.replace(" ", ""),
                            descripcion=libro.descripcion,
                            fechapublicacion=libro.fechapublicacion,
                            precio=float(libro.precio.replace("€", "")))
        it += 1
    writer.commit()
    return it
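# A hedged follow-up sketch: NUMERIC(float) fields support range syntax in
# the default parser, so the index built above can be filtered by price.
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

def libros_hasta(precio_max):
    ix = open_dir("IndexTtl")
    with ix.searcher() as s:
        q = QueryParser("descripcion", ix.schema).parse(
            u"precio:[0 to %s]" % precio_max)
        return [hit["id"] for hit in s.search(q, limit=None)]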
def search_terms(self, keyword, definition, flag):
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    schema = Schema(title=TEXT(stored=True),
                    content=TEXT(stored=True),
                    subjective=KEYWORD(stored=True, lowercase=True, scorable=True))
    ix = index.create_in("indexdir", schema)

    with open('./data/sample.txt') as f:
        texts = list(f)
    with open('./data/sample-title.txt') as f2:
        titles = list(f2)
    with open('./data/subs.txt') as f3:
        subs = list(f3)

    writer = ix.writer()
    for i in range(len(titles)):
        writer.add_document(title=titles[i], content=texts[i][1:-2], subjective=subs[i])
    writer.commit()

    s = ix.searcher()
    if flag:
        query = QueryParser("content", ix.schema).parse(keyword)
    else:
        query = QueryParser("subjective", ix.schema).parse(definition)
    results = s.search(query, terms=True, limit=20)
    return results
class OntologyContentSchema(SchemaClass):
    repo = ID(stored=True)
    spreadsheet = ID(stored=True)
    class_id = ID(stored=True)
    label = TEXT(stored=True)
    definition = TEXT(stored=True)
    parent = KEYWORD(stored=True)
def __init__(self, search_term: str):
    self.schema = Schema(
        educational_requirements=TEXT(),
        employment_type=ID(),
        experience_requirements=TEXT(),
        industry=KEYWORD(),
        organization=ID(stored=True),
        title=TEXT(stored=True),
        url=STORED(),
        parent_identifier=NUMERIC(stored=True),
        # Paragraph data (children)
        type=ID(stored=True),
        parent=NUMERIC(),
        paragraph_number=NUMERIC(stored=True),
        paragraph_heading=TEXT(analyzer=Analyzing.ImprovedTokenizer(), stored=True),
        paragraph_content=TEXT(analyzer=Analyzing.ImprovedTokenizer(), stored=True))
    self.index_path: str = os.path.join(definitions.MAIN_PATH, "Storage",
                                        "Indexe", search_term)
    FileHandler.if_folder_not_existent_create(self.index_path)
    self.ix: Index = None
    self.writer: IndexWriter = None
def test_groupedby_empty_field(self):
    schema = Schema(
        unique_id=ID(stored=True, unique=True),
        id=ID(stored=True),
        type=ID(stored=True),
        status=KEYWORD(stored=True),
        content=TEXT(stored=True),
    )
    ix = index.create_in(self.index_dir, schema=schema)
    with ix.writer() as w:
        w.add_document(unique_id=u"1", type=u"type1")
        w.add_document(unique_id=u"2", type=u"type2", status=u"New")

    facet_fields = (u"type", u"status")
    groupedby = facet_fields

    with ix.searcher() as s:
        r = s.search(
            query.Every(),
            groupedby=groupedby,
            maptype=sorting.Count,
        )
        facets = self._load_facets(r)
        self.assertEquals(
            {
                'status': {None: 1, 'New': 1},
                'type': {'type1': 1, 'type2': 1},
            },
            facets)
def __init__(self, *args, **kwargs):
    self._dir = kwargs.pop('directory', '.whoosh')
    clear = bool(kwargs.pop('clear', False))
    self._name = kwargs.pop('name', config.store_name)
    self._scheduler = kwargs.pop('scheduler', None)
    if self._scheduler is None:
        self._scheduler = make_default_scheduler()
        self._scheduler.start()
    if clear:
        shutil.rmtree(self._dir)

    self.schema = Schema(content=NGRAMWORDS(stored=False))
    self.schema.add("object_id", ID(stored=True, unique=True))
    self.schema.add("entity_id", ID(stored=True, unique=True))
    for a in list(ATTRS.keys()):
        self.schema.add(a, KEYWORD())

    self._redis = kwargs.pop('redis', None)
    if self._redis is None:
        self._redis = StrictRedis(host=config.redis_host, port=config.redis_port)

    now = datetime.now()
    self._last_index_time = now
    self._last_modified = now

    self.objects = self.xml_dict('objects')
    self.parts = self.json_dict('parts')

    self.storage = FileStorage(os.path.join(self._dir, self._name))
    try:
        self.index = self.storage.open_index(schema=self.schema)
    except BaseException as ex:
        log.warn(ex)
        self.storage.create()
        self.index = self.storage.create_index(self.schema)
        self._reindex()
def __init__(self, prep_dsets, metadata_dir, process_pool, keywords_wildcard='*'):
    """
    Initializes the class and sets the list of supported datasets.
    Arguments:
        prep_dsets: Dictionary of supported datasets. The keys of this
                    dictionary are used to find the subfolder within
                    metadata_dir where the metadata of each dataset should
                    be stored.
        metadata_dir: Directory where to look for the metadata files.
        process_pool: Instance of CpProcessPool, used to support
                      multi-threading.
        keywords_wildcard: Wildcard character for keyword-based search. It
                           should be JUST ONE CHARACTER and cannot be '#'.
    """
    self.fname2meta = {}
    self.keyword2fname = {}
    self.metadata_dir = metadata_dir
    self.process_pool = process_pool
    self.keywords_wildcard = keywords_wildcard

    # load metadata for each dataset
    self.metaindex = None
    self.is_all_metadata_loaded = False
    found_a_csv = False
    for (dset, pretty) in prep_dsets.items():
        self.fname2meta[dset] = {}
        self.keyword2fname[dset] = {}
        try:
            # check there is at least one csv
            if not found_a_csv:
                for afile in os.listdir(os.path.join(self.metadata_dir, dset)):
                    if afile.endswith(".csv"):
                        found_a_csv = True
                        break

            # create index, if not present
            self.index_dir = os.path.join(self.metadata_dir, 'indexdir')
            create_index = False
            if found_a_csv and not os.path.exists(self.index_dir):
                os.mkdir(self.index_dir)
                # In the future, this might be needed if using multiple datasets
                schema = Schema(key=KEYWORD(stored=True), dataset=TEXT)
                self.metaindex = create_in(self.index_dir, schema)
                create_index = True

            # load the old one, if found
            if found_a_csv and os.path.exists(self.index_dir):
                self.metaindex = open_dir(self.index_dir)

            # start thread to load all metadata
            self.process_pool.apply_async(func=self.load_all_dset_metadata,
                                          args=(dset, create_index, ))
        except Exception as e:
            print("Error while pre-loading metadata for " + dset + ": " + str(e) + '\n')
class MangaSchema(SchemaClass):
    title = TEXT(stored=True)
    author = TEXT
    artist = TEXT
    description = TEXT
    tags = KEYWORD(lowercase=True, commas=True, scorable=True)
    completed = BOOLEAN
    url = ID(stored=True)
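# An assumed query sketch against MangaSchema: BOOLEAN fields parse "true"
# and "false" in query text. The index directory and documents below are
# illustrative.
import os
from whoosh.index import create_in
from whoosh.qparser import QueryParser

def demo_completed(index_dir):
    os.makedirs(index_dir, exist_ok=True)
    ix = create_in(index_dir, MangaSchema())
    with ix.writer() as w:
        w.add_document(title=u"Done", completed=True, url=u"u1")
        w.add_document(title=u"Ongoing", completed=False, url=u"u2")
    with ix.searcher() as s:
        q = QueryParser("title", ix.schema).parse(u"completed:true")
        return [hit["title"] for hit in s.search(q)]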
def get_schema():
    return Schema(titulo=TEXT(stored=True),
                  plataformas=TEXT(stored=True),
                  desarrollador=ID(stored=True),
                  generos=TEXT(stored=True),
                  url_juego=ID(stored=True),
                  jugadores=KEYWORD(stored=True),
                  url_imagen=ID(stored=True))
def __init__(self):
    self.schema = Schema(scopes=KEYWORD(),
                         descr=TEXT(),
                         service_name=TEXT(),
                         service_descr=TEXT(),
                         keywords=KEYWORD())
    self.schema.add("object_id", ID(stored=True, unique=True))
    self.schema.add("entity_id", ID(stored=True, unique=True))
    for a in ATTRS.keys():
        self.schema.add(a, KEYWORD())
    self._collections = set()

    from whoosh.filedb.filestore import RamStorage, FileStorage
    self.storage = RamStorage()
    self.storage.create()
    self.index = self.storage.create_index(self.schema)
    self.objects = dict()
    self.infos = dict()
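# A hedged note on the in-memory pattern above: a RamStorage index disappears
# with the process, and because object_id is unique, update_document() can
# insert-or-replace entries. This helper and its argument names are
# assumptions, not part of the original class.
def add_or_update(self, object_id, entity_id, **attrs):
    # attrs keys are assumed to be fields already added to self.schema
    with self.index.writer() as w:
        w.update_document(object_id=object_id, entity_id=entity_id, **attrs)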