import threading

from whoosh import query
from whoosh.fields import NUMERIC, TEXT, Schema
from whoosh.filedb.filestore import FileStorage
from whoosh.qparser import MultifieldParser
from whoosh.writing import AsyncWriter, BufferedWriter

# Assumptions: `config` is an application-level ConfigParser-style object, and
# ChineseAnalyzer is the Whoosh analyzer shipped with jieba.
from jieba.analyse import ChineseAnalyzer


class NoteSearchService:
    """Thread-safe singleton wrapping a Whoosh index of notes."""

    _instance_lock = threading.Lock()

    def __init__(self):
        self.schema = Schema(
            note_id=NUMERIC(stored=True, unique=True),
            notebook_id=NUMERIC(stored=True),
            title=TEXT(stored=True, analyzer=ChineseAnalyzer()),
            snippet=TEXT(analyzer=ChineseAnalyzer()),
        )
        storage = FileStorage(config.get("PATH", "notes_index_dir"))
        try:
            self.index = storage.open_index()
        except Exception:
            # No usable index yet; create a fresh one from the schema.
            self.index = storage.create_index(self.schema)

    def __new__(cls, *args, **kwargs):
        # Double-checked locking so only one instance is ever created.
        if not hasattr(NoteSearchService, "_instance"):
            with NoteSearchService._instance_lock:
                if not hasattr(NoteSearchService, "_instance"):
                    NoteSearchService._instance = super().__new__(cls)
        return NoteSearchService._instance

    def bulk_update(self, notes):
        # Multi-process writer for large batches; update_document replaces any
        # existing document with the same unique note_id.
        writer = self.index.writer(procs=4, multisegment=True)
        for note in notes:
            writer.update_document(note_id=note.id, notebook_id=note.notebook_id,
                                   title=note.title, snippet=note.snippet)
        writer.commit()

    def add(self, note):
        # AsyncWriter commits in a background thread instead of blocking.
        writer = AsyncWriter(self.index)
        writer.add_document(note_id=note.id, notebook_id=note.notebook_id,
                            title=note.title, snippet=note.snippet)
        writer.commit()

    def update(self, note):
        writer = BufferedWriter(self.index, period=10, limit=10)
        writer.update_document(note_id=note.id, notebook_id=note.notebook_id,
                               title=note.title, snippet=note.snippet)
        writer.close()

    def delete(self, note_id):
        writer = BufferedWriter(self.index)
        writer.delete_by_term('note_id', note_id)
        writer.close()

    def search(self, query_string, notebook_id=None):
        with self.index.searcher() as searcher:
            parsed = MultifieldParser(["title", "snippet"],
                                      schema=self.index.schema).parse(query_string)
            notebook_filter = query.Term("notebook_id", notebook_id) if notebook_id else None
            results = searcher.search(parsed, filter=notebook_filter, limit=None)
            return [res['note_id'] for res in results]
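A minimal usage sketch for the service above; the note object here is hypothetical, and any object exposing id, notebook_id, title and snippet attributes would do:

from types import SimpleNamespace

# Hypothetical note; in the real application this would be an ORM model instance.
note = SimpleNamespace(id=1, notebook_id=3, title="会议记录", snippet="全文搜索功能的设计讨论")

service = NoteSearchService()                      # always the same singleton instance
service.add(note)                                  # index one note via AsyncWriter
note_ids = service.search("搜索", notebook_id=3)   # -> [1] once the async commit has finished
service.delete(note.id)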
import datetime
import os
import urllib.request

from bs4 import BeautifulSoup
from whoosh.filedb.filestore import FileStorage
from whoosh.index import create_in

# deleteCompleteBD, get_schema, mesANum, dirindex and the Django models
# Anime / Genero are defined elsewhere in the project.


def populateDB(i):
    """Scrape page ``i`` of the MyAnimeList top ranking (50 titles per page)
    and store each entry both in the database and in the Whoosh index."""
    if i == 0:
        # First page: wipe the database and (re)create the index.
        deleteCompleteBD()
        if not os.path.exists(dirindex):
            os.mkdir(dirindex)
        create_in(dirindex, schema=get_schema())
    ix = FileStorage(dirindex).open_index()
    writer = ix.writer()
    f = urllib.request.urlopen("https://myanimelist.net/topanime.php?limit=" + str(50 * i))
    s = BeautifulSoup(f, "html.parser")
    lista_animes = s.find("table", class_="top-ranking-table").find_all(
        "a", class_="hoverinfo_trigger fl-l ml12 mr8")
    for lista_anime in lista_animes:
        try:
            # Fetch the detail page of every anime in the ranking table.
            f = urllib.request.urlopen(lista_anime['href'])
            s = BeautifulSoup(f, "html.parser")
            titulo = s.find("h1", class_="title-name h1_bold_none").text
            imagen = s.find("td", class_="borderClass").next_element.next_element.next_element.find('img')['data-src']
            rango_web = s.find("div", class_="di-ib ml12 pl20 pt8").contents[0].text.split()[1]
            popularidad_web = s.find("div", class_="di-ib ml12 pl20 pt8").contents[1].text.split()[1]
            emision = ''.join(s.find("td", class_="borderClass").next_element.find_all('div', class_="spaceit")[1].stripped_strings)[6:].split()
            episodios = ''.join(s.find("td", class_="borderClass").next_element.find_all('div', class_="spaceit")[0].stripped_strings)[9:]
            sinopsis = s.find("p").text
            # emision holds the airing-date tokens: month, day, year[, 'to', month, day, year].
            if len(emision) > 0:
                fecha_inicio = datetime.datetime(int(emision[2]), mesANum(emision[0]),
                                                 int(emision[1].split(',')[0])).strftime("%Y-%m-%d")
            else:
                fecha_inicio = None
            if len(emision) > 5:
                fecha_final = datetime.datetime(int(emision[6]), mesANum(emision[4]),
                                                int(emision[5].split(',')[0])).strftime("%Y-%m-%d")
            else:
                fecha_final = None
            lista_generos = s.find("td", class_="borderClass").next_element.find_all('span', itemprop="genre")
            lista = [genero.text for genero in lista_generos]
            lista_generos_comas = ",".join(lista)
            lista_generos_obj = []
            for genero in lista_generos:
                genero_obj, _ = Genero.objects.get_or_create(nombre=genero.text)
                lista_generos_obj.append(genero_obj)
            id_u = Anime.objects.all().count() + 1
            a = Anime.objects.create(id=id_u, titulo=titulo, imagen=imagen, rango=rango_web,
                                     popularidad=popularidad_web, episodios=episodios, sinopsis=sinopsis,
                                     fechaInicio=fecha_inicio, fechaFinal=fecha_final)
            for genero in lista_generos_obj:
                a.generos.add(genero)
            writer.add_document(titulo=titulo, imagen=imagen, rango_web=rango_web,
                                popularidad=popularidad_web, fecha_inicio=fecha_inicio,
                                fecha_final=fecha_final, episodios=episodios, sinopsis=sinopsis,
                                generos=lista_generos_comas)
        except UnicodeEncodeError:
            # Skip entries whose metadata cannot be encoded.
            continue
    writer.commit()
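Since each call indexes one ranking page of 50 titles and only the first call resets the database and index, populating the catalogue is a loop over page numbers; a small sketch:

# Index the top 150 entries (three pages of 50); page 0 wipes and recreates everything.
for page in range(3):
    populateDB(page)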
import contextlib
import os

from django.utils.encoding import force_text
from translate.storage.tmx import tmxfile
from whoosh import qparser, query
from whoosh.filedb.filestore import FileStorage

# data_dir, setup_index, get_node_data, Comparer, Language and CATEGORY_FILE
# are provided by the surrounding application.


class TranslationMemory(object):
    def __init__(self):
        self.index = FileStorage(data_dir('memory')).open_index()
        # Fuzzy OR query over the source field, so near matches still score.
        self.parser = qparser.QueryParser(
            'source',
            schema=self.index.schema,
            group=qparser.OrGroup.factory(0.9),
            termclass=query.FuzzyTerm,
        )
        self.searcher = None
        self.comparer = Comparer()

    def __del__(self):
        self.close()

    def open_searcher(self):
        if self.searcher is None:
            self.searcher = self.index.searcher()

    def doc_count(self):
        self.open_searcher()
        return self.searcher.doc_count()

    def close(self):
        if self.searcher is not None:
            self.searcher.close()
            self.searcher = None

    @contextlib.contextmanager
    def writer(self):
        writer = self.index.writer()
        try:
            yield writer
        finally:
            writer.commit()

    def get_language_code(self, code, langmap):
        language = Language.objects.auto_get_or_create(code)
        if langmap and language.code in langmap:
            language = Language.objects.auto_get_or_create(
                langmap[language.code]
            )
        return language.code

    def import_tmx(self, fileobj, langmap=None):
        origin = force_text(os.path.basename(fileobj.name))
        storage = tmxfile.parsefile(fileobj)
        header = next(
            storage.document.getroot().iterchildren(
                storage.namespaced("header")
            )
        )
        source_language_code = header.get('srclang')
        source_language = self.get_language_code(source_language_code, langmap)

        languages = {}
        with self.writer() as writer:
            for unit in storage.units:
                # Parse translations (translate-toolkit does not care about
                # languages here, it just picks the first and second XML
                # elements).
                translations = {}
                for node in unit.getlanguageNodes():
                    lang, text = get_node_data(unit, node)
                    translations[lang] = text
                    if lang not in languages:
                        languages[lang] = self.get_language_code(lang, langmap)

                try:
                    source = translations.pop(source_language_code)
                except KeyError:
                    # Skip if the source language is not present.
                    continue

                for lang, text in translations.items():
                    writer.add_document(
                        source_language=source_language,
                        target_language=languages[lang],
                        source=source,
                        target=text,
                        origin=origin,
                        category=CATEGORY_FILE,
                    )

    def lookup(self, source_language, target_language, text):
        langfilter = query.And([
            query.Term('source_language', source_language),
            query.Term('target_language', target_language),
        ])
        self.open_searcher()
        text_query = self.parser.parse(text)
        matches = self.searcher.search(
            text_query, filter=langfilter, limit=20000
        )
        for match in matches:
            similarity = self.comparer.similarity(text, match['source'])
            if similarity < 30:
                continue
            yield (
                match['source'], match['target'], similarity, match['origin']
            )

    def delete(self, origin):
        """Delete entries by origin."""
        with self.writer() as writer:
            return writer.delete_by_term('origin', origin)

    def empty(self):
        """Recreate the translation memory from scratch."""
        self.index = setup_index()
        self.searcher = None

    def get_origins(self):
        self.open_searcher()
        return [
            force_text(x) for x in self.searcher.lexicon('origin')
        ]
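A short usage sketch for the memory above; the file name and language codes are placeholders:

memory = TranslationMemory()

# Import a TMX file; units missing the source language are skipped.
with open("example.tmx", "rb") as handle:
    memory.import_tmx(handle)

# lookup() yields (source, target, similarity, origin) tuples with similarity >= 30.
for source, target, similarity, origin in memory.lookup("en", "cs", "Hello, world!"):
    print("%d%%: %s -> %s (%s)" % (similarity, source, target, origin))

memory.close()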
import csv
import os

from whoosh import index, writing
from whoosh.fields import ID, TEXT, Schema
from whoosh.filedb.filestore import FileStorage
from whoosh.qparser import QueryParser

import okapi  # local module from the surrounding project

# Set to True to force re-indexing even if the query index already exists.
need_reload = True
index_dir = "data/queryidx"
if not os.path.exists(index_dir):
    os.mkdir(index_dir)
    index.create_in(index_dir, Schema(qid=ID(stored=True), body=TEXT(stored=True)))
    need_reload = True

ix = FileStorage(index_dir).open_index()
if need_reload:
    # Wipe any existing segments before re-indexing the queries.
    ix.writer().commit(mergetype=writing.CLEAR)
    writer = ix.writer()
    with open("data/msmarco-doctrain-queries.tsv", "rt", encoding="utf8") as f:
        tsvreader = csv.reader(f, delimiter="\t")
        for [topicid, querystring_of_topicid] in tsvreader:
            writer.add_document(qid=topicid, body=querystring_of_topicid)
    with open("data/msmarco-docdev-queries.tsv", "rt", encoding="utf8") as f:
        tsvreader = csv.reader(f, delimiter="\t")
        for [topicid, querystring_of_topicid] in tsvreader:
            writer.add_document(qid=topicid, body=querystring_of_topicid)
    writer.commit()

queryix = FileStorage("data/queryidx").open_index()
docix = FileStorage("data/msmarcoidx").open_index()
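With the query index in place, it can be searched like any other Whoosh index; a quick sketch using the QueryParser imported above (the search string is arbitrary):

with queryix.searcher() as searcher:
    parsed = QueryParser("body", schema=queryix.schema).parse("what is a query index")
    for hit in searcher.search(parsed, limit=5):
        print(hit["qid"], hit["body"])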