import threading

from whoosh import query
from whoosh.fields import Schema, NUMERIC, TEXT
from whoosh.filedb.filestore import FileStorage
from whoosh.qparser import MultifieldParser
from whoosh.writing import AsyncWriter, BufferedWriter
# ChineseAnalyzer is assumed to come from jieba's Whoosh integration;
# config is the application's own configuration object.
from jieba.analyse import ChineseAnalyzer


class NoteSearchService:
    _instance_lock = threading.Lock()

    def __init__(self):
        self.schema = Schema(
            note_id=NUMERIC(stored=True, unique=True),
            notebook_id=NUMERIC(stored=True),
            title=TEXT(stored=True, analyzer=ChineseAnalyzer()),
            snippet=TEXT(analyzer=ChineseAnalyzer()),
        )
        storage = FileStorage(config.get("PATH", "notes_index_dir"))
        try:
            self.index = storage.open_index()
        except Exception:  # index missing or unreadable: create a fresh one
            self.index = storage.create_index(self.schema)

    def __new__(cls, *args, **kwargs):
        # Thread-safe singleton via double-checked locking.
        if not hasattr(NoteSearchService, "_instance"):
            with NoteSearchService._instance_lock:
                if not hasattr(NoteSearchService, "_instance"):
                    # object.__new__ takes no extra arguments in Python 3.
                    NoteSearchService._instance = super().__new__(cls)
        return NoteSearchService._instance

    def bulk_update(self, notes):
        # Multi-process writer for bulk indexing; multisegment=True skips the final merge.
        writer = self.index.writer(procs=4, multisegment=True)
        for note in notes:
            writer.update_document(note_id=note.id, notebook_id=note.notebook_id, title=note.title, snippet=note.snippet)
        writer.commit()

    def add(self, note):
        # AsyncWriter falls back to a background thread if the index is locked.
        writer = AsyncWriter(self.index)
        writer.add_document(note_id=note.id, notebook_id=note.notebook_id, title=note.title, snippet=note.snippet)
        writer.commit()

    def update(self, note):
        # BufferedWriter batches changes; close() flushes and commits them.
        writer = BufferedWriter(self.index, period=10, limit=10)
        writer.update_document(note_id=note.id, notebook_id=note.notebook_id, title=note.title, snippet=note.snippet)
        writer.close()

    def delete(self, note_id):
        writer = BufferedWriter(self.index)
        writer.delete_by_term('note_id', note_id)
        writer.close()

    def search(self, query_string, notebook_id=None):
        with self.index.searcher() as searcher:
            parsed_query = MultifieldParser(["title", "snippet"], schema=self.index.schema).parse(query_string)
            # Filter by notebook only when one is given.
            notebook_filter = query.Term("notebook_id", notebook_id) if notebook_id is not None else None
            results = searcher.search(parsed_query, filter=notebook_filter, limit=None)
            return [res['note_id'] for res in results]
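
A brief usage sketch (not part of the original class): the constructor always returns the same instance, and search takes a raw query string plus an optional notebook filter; the Note class below is a hypothetical stand-in for the application's model.

service = NoteSearchService()
assert service is NoteSearchService()  # singleton: both calls return one instance

class Note:
    def __init__(self, id, notebook_id, title, snippet):
        self.id, self.notebook_id = id, notebook_id
        self.title, self.snippet = title, snippet

service.add(Note(1, 10, "测试笔记", "全文搜索示例"))
print(service.search("测试", notebook_id=10))  # expected [1] once the add has committed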
Example #2
def populateDB(i):
    # Wipe the existing database contents before indexing the first page.
    if i == 0:
        deleteCompleteBD()

    # Create the Whoosh index directory on first use.
    if not os.path.exists(dirindex):
        os.mkdir(dirindex)
        create_in(dirindex, schema=get_schema())

    ix = FileStorage(dirindex).open_index()
    writer = ix.writer()

    # Fetch one page (50 entries) of the MyAnimeList top-anime ranking.
    f = urllib.request.urlopen("https://myanimelist.net/topanime.php?limit=" + str(50 * i))
    s = BeautifulSoup(f, "html.parser")
    lista_animes = s.find("table", class_="top-ranking-table").find_all("a", class_="hoverinfo_trigger fl-l ml12 mr8")

    for lista_anime in lista_animes:
        try:
            # Scrape the detail page of each ranked anime.
            f = urllib.request.urlopen(lista_anime['href'])
            s = BeautifulSoup(f, "html.parser")

            titulo = s.find("h1", class_="title-name h1_bold_none").text
            imagen = s.find("td", class_="borderClass").next_element.next_element.next_element.find('img')['data-src']
            rango_web = s.find("div", class_="di-ib ml12 pl20 pt8").contents[0].text.split()[1]
            popularidad_web = s.find("div", class_="di-ib ml12 pl20 pt8").contents[1].text.split()[1]
            # The [6:] slice drops the "Aired:" label, [9:] drops "Episodes:".
            emision = ''.join(s.find("td", class_="borderClass").next_element.find_all('div', class_="spaceit")[1].stripped_strings)[6:].split()
            episodios = ''.join(s.find("td", class_="borderClass").next_element.find_all('div', class_="spaceit")[0].stripped_strings)[9:]
            sinopsis = s.find("p").text

            # Parse the start and end dates of the airing period, if present.
            if len(emision) > 0:
                fecha_inicio = datetime.datetime(int(emision[2]), mesANum(emision[0]), int(emision[1].split(',')[0])).strftime("%Y-%m-%d")
            else:
                fecha_inicio = None
            if len(emision) > 5:
                fecha_final = datetime.datetime(int(emision[6]), mesANum(emision[4]), int(emision[5].split(',')[0])).strftime("%Y-%m-%d")
            else:
                fecha_final = None

            # Collect the genres as a comma-separated string for the index
            # and as model instances for the database.
            lista_generos = s.find("td", class_="borderClass").next_element.find_all('span', itemprop="genre")
            lista_generos_comas = ",".join(genero.text for genero in lista_generos)

            lista_generos_obj = []
            for genero in lista_generos:
                genero_obj, _ = Genero.objects.get_or_create(nombre=genero.text)
                lista_generos_obj.append(genero_obj)

            id_u = Anime.objects.all().count() + 1
            a = Anime.objects.create(id=id_u, titulo=titulo, imagen=imagen, rango=rango_web, popularidad=popularidad_web, episodios=episodios, sinopsis=sinopsis, fechaInicio=fecha_inicio, fechaFinal=fecha_final)
            for genero in lista_generos_obj:
                a.generos.add(genero)

            # Mirror the database row in the Whoosh index.
            writer.add_document(titulo=titulo, imagen=imagen, rango_web=rango_web, popularidad=popularidad_web, fecha_inicio=fecha_inicio, fecha_final=fecha_final, episodios=episodios, sinopsis=sinopsis, generos=lista_generos_comas)

        except UnicodeEncodeError:
            continue

    writer.commit()
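
A possible driver (not in the original snippet): each call indexes one 50-entry page of the ranking, and i == 0 additionally wipes the existing database first.

for page in range(4):   # crawls the top 200 titles, 50 per call
    populateDB(page)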
Example #3
import contextlib
import os

from whoosh import qparser, query

# Comparer, Language, data_dir, setup_index, tmxfile, force_text, get_node_data
# and CATEGORY_FILE are project-level helpers assumed to be imported elsewhere.


class TranslationMemory(object):
    def __init__(self):
        self.index = FileStorage(data_dir('memory')).open_index()
        self.parser = qparser.QueryParser(
            'source',
            schema=self.index.schema,
            group=qparser.OrGroup.factory(0.9),
            termclass=query.FuzzyTerm,
        )
        self.searcher = None
        self.comparer = Comparer()

    def __del__(self):
        self.close()

    def open_searcher(self):
        if self.searcher is None:
            self.searcher = self.index.searcher()

    def doc_count(self):
        self.open_searcher()
        return self.searcher.doc_count()

    def close(self):
        if self.searcher is not None:
            self.searcher.close()
            self.searcher = None

    @contextlib.contextmanager
    def writer(self):
        writer = self.index.writer()
        try:
            yield writer
        finally:
            writer.commit()

    def get_language_code(self, code, langmap):
        language = Language.objects.auto_get_or_create(code)
        if langmap and language.code in langmap:
            language = Language.objects.auto_get_or_create(
                langmap[language.code]
            )
        return language.code

    def import_tmx(self, fileobj, langmap=None):
        origin = force_text(os.path.basename(fileobj.name))
        storage = tmxfile.parsefile(fileobj)
        header = next(
            storage.document.getroot().iterchildren(
                storage.namespaced("header")
            )
        )
        source_language_code = header.get('srclang')
        source_language = self.get_language_code(source_language_code, langmap)

        languages = {}
        with self.writer() as writer:
            for unit in storage.units:
                # Parse translations (translate-toolkit does not care about
                # languages here, it just picks first and second XML elements)
                translations = {}
                for node in unit.getlanguageNodes():
                    lang, text = get_node_data(unit, node)
                    translations[lang] = text
                    if lang not in languages:
                        languages[lang] = self.get_language_code(lang, langmap)

                try:
                    source = translations.pop(source_language_code)
                except KeyError:
                    # Skip if source language is not present
                    continue

                for lang, text in translations.items():
                    writer.add_document(
                        source_language=source_language,
                        target_language=languages[lang],
                        source=source,
                        target=text,
                        origin=origin,
                        category=CATEGORY_FILE,
                    )

    def lookup(self, source_language, target_language, text):
        langfilter = query.And([
            query.Term('source_language', source_language),
            query.Term('target_language', target_language),
        ])
        self.open_searcher()
        text_query = self.parser.parse(text)
        matches = self.searcher.search(
            text_query, filter=langfilter, limit=20000
        )

        for match in matches:
            similarity = self.comparer.similarity(text, match['source'])
            if similarity < 30:
                continue
            yield (
                match['source'], match['target'], similarity, match['origin']
            )

    def delete(self, origin):
        """Delete entries by origin."""
        with self.writer() as writer:
            return writer.delete_by_term('origin', origin)

    def empty(self):
        """Recreates translation memory."""
        self.index = setup_index()
        self.searcher = None

    def get_origins(self):
        self.open_searcher()
        return [
            force_text(x) for x in self.searcher.lexicon('origin')
        ]
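
A hedged usage sketch (not part of the original class): import a TMX file, then look up fuzzy matches for a phrase; the file name and language codes below are placeholders.

memory = TranslationMemory()
with open("export.tmx", "rb") as handle:
    memory.import_tmx(handle)
print(memory.doc_count(), "entries, origins:", memory.get_origins())
for source, target, similarity, origin in memory.lookup("en", "cs", "Save changes"):
    print("%3d%% %s -> %s (%s)" % (similarity, source, target, origin))
memory.close()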
Example #4
import csv
import os

from whoosh import index, writing
from whoosh.fields import Schema, ID, TEXT
from whoosh.filedb.filestore import FileStorage
from whoosh.qparser import QueryParser

import okapi

need_reload = True  # as written, the query index is always rebuilt below

index_dir = "data/queryidx"
if not os.path.exists(index_dir):
    os.mkdir(index_dir)
    index.create_in(index_dir,
                    Schema(qid=ID(stored=True), body=TEXT(stored=True)))
    need_reload = True

ix = FileStorage(index_dir).open_index()
if need_reload:
    # Clear any existing segments, then rebuild the query index from scratch.
    ix.writer().commit(mergetype=writing.CLEAR)
    writer = ix.writer()
    with open(f"data/msmarco-doctrain-queries.tsv", "rt",
              encoding="utf8") as f:
        tsvreader = csv.reader(f, delimiter="\t")
        for [topicid, querystring_of_topicid] in tsvreader:
            writer.add_document(qid=topicid, body=querystring_of_topicid)
    with open(f"data/msmarco-docdev-queries.tsv", "rt", encoding="utf8") as f:
        tsvreader = csv.reader(f, delimiter="\t")
        for [topicid, querystring_of_topicid] in tsvreader:
            writer.add_document(qid=topicid, body=querystring_of_topicid)
    writer.commit()

queryix = FileStorage("data/queryidx").open_index()
docix = FileStorage("data/msmarcoidx").open_index()
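
A minimal sketch of how the query index built above could be searched (the QueryParser import is otherwise unused in this excerpt; the query string is arbitrary):

with queryix.searcher() as searcher:
    parsed = QueryParser("body", schema=queryix.schema).parse("what is durable medical equipment")
    for hit in searcher.search(parsed, limit=5):
        print(hit["qid"], hit["body"])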