Example #1
class IndexSchema(SchemaClass):
    filename = TEXT(stored=True, analyzer=simple_ana)
    symbol = TEXT(stored=True, analyzer=custom_ana)
    module = TEXT(stored=True, analyzer=simple_ana)
    location = STORED()
    kind = STORED()
    sort = NUMERIC(sortable=True)
Example #2
class BookmarkSchema(fields.SchemaClass):
    contentNGram = TEXT(stored=False, analyzer=_N_GRAM_ANALYZER, phrase=False)
    contentText = TEXT(stored=False, analyzer=_TEXT_ANALYZER, phrase=True)
    urlSize = NUMERIC(signed=False, sortable=True, default=999)
    name = STORED()
    path = STORED()
    profile = STORED()
    url = STORED()
    icon = STORED()
Example #3
    def __init__(self) -> None:
        self.schema = Schema(
            id=NUMERIC(unique=True, stored=True),
            canonical_name=STORED(),
            name=STORED(),
            name_tokenized=TEXT(stored=False,
                                analyzer=WhooshConstants.tokenized_analyzer),
            name_stemmed=TEXT(stored=False,
                              analyzer=WhooshConstants.stem_analyzer),
            name_normalized=TEXT(stored=False,
                                 analyzer=WhooshConstants.normalized_analyzer,
                                 field_boost=100.0))
Example #4
def populateWhooshNoticias():

    schemNoticias = Schema(idNoticia=NUMERIC(stored=True),
                           nombreEquipo=TEXT(stored=True),
                           linkNoticia=TEXT(stored=True),
                           tituloNoticia=TEXT(stored=True),
                           descripcionNoticia=TEXT(stored=True),
                           imagenNoticia=STORED(),
                           tiempoPublicacion=TEXT(stored=True),
                           autor=TEXT(stored=True))

    if os.path.exists("IndexNoticias"):
        shutil.rmtree("IndexNoticias")
    os.mkdir("IndexNoticias")

    ixNoticia = create_in("IndexNoticias", schema=schemNoticias)
    writerNoticia = ixNoticia.writer()
    listaNoticias = extraerNoticias()
    n = 1
    for noticias in listaNoticias:
        for a in noticias:
            for noticia in a:
                writerNoticia.add_document(idNoticia=n,
                                           nombreEquipo=noticia[0],
                                           linkNoticia=noticia[1],
                                           tituloNoticia=noticia[2],
                                           descripcionNoticia=noticia[3],
                                           imagenNoticia=noticia[4],
                                           tiempoPublicacion=noticia[5],
                                           autor=noticia[6])
                n += 1
    writerNoticia.commit()

    return n
Example #5
    def __init__(self, search_term: str):

        self.schema = Schema(
            educational_requirements=TEXT(),
            employment_type=ID(),
            experience_requirements=TEXT(),
            industry=KEYWORD(),
            organization=ID(stored=True),
            title=TEXT(stored=True),
            url=STORED(),
            parent_identifier=NUMERIC(stored=True),

            # Paragraph Data Children
            type=ID(stored=True),
            parent=NUMERIC(),
            paragraph_number=NUMERIC(stored=True),
            paragraph_heading=TEXT(analyzer=Analyzing.ImprovedTokenizer(),
                                   stored=True),
            paragraph_content=TEXT(analyzer=Analyzing.ImprovedTokenizer(),
                                   stored=True))

        self.index_path: str = os.path.join(definitions.MAIN_PATH, "Storage",
                                            "Indexe", search_term)
        FileHandler.if_folder_not_existent_create(self.index_path)

        self.ix: Index = None
        self.writer: IndexWriter = None
Example #6
def all_stops(api):
    """Generate a pickle of all stops."""

    log = logging.getLogger(__name__)
    logging.basicConfig(level=logging.DEBUG)

    # All active routes on the realtime system.
    rtdicts = api.routes()['route']
    stopset = set()
    allstops = {}

    # Whoosh index
    schema = Schema(sid=TEXT(stored=True),
                    name=TEXT(stored=True),
                    location=STORED())
    indexname = "stop_index"
    if not os.path.exists(indexname):
        os.mkdir(indexname)
        ix = index.create_in(indexname, schema)
    else:
        ix = index.open_dir(indexname)
    writer = ix.writer()

    log.debug("Generating stop database.")

    # Loop through all the routes to get at stops (API has weird structure)
    for rtdict in rtdicts:
        if rtdict['rt'] not in allstops:
            rtobject = Route.fromapi(api, rtdict)
            # Add all stops on the route to the set
            for s in rtobject.inbound_stops + rtobject.outbound_stops:
                stop = (s.id, s.location, s.name)
                stopset.add(stop)

    nchanges = 0
    log.debug("Generating search index.")
    for stop in stopset:
        nchanges += 1
        allstops[stop[0]] = stop

    # Switch to display groupings
    allstops = group_stops(allstops)

    for stop in allstops.values():
        writer.update_document(sid=unicode(stop[0]),
                               name=stop[2],
                               location=stop[1])
    writer.commit()

    # And create pickle too
    log.debug("Pickling db...")
    export = dict(allstops)
    with open("paac.stops.pickle", "wb") as f:
        pickle.dump(export, f)

    # And create app db
    log.debug("Creating app database...")
    # create_app_db(allstops, already_grouped=True)

    return nchanges
Example #7
class TMSchema(SchemaClass):
    """Fultext index schema for source and context strings."""
    source_language = ID(stored=True)
    target_language = ID(stored=True)
    source = TEXT(stored=True)
    target = STORED()
    origin = ID(stored=True)
    category = NUMERIC(stored=True)
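
In TMSchema only the indexed fields (source, origin, and the language IDs) can be queried; target is STORED() only, so it travels back with each hit but is never searchable. A minimal usage sketch, assuming an existing directory named tm_index and made-up sample data (neither comes from the original project):

from whoosh import index
from whoosh.qparser import QueryParser

ix = index.create_in("tm_index", TMSchema)  # the directory is assumed to exist
writer = ix.writer()
writer.add_document(source_language=u"en", target_language=u"de",
                    source=u"Hello world", target=u"Hallo Welt",
                    origin=u"manual", category=1)
writer.commit()

with ix.searcher() as searcher:
    query = QueryParser("source", ix.schema).parse(u"hello")
    for hit in searcher.search(query):
        # target is returned from storage even though it was never indexed
        print(hit["source"], "->", hit["target"])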
Example #8
    def build_index(self, path, names=None):
        print("Building index..")
        if not os.path.exists(path):
            os.makedirs(path)

        schema = Schema(title=TEXT(analyzer=self._analyzer), pid=STORED())
        titles = shared.graph.playlist_titles if names is None else names
        normalized_titles = [normalize_title(title) for title in titles]
        ix = index.create_in(path, schema)
        writer = ix.writer()
        for i in trange(len(normalized_titles)):
            title = normalized_titles[i]
            writer.add_document(title=title, pid=i)
        print("Committing..")
        writer.commit()
        print("Done.")
        self._ix = ix
Example #9
def create_index(index_dir):
    schema = Schema(book_abbr=STORED(),
                    book_name=STORED(),
                    book_tree=STORED(),
                    book_kindle=STORED(),
                    short=STORED(),
                    long=STORED(),
                    key_terms=STORED(),
                    key_terms_content=TEXT(stored=True, analyzer=CleanupStandardAnalyzer(analyzer_re, STOP_WORDS) | CharsetFilter(accent_map)),
                    book=ID(stored=True),
                    heading=TEXT(stored=True, analyzer=StemmingAnalyzer(minsize=1, stoplist=None) | CharsetFilter(accent_map)),
                    session=TEXT(stored=True, analyzer=StandardAnalyzer(minsize=1, stoplist=None)),
                    date=DATETIME(stored=True, sortable=True),
                    exact=TEXT(stored=True, analyzer=CleanupStandardAnalyzer(analyzer_re, stoplist=None) | CharsetFilter(accent_map)),
                    stemmed=TEXT(stored=True, analyzer=CleanupStemmingAnalyzer(analyzer_re) | CharsetFilter(accent_map)),
                    common=TEXT(stored=True, analyzer=CleanupStemmingAnalyzer(analyzer_re, stoplist=None) | CharsetFilter(accent_map)),
                    )

    ix = index.create_in(index_dir, schema)

    writer = ix.writer()
    for book in Books.indexed:
        with open("books/{}.txt".format(book['abbr']), encoding='utf-8') as f:
            text = pre_process_book(book, f.read())
        text = re.search(book['book_re'], text, flags=re.DOTALL).group(1)

        d = {
            'book_name': book['name'],
            'book_abbr': book['abbr'],
            'book_tree': book['tree'],
            'book_kindle': book['kindle'],
            'book': book['abbr'].lower(),
        }

        i = 0
        heading_tiers = [{'short': '', 'long': ''}] * 3
        carry_over_heading = None
        headings = list(filter(None, book['headings_re'].split(text)[1:]))
        for (__heading, _content) in zip(headings[::2], headings[1::2]):
            content = __heading + _content
            if carry_over_heading:
                content = carry_over_heading + content
                carry_over_heading = None

            heading = clean_heading(__heading)
            if 'heading_replacements' in book:
                for (pattern, repl) in book['heading_replacements']:
                    heading = pattern.sub(repl, heading, 1)

            update_heading_tiers(book, heading_tiers, heading)

            has_content = re.search(r'[a-z]', _content)
            if not has_content:
                carry_over_heading = content
                continue

            add_document(writer, d, heading_tiers, content)
            i += 1
        print(i)

    writer.commit()
    return ix
Example #10
#! /usr/bin/env python

from whoosh.fields import Schema, ID, KEYWORD, TEXT, STORED
import os
import config
import sys
from whoosh import writing, index

if len(sys.argv) > 1 and sys.argv[1] == "reindex":
    writer = index.open_dir(config.INDEX_DIR).writer()
    writer.commit(mergetype=writing.CLEAR)
else:
    schema = Schema(id=ID(stored=True, unique=True),
                    title=TEXT(stored=True, sortable=True),
                    content=TEXT(stored=True),
                    language=STORED(),
                    tag=KEYWORD(stored=True, commas=True))

    if not os.path.exists(config.INDEX_DIR):
        os.mkdir(config.INDEX_DIR)

    index.create_in(config.INDEX_DIR, schema)
    print "Index initialized"
Example #11
            t.boost = 1.0
            if positions:
                t.pos = start_pos + value.find(word)
            if chars:
                t.startchar = start_char + value.find(word)
                t.endchar = t.startchar + len(word)
            yield t


schema = Schema(
    path=ID(stored=True),
    family=ID(stored=True),
    name=ID(stored=True),
    description=TEXT(stored=True, analyzer=ChineseTokenizer()),
    keywords=KEYWORD(stored=True, commas=True),
    created_at=STORED(),
    updated_at=STORED(),
)


class WhooshSearch(object):
    def __init__(self, app=None):
        if app is not None:
            self.init_app(app)

    def init_app(self, app):
        app.config.setdefault('WHOOSH_DIR', 'data')
        self.app = app
        app.extensions = getattr(app, 'extensions', {})
        app.extensions['elasticsearch'] = self
Example #12
log = logging.getLogger(__name__)

# CUSTOM ANALYZER wordsplit + lowercase filter
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()

#INDEX SCHEMA DEFINITION
SCHEMA = Schema(fileid=ID(unique=True),
                owner=TEXT(),
                repository=TEXT(stored=True),
                path=TEXT(stored=True),
                content=FieldType(format=Characters(),
                                  analyzer=ANALYZER,
                                  scorable=True,
                                  stored=True),
                modtime=STORED(),
                extension=TEXT(stored=True))

IDX_NAME = 'HG_INDEX'
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = ContextFragmenter(200)

CHGSETS_SCHEMA = Schema(
    raw_id=ID(unique=True, stored=True),
    date=NUMERIC(stored=True),
    last=BOOLEAN(),
    owner=TEXT(),
    repository=ID(unique=True, stored=True),
    author=TEXT(stored=True),
    message=FieldType(format=Characters(),
                      analyzer=ANALYZER,
Example #13
from whoosh.fields import Schema, TEXT, KEYWORD, STORED

#
# Schema used to index the database. The original tuple (minus the zip codes) is stored in the data attribute.
#
SCHEMA = Schema(name=TEXT(stored=False),
                zips=KEYWORD(stored=False),
                data=STORED())
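
Because name and zips are indexed but not stored, a matching hit carries the original record back only through the STORED() data field. A hypothetical lookup sketch (the directory name and sample record are illustrative, not from the original project):

import os
from whoosh import index
from whoosh.qparser import QueryParser

if not os.path.exists("geo_index"):
    os.mkdir("geo_index")
ix = index.create_in("geo_index", SCHEMA)

writer = ix.writer()
writer.add_document(name=u"Springfield", zips=u"62701 62702",
                    data=(u"Springfield", u"IL"))
writer.commit()

with ix.searcher() as searcher:
    query = QueryParser("name", ix.schema).parse(u"springfield")
    for hit in searcher.search(query):
        # only the stored tuple is available; name and zips are not in the hit
        print(hit["data"])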
Example #14
from whoosh.qparser import QueryParser

INDEX_PATH = 'index'


@dataclass
class Verse:
    reference: str
    text: str

    def asdict(self):
        return {'reference': self.reference, 'text': self.text}


_verse_schema = Schema(
    reference=STORED(),
    text=TEXT(analyzer=StemmingAnalyzer(), stored=True),
)

if index.exists_in(INDEX_PATH):
    _index = index.open_dir(INDEX_PATH)
else:
    os.mkdir(INDEX_PATH)
    _index = index.create_in(INDEX_PATH, _verse_schema)


def add_verses(verses):
    writer = _index.writer()

    for verse in verses:
        writer.add_document(reference=verse.reference, text=verse.text)

    writer.commit()
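
A hypothetical companion helper (not part of the original snippet) would parse queries against the stemmed text field and rebuild Verse objects from the stored reference and text values:

def search_verses(terms, limit=10):
    parser = QueryParser('text', _index.schema)
    with _index.searcher() as searcher:
        results = searcher.search(parser.parse(terms), limit=limit)
        # reference is STORED() only; text is both indexed and stored
        return [Verse(reference=hit['reference'], text=hit['text'])
                for hit in results]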
Example #15
            key = f'{eid}:{locale}:tags'
            for tag in tags['values']:
                storage.lpush(key, tag)


if __name__ == '__main__':
    print('-' * 30)
    print('Muzeeglot data ingestion')
    print('-' * 30)
    if exists(configuration.INGESTION_LOCK):
        print('WARN: ingestion lock detected, pass')
    else:
        print('INFO: evaluate tags corpus')
        tags_corpus = get_tags_corpus()
        print('INFO: create search index')
        if not exists(configuration.INDEX):
            makedirs(configuration.INDEX)
        schema = Schema(ngram=NGRAMWORDS(), name=STORED(), eid=STORED())
        index = create_in(configuration.INDEX, schema)
        writer = BufferedWriter(index, period=60, limit=200)
        ingest_languages(writer)
        ingest_tags(tags_corpus)
        ingest_entities(tags_corpus, writer)
        print('INFO: optimize and close index')
        writer.close()
        index.optimize()
        index.close()
        print('INFO: write ingestion lock')
        with open(configuration.INGESTION_LOCK, 'w') as stream:
            stream.write('ingested')
Example #16
def populateWhooshPeliculas():

    schemPeliculas = Schema(idPelicula=NUMERIC(stored=True),
                            titulo=TEXT(stored=True),
                            portada=STORED(),
                            sinopsis=TEXT(stored=True),
                            linkPelicula=TEXT(stored=True),
                            duracion=TEXT(stored=True),
                            actores=KEYWORD(stored=True, commas=True),
                            personal=KEYWORD(stored=True, commas=True),
                            genero=KEYWORD(stored=True, commas=True))

    if os.path.exists("Index"):
        shutil.rmtree("Index")
    os.mkdir("Index")

    ix = create_in("Index", schema=schemPeliculas)
    writer = ix.writer()
    listaPeliculas = extraer_datos_peliculas()
    numPeliculas = 1
    for pelicula in listaPeliculas:
        writer.update_document(idPelicula=numPeliculas,
                               titulo=pelicula[0],
                               portada=pelicula[1],
                               sinopsis=pelicula[2],
                               linkPelicula=pelicula[3],
                               duracion=pelicula[4],
                               actores=soloNombres(pelicula[5]),
                               personal=soloNombres(pelicula[6]),
                               genero=pelicula[7])
        numPeliculas += 1
    writer.commit()

    return numPeliculas - 1
Example #17
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()

# FILE INDEX SCHEMA DEFINITION
FILE_INDEX_NAME = 'FILE_INDEX'
FILE_SCHEMA = Schema(
    fileid=ID(unique=True),  # Path
    repository=ID(stored=True),
    repository_id=NUMERIC(unique=True, stored=True),  # Numeric id of repo
    repo_name=TEXT(stored=True),
    owner=TEXT(),
    path=TEXT(stored=True),
    content=FieldType(format=Characters(),
                      analyzer=ANALYZER,
                      scorable=True,
                      stored=True),
    modtime=STORED(),
    md5=STORED(),
    extension=ID(stored=True),
    commit_id=TEXT(stored=True),
    size=NUMERIC(stored=True),
    mimetype=TEXT(stored=True),
    lines=NUMERIC(stored=True),
)

# COMMIT INDEX SCHEMA
COMMIT_INDEX_NAME = 'COMMIT_INDEX'
COMMIT_SCHEMA = Schema(
    commit_id=ID(unique=True, stored=True),
    repository=ID(unique=True, stored=True),
    repository_id=NUMERIC(unique=True, stored=True),
    commit_idx=NUMERIC(stored=True, sortable=True),
Example #18
import unicodecsv as csv
from whoosh import index, sorting
from whoosh.analysis import StandardAnalyzer
from whoosh.fields import Schema, STORED, NGRAMWORDS, NUMERIC
from whoosh.qparser import MultifieldParser

_schema = Schema(
    ror=STORED(),
    grid=STORED(),
    name=NGRAMWORDS(stored=False),
    aliases=NGRAMWORDS(stored=False),
    num_students=NUMERIC(int, sortable=True, stored=False),
    citation_score=NUMERIC(int, sortable=True, stored=False),
)

_index_path = 'data/ror-whoosh-index'


def _read_ror_csv_rows():
    rows = []
    with open('data/ror-metrics.csv') as ror_csv:
        reader = csv.DictReader(ror_csv)
        for row in reader:
            row['aliases'] = row['aliases'].split(
                u'###') if row['aliases'] else []
            row['num_students'] = int(
                row['num_students']) if row['num_students'] else None
            row['citation_score'] = float(
                row['citation_score']) if row['citation_score'] else None
            rows.append(row)
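
Building on the imports above, one plausible search helper (the function name and index handle are assumptions, not from the original) queries name and aliases together with MultifieldParser and orders results by the sortable citation_score field, returning only the STORED ror and grid identifiers:

def _search_institutions(ix, text, limit=10):
    parser = MultifieldParser(["name", "aliases"], schema=ix.schema)
    by_citations = sorting.FieldFacet("citation_score", reverse=True)
    with ix.searcher() as searcher:
        hits = searcher.search(parser.parse(text), limit=limit,
                               sortedby=by_citations)
        # only STORED fields come back with each hit
        return [(hit["ror"], hit["grid"]) for hit in hits]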
Example #19
jieba.dt.tmp_dir = os.path.dirname(os.path.abspath(__file__))

# Use the jieba Chinese tokenizer
analyzer = ChineseAnalyzer()

# Create the schema. Fields defined here can be searched; STORED-only fields
#    cannot be searched and only appear in the search results.
# stored=True means the field shows up in search results. If `content` is large,
#    it can be set to stored=False to save space, and the full text can then be
#    looked up in the database via the stored id.
schema = Schema(
    title=TEXT(stored=True, analyzer=analyzer, vector=True, phrase=True),
    path=ID(stored=True),
    # Searchable, but the content itself is not returned with search results
    content=TEXT(stored=False, analyzer=analyzer),
    id=STORED()
    )

# Store the index (and its schema) under the 'indexdir' directory
indexdir = 'indexdir/'
if not os.path.exists(indexdir):
    os.mkdir(indexdir)
ix = create_in(indexdir, schema)

# Add the documents to be indexed, following the schema definition
# Note: strings must be Unicode
writer = ix.writer()
writer.add_document(title=u'第一篇文档', path=u'www.baidu.com', id=u'1',
                    content=u'这是我们增加的第一篇文脏,又名文档')
writer.add_document(title=u'第二篇文档', path=u'www.google.com', id=u'2',
                    content=u'这是我们增加的第二篇文档, very interesting')
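
A hedged continuation of the snippet (not in the original): after committing, a search against the indexed content field returns hits that expose only the stored title, path and id, which is the stored-versus-searchable distinction the comments above describe.

from whoosh.qparser import QueryParser

writer.commit()

with ix.searcher() as searcher:
    parser = QueryParser('content', ix.schema)
    results = searcher.search(parser.parse(u'文档'))
    for hit in results:
        # content was indexed with stored=False, so it is not part of the hit
        print(hit['title'], hit['path'], hit['id'])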