Exemple #1
0
def search(request):
    """Run a full-text Whoosh search for the ``q`` GET parameter.

    Returns a dict with the (repr'd) hits and the total hit count, for
    rendering by the caller.  Raises ``KeyError`` when ``q`` is absent
    from the query string (unchanged behaviour).
    """
    raw_query = request.GET['q']

    # Local imports keep whoosh off the import path for non-search requests.
    # (Unused imports of create_in / sqlalchemy / Paste were removed along
    # with the dead commented-out code that referenced them.)
    from whoosh.index import open_dir
    from whoosh.fields import ID, DATETIME, TEXT, Schema
    from whoosh.qparser import QueryParser

    # Must stay in sync with the schema used when the index was built.
    schema = Schema(
        id=ID(stored=True),
        type=ID(stored=True),
        creator_id=ID(stored=True),
        timestamp=DATETIME(),
        # TODO what about stuff with multiple contents
        # TODO what about pastebin which should really use a source-code analyzer
        content=TEXT(),
    )

    # TODO cannot guarantee this dir exists
    ix = open_dir(request.registry.settings['spline.search.whoosh.path'])

    query_parser = QueryParser('content', schema=schema)
    whoosh_query = query_parser.parse(raw_query)

    with ix.searcher() as searcher:
        results = searcher.search(whoosh_query, limit=10)
        num_results = len(results)
        # Materialize before the searcher context closes; hits are lazy.
        results = [repr(res) for res in results]

    return dict(
        whoosh_results=results,
        whoosh_results_count=num_results,
    )
Exemple #2
0
 def __init__(self, mode="normal"):
     """Prepare the indexer: DB handler, spell checker, analyzer and schema.

     ``mode`` is forwarded to ``__determine_analyzer`` to pick the text
     analyzer (presumably "normal" selects the StandardAnalyzer described
     below — confirm in __determine_analyzer).
     """
     self.db_handler = DbHandler()
     # On-disk location of the Whoosh index.
     self.index_path = "index"
     # Index and writer are populated later by __reload_index/__create_index.
     self.ix = None
     self.writer = None
     self.spell = Spell.get_instance()
     """
     By default, the StandardAnalyzer() is used. This analyzer is composed of a RegexTokenizer with a LowercaseFilter
     and an optional StopFilter (for removing stopwords)
     """
     self.analyzer = self.__determine_analyzer(mode)
     """
     The whoosh.fields.TEXT indexes the text and stores the term positions to allow phrase searching
     TEXT fields use StandardAnalyzer by default. 
     To specify a different analyzer, use the analyzer keyword argument to the constructor, 
     e.g. TEXT(analyzer=analysis.StemmingAnalyzer())
     """
     # Read the Vectors section in http://whoosh.readthedocs.io/en/latest/schema.html
     self.schema = Schema(
         doc_id=ID(stored=True),
         title=TEXT(stored=True),
         authors=TEXT(stored=True),
         pub_date=DATETIME(stored=True),
         abstract=TEXT(stored=True),
         # vector=True keeps per-document term vectors (see link above).
         content=TEXT(vector=True, analyzer=self.analyzer),
         pdf_name=STORED,
     )
     """
     To test whether a directory currently contains a valid index, use index.exists_in:
     """
     exists = exists_in(self.index_path)
     if exists:
         print("Index already exists")
         # A valid index exists, reload the index
         self.__reload_index()
     else:
         print("Index does not yet exist")
         # No valid index found, remove and recreate index
         rmtree(self.index_path, ignore_errors=True)
         self.__create_index()
def video_rebuild():
    # Rebuild the entire 'video' Whoosh index from the MySQL table
    # short_media_info_v2 (rows with status 1 or 2 only).
    # NOTE: this is Python 2 code (print statements, str.decode calls).
    print datetime.datetime.now()
    print 'video_rebuild'
    video_db = mysql_new.BaseDB(config.MYSQL_DEFINE_VIDEO)
    # Titles get a Chinese analyzer; pinyin_title additionally allows
    # romanized (pinyin) lookups of the same title.
    schema = Schema(movieid=ID(stored=True, unique=True),
                    gcid=ID(stored=True),
                    title=TEXT(stored=True, analyzer=analyzer_zhongwen),
                    pinyin_title=TEXT(stored=True, analyzer=analyzer_pinyin),
                    pic=ID(stored=True),
                    cover_width=STORED,
                    cover_height=STORED,
                    uid=ID(stored=True),
                    upline_time=DATETIME(stored=True, sortable=True),
                    duration=STORED)

    SQL = '''SELECT video_id as `movieid`,`duration`,`upline_time`, `title`, `uid`, `pic`,`gcid`, `poster_width` as cover_width, `poster_height` as cover_height FROM `short_media_info_v2` WHERE `status` in (1,2)
          '''
    res = video_db.query(SQL, ())
    if not res:
        # Nothing to index; leave any existing index untouched.
        return
    index_path = os.path.join(config.index_root_dir, 'video')
    if not os.path.exists(index_path):
        os.mkdir(index_path)
    # create_in (re)initialises the index files; the returned handle is
    # immediately replaced by one reopened via FileStorage.
    ix = create_in(index_path, schema=schema)
    storage = FileStorage(index_path)
    ix = storage.open_index()
    writer = ix.writer()
    for info in res:
        # Romanize the title word-by-word for the pinyin field.
        pinyin_title = ' '.join(lazy_pinyin(info.get('title').decode('utf8')))
        writer.add_document(movieid=str(info.get('movieid')).decode('utf8'),
                            gcid=str(info.get('gcid')).decode('utf8'),
                            title=info.get('title').decode('utf8'),
                            pinyin_title=pinyin_title,
                            uid=str(info.get('uid')).decode('utf8'),
                            pic=info.get('pic').decode('utf8'),
                            cover_width=info.get('cover_width'),
                            cover_height=info.get('cover_height'),
                            duration=info.get('duration'),
                            upline_time=info.get('upline_time'))
    # CLEAR drops all previously indexed segments, so this commit replaces
    # the old index contents entirely.
    writer.commit(mergetype=writing.CLEAR)
Exemple #4
0
def almacenar_datos():
    """Scrape players and (re)build the Whoosh index under ``Index/``."""
    # Schema describing one player document.
    schem = Schema(nombre=TEXT(stored=True),
                   edad=NUMERIC(stored=True),
                   altura=NUMERIC(stored=True),
                   nacionalidad=KEYWORD(stored=True, commas=True),
                   pie=TEXT(stored=True),
                   posicion_principal=KEYWORD(stored=True, commas=True),
                   posicion_secundaria=KEYWORD(stored=True, commas=True),
                   valor=NUMERIC(stored=True),
                   equipo=TEXT(stored=True),
                   contrato=DATETIME(stored=True))

    # Remove any previous index directory so we always start from scratch.
    if os.path.exists("Index"):
        shutil.rmtree("Index")
    os.mkdir("Index")

    # Create the index and a writer to add documents to it.
    ix = create_in("Index", schema=schem)
    writer = ix.writer()
    lista = almacenar_datos_bs()
    for jugador in lista:
        # One document per scraped player row.
        writer.add_document(nombre=str(jugador[0]),
                            edad=jugador[1],
                            altura=float(jugador[2]),
                            nacionalidad=str(jugador[3]),
                            pie=str(jugador[4]),
                            posicion_principal=str(jugador[5]),
                            posicion_secundaria=str(jugador[6]),
                            valor=float(jugador[7]),
                            equipo=str(jugador[8]),
                            contrato=jugador[9])
    writer.commit()
    # Every row of `lista` was indexed, so the count is simply its length
    # (replaces the redundant manual `i` counter).
    print("Se han indexado " + str(len(lista)) + " jugadores")
Exemple #5
0
def init_extensions(app):
    """Wire up Flask extensions, flask-admin views and the Whoosh index."""
    whoosh_searcher.init_app(app)
    configure_uploads(app, upload_photos)
    mail.init_app(app)
    admin.init_app(app)
    mongo.init_app(app, "MONGO")
    oauth.init_app(app)
    login_manager.init_app(app)
    # Cache is opt-in via config.
    if app.config.get('USE_CACHE', False):
        cache.init_app(app, {})

    with app.app_context():
        # Register the flask-admin model views (labels are in Chinese).
        admin.add_view(admin_view.UsersModelView(mongo.db['users'], '用户管理'))
        admin.add_view(
            admin_view.CatalogsModelView(mongo.db['catalogs'], '栏目管理'))
        admin.add_view(admin_view.PostsModelView(mongo.db['posts'], '帖子管理'))
        admin.add_view(
            admin_view.PassagewaysModelView(mongo.db['passageways'], '温馨通道'))
        admin.add_view(
            admin_view.FriendLinksModelView(mongo.db['friend_links'], '友链管理'))
        admin.add_view(admin_view.PagesModelView(mongo.db['pages'], '页面管理'))
        admin.add_view(
            admin_view.FooterLinksModelView(mongo.db['footer_links'], '底部链接'))
        admin.add_view(admin_view.AdsModelView(mongo.db['ads'], '广告管理'))
        admin.add_view(admin_view.OptionsModelView(mongo.db['options'],
                                                   '系统设置'))

        # Initialize the Whoosh full-text index for posts.
        chinese_analyzer = ChineseAnalyzer()
        post_schema = Schema(obj_id=ID(unique=True, stored=True),
                             title=TEXT(stored=True,
                                        analyzer=chinese_analyzer),
                             content=TEXT(stored=True,
                                          analyzer=chinese_analyzer),
                             create_at=DATETIME(stored=True),
                             catalog_id=ID(stored=True),
                             user_id=ID(stored=True))
        whoosh_searcher.add_index('posts', post_schema)
Exemple #6
0
    def fields_map(self, field_type):
        """Translate a model field type into the matching Whoosh field.

        ``field_type`` may be the string "primary", another type-name
        string, or one of the ``types`` classes; unknown strings fall
        back to ``types.Text``.
        """
        # The primary key is always a stored, unique ID field.
        if field_type == "primary":
            return ID(stored=True, unique=True)

        aliases = {
            'date': types.Date,
            'datetime': types.DateTime,
            'boolean': types.Boolean,
            'integer': types.Integer,
            'float': types.Float,
        }
        if isinstance(field_type, str):
            field_type = aliases.get(field_type, types.Text)

        if field_type in (types.DateTime, types.Date):
            return DATETIME(stored=True, sortable=True)
        if field_type == types.Integer:
            return NUMERIC(stored=True, numtype=int)
        if field_type == types.Float:
            return NUMERIC(stored=True, numtype=float)
        if field_type == types.Boolean:
            return BOOLEAN(stored=True)
        # Everything else is indexed as analyzed, unsortable text.
        return TEXT(stored=True, analyzer=self.analyzer, sortable=False)
Exemple #7
0
class Schema(SchemaClass):
    """Whoosh schema for indexing job postings."""

    #: The id of the job.
    id = ID(stored=True, unique=True)

    #: The title of the job.
    title = TEXT(analyzer=stemming_analyzer)

    #: The name of the company.
    company = TEXT(analyzer=stemming_analyzer)

    #: Location as a comma-separated string of city and country.
    location = KEYWORD(lowercase=True, scorable=True, commas=True)

    #: The type of job.
    job_type = TEXT(analyzer=stemming_analyzer)

    #: The job tags as a comma-separated string of tag slugs.
    tags = KEYWORD(lowercase=True, scorable=True, commas=True)

    #: When was this job created?
    created = DATETIME(sortable=True)
Exemple #8
0
    def handle(self, *args, **kwargs):
        """Creates the index iterating over all the pages of the site."""
        schema = Schema(pk=NUMERIC(unique=True, stored=True),
                        title=TEXT,
                        summary=TEXT,
                        tags=KEYWORD(commas=True, scorable=True),
                        pub_date=DATETIME(sortable=True))

        if not os.path.exists(settings.INDEX):
            os.mkdir(settings.INDEX)

        ix = create_in(settings.INDEX, schema)
        writer = ix.writer()
        # Renamed the loop variable: the original shadowed the `object`
        # builtin; the map/lambda pair is now a list comprehension.
        for page in Page.objects.all():
            tag_names = [tag.title for tag in page.tags.all()]
            writer.add_document(title=page.title,
                                summary=page.summary,
                                tags=",".join(tag_names),
                                pk=page.pk,
                                pub_date=page.pub_date)
        writer.commit()
Exemple #9
0
def get_schema():
    """Return the Whoosh schema for forum posts and their authors."""
    stem = StemmingAnalyzer(stoplist=STOP)
    fields = dict(
        title=TEXT(stored=True, analyzer=stem, sortable=True),
        url=ID(stored=True),
        content_length=NUMERIC(stored=True, sortable=True),
        thread_votecount=NUMERIC(stored=True, sortable=True),
        vote_count=NUMERIC(stored=True, sortable=True),
        content=TEXT(stored=True, analyzer=stem, sortable=True),
        tags=KEYWORD(stored=True, commas=True),
        is_toplevel=BOOLEAN(stored=True),
        lastedit_date=DATETIME(stored=True, sortable=True),
        rank=NUMERIC(stored=True, sortable=True),
        author=TEXT(stored=True),
        author_score=NUMERIC(stored=True, sortable=True),
        author_handle=TEXT(stored=True),
        author_email=TEXT(stored=True),
        author_uid=ID(stored=True),
        author_url=ID(stored=True),
        uid=ID(stored=True),
        type=NUMERIC(stored=True, sortable=True),
        type_display=TEXT(stored=True),
    )
    return Schema(**fields)
Exemple #10
0
 def build_schema(self, fields):
     """Build a Whoosh schema from Haystack field definitions.

     Returns ``(content_field_name, Schema)`` where ``content_field_name``
     is the field flagged as the document field.  Raises
     ``SearchBackendError`` when no searchable fields were declared.
     """
     # These three fields are always present; Haystack relies on them.
     schema_fields = {
         'id': ID(stored=True, unique=True),
         'django_ct': ID(stored=True),
         'django_id': ID(stored=True),
     }
     # Grab the number of keys that are hard-coded into Haystack.
     # We'll use this to (possibly) fail slightly more gracefully later.
     initial_key_count = len(schema_fields)
     content_field_name = ''
     
     for field_name, field_class in fields.items():
         if field_class.is_multivalued:
             if field_class.indexed is False:
                 schema_fields[field_class.index_fieldname] = IDLIST(stored=True)
             else:
                 schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True)
         elif field_class.field_type in ['date', 'datetime']:
             schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored)
         # NOTE(review): `type=` is the legacy Whoosh 1.x keyword for
         # NUMERIC; current Whoosh spells it `numtype=`.  Confirm which
         # Whoosh version this backend targets before upgrading.
         elif field_class.field_type == 'integer':
             schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=int)
         elif field_class.field_type == 'float':
             schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=float)
         elif field_class.field_type == 'boolean':
             schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
         else:
             schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer())
         
         # Remember which field is the main document body.
         if field_class.document is True:
             content_field_name = field_class.index_fieldname
     
     # Fail more gracefully than relying on the backend to die if no fields
     # are found.
     if len(schema_fields) <= initial_key_count:
         raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")
     
     return (content_field_name, Schema(**schema_fields))
Exemple #11
0
# collection index only has one element
news = db['news']

# create directory for index if not exists
if not os.path.exists("news"):
    os.makedirs("news")

# do query for new news
news_index = None
if not exists_in("news"):
    # First run: build the schema, create a fresh index and fetch all docs.
    # spelling=True keeps the extra data needed for "did you mean" lookups.
    schema = Schema(id=ID(unique=True, stored=True),
                    d=TEXT(spelling=True, stored=True),
                    t=TEXT(spelling=True, stored=True),
                    tags=TEXT(stored=True),
                    time=DATETIME(stored=True),
                    link=TEXT(stored=True))
    news_index = create_in("news", schema)
    result = news.find()
else:
    # Incremental run: only fetch documents newer than the saved last_id.
    news_index = open_dir("news")
    result = news.find({"_id": {"$gt": ObjectId(last_id["last_id"])}})

news_writer = news_index.writer()

# do query for new entities
# NOTE(review): assumes the "entities" index already exists on disk.
entities_index = open_dir("entities")

# index each entry here
last_post = None
i = 0
Exemple #12
0
import os.path
from datetime import datetime
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID, DATETIME

# One document per bookmark; `hash` is the unique key used for dedup.
schema = Schema(title=TEXT(stored=True),
                url=TEXT(stored=True),
                date=DATETIME(stored=True),
                content=TEXT,
                hash=ID(stored=True, unique=True))

if not os.path.exists("index"):
    os.mkdir("index")

# create_in (re)initialises the index, discarding any previous contents.
ix = create_in("index", schema)

import re
import requests
from bs4 import BeautifulSoup as bs4
# Parse the exported Safari bookmarks file and collect http(s) links.
with open('Safari Bookmarks.html') as f:
    r = f.read()

h = bs4(r, "lxml")
links = h.find_all('a', href=re.compile('^http'))
print(len(links))

writer = ix.writer()
for i, link in enumerate(links):
    try:
        url = link.get('href')
        print(i, url)
def get_schema():
    """Schema for events: title, start/end dates, description, categories."""
    return Schema(
        titulo=TEXT(stored=True),
        fecha_inicio=DATETIME(stored=True),
        fecha_fin=DATETIME(stored=True),
        descripcion=TEXT(stored=True),
        categorias=KEYWORD(stored=True, commas=True, scorable=True),
    )
Exemple #14
0
def get_schema():
    """Schema for films: titles, release date, director, cast, synopsis."""
    fields = {
        'titulo': TEXT(stored=True),
        'tituloOriginal': TEXT(stored=True),
        'fechaEstreno': DATETIME(stored=True),
        'director': TEXT(stored=True),
        'reparto': TEXT,
        'sinopsis': TEXT,
    }
    return Schema(**fields)
Exemple #15
0
def init_extensions(app):
    """
    Initialize Flask extensions, flask-admin views and the Whoosh index.
    :param app: the Flask application instance
    :return: None
    """
    global use_cache
    whoosh_searcher.init_app(app)
    configure_uploads(app, upload_img)
    mail.init_app(app)
    admin.init_app(app)
    mongo.init_app(app, 'MONGO')
    oauth.init_app(app)
    login_manager.init_app(app)
    # use_cache = app.config.get('USE_CHCHE', False)
    # if use_cache:
    #     cache.init_app(app)

    with app.app_context():
        # Register the flask-admin model views (labels/categories in Chinese).
        admin.add_view(admin_view.UsersModelView(mongo.db['users'], '用户管理'))
        admin.add_view(admin_view.RolesModelView(mongo.db['roles'], '角色管理'))
        admin.add_view(
            admin_view.CatalogsModelView(mongo.db['catalogs'],
                                         '栏目管理',
                                         category='内容管理'))
        admin.add_view(
            admin_view.PostsModelView(mongo.db['posts'],
                                      '帖子管理',
                                      category='内容管理'))
        admin.add_view(
            admin_view.PassagewaysModelView(mongo.db['passageways'],
                                            '温馨通道',
                                            category='推广管理'))
        admin.add_view(
            admin_view.FriendLinksModelView(mongo.db['friend_links'],
                                            '友链管理',
                                            category='推广管理'))
        admin.add_view(
            admin_view.PagesModelView(mongo.db['pages'],
                                      '页面管理',
                                      category='推广管理'))
        admin.add_view(
            admin_view.FooterLinksModelView(mongo.db['footer_links'],
                                            '底部链接',
                                            category='推广管理'))
        admin.add_view(
            admin_view.AdsModelView(mongo.db['ads'], '广告管理', category='推广管理'))
        admin.add_view(admin_view.OptionsModelView(mongo.db['options'],
                                                   '系统管理'))

        # Initialize the Whoosh full-text index for posts.
        chinese_analyzer = ChineseAnalyzer()
        post_schema = Schema(obj_id=ID(unique=True, sortable=True),
                             title=TEXT(sortable=True,
                                        analyzer=chinese_analyzer),
                             content=TEXT(sortable=True,
                                          analyzer=chinese_analyzer),
                             create_at=DATETIME(sortable=True),
                             catalog_id=ID(sortable=True),
                             user_id=ID(sortable=True))
        whoosh_searcher.add_index('posts', post_schema)
Exemple #16
0
    message=FieldType(format=Characters(),
                      analyzer=ANALYZER,
                      scorable=True,
                      stored=True),
    parents=TEXT(),
    added=TEXT(),
    removed=TEXT(),
    changed=TEXT(),
)

# Name of the changeset index on disk.
CHGSET_IDX_NAME = 'CHGSET_INDEX'

# used only to generate queries in journal
JOURNAL_SCHEMA = Schema(
    username=TEXT(),
    date=DATETIME(),
    action=TEXT(),
    repository=TEXT(),
    ip=TEXT(),
)


class WhooshResultWrapper(object):
    """Wraps raw Whoosh search results for presentation-layer consumption."""

    def __init__(self, search_type, searcher, matcher, highlight_items,
                 repo_location):
        self.search_type = search_type
        self.searcher = searcher
        self.matcher = matcher
        self.highlight_items = highlight_items
        # Max number of characters shown per highlighted fragment.
        self.fragment_size = 200
        self.repo_location = repo_location
Exemple #17
0
def get_schema():
    """Schema for anime entries: titles, dates, episodes, synopsis, genres."""
    fields = {
        'titulo': TEXT(stored=True),
        'imagen': TEXT(stored=True),
        'rango_web': TEXT(stored=True),
        'popularidad': TEXT(stored=True),
        'fecha_inicio': DATETIME(stored=True),
        'fecha_final': DATETIME(stored=True),
        'episodios': TEXT(stored=True),
        'sinopsis': TEXT(stored=True),
        'generos': KEYWORD(stored=True),
    }
    return Schema(**fields)
Exemple #18
0
def get_schema():
    """Schema for news items: title, date, link, summary and source file."""
    return Schema(
        titulo=TEXT(stored=True),
        fecha=DATETIME(stored=True),
        enlace=TEXT(stored=True),
        resumen=TEXT(stored=True),
        nombrefichero=ID(stored=True),
    )
Exemple #19
0
def schemaChampions():
    """Schema for champions: numeric id, name, image and release date."""
    return Schema(
        idChampion=NUMERIC(stored=True),
        name=TEXT(stored=True),
        image=TEXT(stored=True),
        releaseDate=DATETIME(stored=True),
    )
Exemple #20
0
def get_response_schema():
    """Schema for forum replies: thread link, date, body text and author."""
    fields = dict(
        link_tema=ID(stored=True),
        fecha=DATETIME(stored=True),
        texto=TEXT(stored=True),
        autor=TEXT(stored=True),
    )
    return Schema(**fields)
Exemple #21
0
def get_schema():
    """Schema for e-mails: sender, recipients, date, subject, body, file."""
    return Schema(
        remitente=TEXT(stored=True),
        destinatarios=TEXT(stored=True),
        fecha=DATETIME(stored=True),
        asunto=TEXT(stored=True),
        contenido=TEXT(stored=True),
        nombrefichero=ID(stored=True),
    )
def get_schema():
    """Schema for catalogued items: title, description, category and date."""
    # PEP 8 fix: no spaces around '=' in keyword arguments and no space
    # before the call parentheses (was `description = TEXT (stored=True)`).
    return Schema(titulo=TEXT(stored=True),
                  description=TEXT(stored=True),
                  categoria=TEXT(stored=True),
                  fecha=DATETIME(stored=True))
Exemple #23
0
def get_schema_correo():
    """Schema for mail messages; recipients are stored as KEYWORD."""
    fields = {
        'remitente': TEXT(stored=True),
        'destinatarios': KEYWORD(stored=True),
        'fecha': DATETIME(stored=True),
        'asunto': TEXT(stored=True),
        'contenido': TEXT(stored=True),
    }
    return Schema(**fields)
Exemple #24
0
class SearchIndexer:
    """Full-text search indexer."""

    # schema for searches of all (public + private) info
    SCHEMA = Schema(
        type=ID(stored=True),
        handle=ID(stored=True, unique=True),
        private=BOOLEAN(stored=True),
        text=TEXT(),
        text_private=TEXT(),
        changed=DATETIME(),
    )

    # schema for searches of public info only
    SCHEMA_PUBLIC = Schema(
        type=ID(stored=True),
        handle=ID(stored=True, unique=True),
        private=BOOLEAN(stored=True),
        text=TEXT(),
        changed=DATETIME(),
    )

    def __init__(self, index_dir: FilenameOrPath):
        """Initialize given an index dir path.

        Fix: ``FilenameOrPath`` was previously used as a *default value*
        (``index_dir=FilenameOrPath``), so calling without an argument
        handed the type object itself to ``Path``; it is now a type
        annotation, as clearly intended.
        """
        self.index_dir = Path(index_dir)
        self.index_dir.mkdir(exist_ok=True)
        # query parser for all (public + private) content
        self.query_parser_all = MultifieldParser(["text", "text_private"],
                                                 schema=self.SCHEMA)
        # query parser for public content only
        self.query_parser_public = QueryParser("text",
                                               schema=self.SCHEMA_PUBLIC)

    def index(self, overwrite=False):
        """Return the index; create if doesn't exist."""
        index_dir = str(self.index_dir)
        if overwrite or not index.exists_in(index_dir):
            return index.create_in(index_dir, self.SCHEMA)
        return index.open_dir(index_dir)

    def reindex_full(self, db_handle: DbReadBase):
        """Reindex the whole database.

        NOTE(review): the ``private`` schema field is never set here —
        confirm whether private filtering relies on it being written.
        """
        with self.index(overwrite=True).writer() as writer:
            for obj_dict in iter_obj_strings(db_handle):
                writer.add_document(
                    type=obj_dict["class_name"].lower(),
                    handle=obj_dict["handle"],
                    text=obj_dict["string"],
                    text_private=obj_dict["string_private"],
                    changed=obj_dict["changed"],
                )

    @staticmethod
    def format_hit(hit: Hit) -> Dict[str, Any]:
        """Format a search hit."""
        return {
            "handle": hit["handle"],
            "object_type": hit["type"],
            "rank": hit.rank,
            "score": hit.score,
        }

    def search(
        self,
        query: str,
        page: int,
        pagesize: int,
        include_private: bool = True,
        extend: bool = False,
    ):
        """Search the index.

        If `include_private` is true, include also private objects and
        search in private fields.
        """
        query_parser = (self.query_parser_all
                        if include_private else self.query_parser_public)
        # Allow natural-language dates like "changed:yesterday".
        query_parser.add_plugin(DateParserPlugin())
        # if private objects should not be shown, add a mask
        mask = None if include_private else Term("private", True)
        parsed_query = query_parser.parse(query)
        with self.index().searcher() as searcher:
            results = searcher.search_page(parsed_query,
                                           page,
                                           pagesize,
                                           mask=mask)
            return results.total, [self.format_hit(hit) for hit in results]
Exemple #25
0
def get_schema():
    """Schema for events with start/end dates and free-text categories."""
    return Schema(
        titulo=TEXT(stored=True),
        fechaInicio=DATETIME(stored=True),
        fechaFin=DATETIME(stored=True),
        descripcion=TEXT(stored=True),
        categorias=TEXT(stored=True),
    )
Exemple #26
0
    # hierarchical index of ids path ('/' is the separator)
    parent_ids = FieldType(format=Existence(),
                           analyzer=PathTokenizer(),
                           stored=True,
                           unique=False)

    # accent_folder presumably strips diacritics so searches match
    # unaccented input — confirm in the analyzer's definition.
    name = TEXT(stored=True, analyzer=accent_folder)
    slug = ID(stored=True)
    description = TEXT(stored=True, analyzer=accent_folder)
    # Full-text body: searchable but not stored in the index.
    text = TEXT(stored=False, analyzer=accent_folder)


# Dynamic (glob) fields added to every default schema instance:
#   '*_prefix' -> edge-ngram field for prefix/autocomplete matching
#   '*_at'     -> stored, sortable timestamps
_default_dyn_fields = {
    '*_prefix': EdgeNgramField(),
    '*_at': DATETIME(stored=True, sortable=True),
}


def DefaultSearchSchema(*args, **kwargs):
    """Build the default search schema, attaching the dynamic glob fields."""
    result = _DefaultSearchSchema()
    for glob_pattern, field_obj in _default_dyn_fields.items():
        result.add(glob_pattern, field_obj, glob=True)
    return result


def indexable_role(principal):
    """
  Returns a string suitable for query against `allowed_roles_and_users`
  field.
Exemple #27
0
                t.startchar = start_char + match.start()
                t.endchar = start_char + match.end()
            yield t


def StanfordAnalyzer(lowercase=False):
    """Build a Whoosh analyzer chain around the Stanford tokenizer.

    When ``lowercase`` is true, a LowercaseFilter is appended to the chain.
    """
    chain = StanfordTokenizer()
    if lowercase:
        chain = chain | LowercaseFilter()
    return chain


# Schema for the TIME corpus indexed under ../TIMEindex below.
# NOTE(review): `year` is typed DATETIME — presumably it holds a full issue
# date rather than a bare year; confirm against the indexing loop.
schema = Schema(id=ID(stored=True),
                path=ID(stored=True),
                body=TEXT(analyzer=StanfordAnalyzer()),
                year=DATETIME(stored=True),
                tags=KEYWORD(stored=True),
                names=KEYWORD(stored=True))

if __name__ == '__main__':

    if not os.path.exists('../TIMEindex'):
        os.mkdir('../TIMEindex')

    ix = index.create_in('../TIMEindex', schema=schema, indexname="TIME")
    ix = index.open_dir('../TIMEindex', indexname="TIME")
    writer = ix.writer()

    for decade in os.listdir('../rich_texts_txt'):
        if decade.startswith('.'): continue
        path = os.path.join('../rich_texts_txt', decade)
Exemple #28
0
def get_schema():
    """Schema for events: title, date range, description and category."""
    fields = dict(
        title=TEXT(stored=True),
        date_start=DATETIME(stored=True),
        date_end=DATETIME(stored=True),
        description=TEXT(stored=True),
        categoria=KEYWORD(stored=True),
    )
    return Schema(**fields)
Exemple #29
0
    def build_schema(self, fields):
        """Build a Whoosh schema from Haystack field definitions.

        Returns ``(content_field_name, Schema)``; raises
        ``SearchBackendError`` when no searchable fields are declared.
        """
        # These three fields are always present; Haystack relies on them.
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored, sortable=True)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=int,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=float,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at='start',
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            else:
                # Default: analyzed Chinese text, sortable.
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True,
                    analyzer=ChineseAnalyzer(),
                    field_boost=field_class.boost,
                    sortable=True)

            # The document field gets spelling data for suggestions.
            # NOTE(review): mutating `.spelling` after construction is
            # whoosh-version sensitive — confirm it still takes effect.
            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))
Exemple #30
0
from whoosh.fields import Schema, TEXT, ID, DATETIME, NUMERIC
from whoosh.qparser import QueryParser
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from tqdm import tqdm
from ..scraper.model import Message
from .cleaning import clean_message

# Setup Index
# One document per mailing-list message, including its position within
# its thread (parent, index, indent) and the page it was scraped from.
schema = Schema(
    list_id=ID(stored=True),
    message_id=ID(stored=True),
    content=TEXT(stored=True),
    author=TEXT(stored=True),
    subject=TEXT(stored=True),
    sent_at=DATETIME(stored=True),
    thread_parent=NUMERIC(stored=True),
    thread_idx=NUMERIC(stored=True),
    thread_indent=NUMERIC(stored=True),
    page=TEXT(stored=True),
)


def open_index(index_dir):
    """Open the Whoosh index at *index_dir*, creating it if needed.

    Fix: the original used "directory exists" as a proxy for "index
    exists", so an existing-but-empty directory made ``open_dir`` fail.
    ``index.exists_in`` checks for actual index files instead, and
    ``os.makedirs(..., exist_ok=True)`` also handles nested paths.
    """
    if index.exists_in(index_dir):
        return index.open_dir(index_dir)
    os.makedirs(index_dir, exist_ok=True)
    return index.create_in(index_dir, schema)