Example 1
 def get_schema(self):
     return Schema(
         title=TEXT(stored=True, field_boost=1.5),
         name=ID(stored=True),
         path=ID(stored=True),
         content=TEXT(stored=True, analyzer=StemmingAnalyzer()),
         keywords=KEYWORD(stored=True, scorable=True, commas=True),
     )
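
For context, a minimal sketch (not from the original source) of how a schema like the one above is typically wired into Whoosh; the directory name, document values, and query string are placeholders:

import os
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import Schema, TEXT, ID, KEYWORD
from whoosh.index import create_in
from whoosh.qparser import QueryParser

schema = Schema(
    title=TEXT(stored=True, field_boost=1.5),
    name=ID(stored=True),
    path=ID(stored=True),
    content=TEXT(stored=True, analyzer=StemmingAnalyzer()),
    keywords=KEYWORD(stored=True, scorable=True, commas=True),
)

if not os.path.exists("indexdir"):
    os.mkdir("indexdir")  # create_in requires an existing directory
ix = create_in("indexdir", schema)

with ix.writer() as writer:  # the context manager commits on exit
    writer.add_document(title="Hello", path="/hello.md",
                        content="Stemming matches running and runs.")

with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("run")
    for hit in searcher.search(query):
        print(hit["title"], hit["path"])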
Example 2
class TMSchema(SchemaClass):
    """Fultext index schema for source and context strings."""
    source_language = ID(stored=True)
    target_language = ID(stored=True)
    source = TEXT(stored=True)
    target = STORED()
    origin = ID(stored=True)
    category = NUMERIC(stored=True)
Example 3
def init_extensions(app):
    global use_cache
    whoosh_searcher.init_app(app)
    configure_uploads(app, upload_photos)
    mail.init_app(app)
    admin.init_app(app)
    mongo.init_app(app, "MONGO")
    oauth.init_app(app)
    login_manager.init_app(app)
    # use_cache = app.config.get('USE_CACHE', False)
    # if use_cache:
    #     cache.init_app(app, {})

    with app.app_context():
        # Add the flask-admin views
        admin.add_view(admin_view.RolesModelView(mongo.db['roles'], '角色管理'))
        admin.add_view(admin_view.UsersModelView(mongo.db['users'], '用户管理'))
        admin.add_view(
            admin_view.CatalogsModelView(mongo.db['catalogs'],
                                         '栏目管理',
                                         category='内容管理'))
        admin.add_view(
            admin_view.PostsModelView(mongo.db['posts'],
                                      '帖子管理',
                                      category='内容管理'))
        admin.add_view(
            admin_view.PassagewaysModelView(mongo.db['passageways'],
                                            '温馨通道',
                                            category='推广管理'))
        admin.add_view(
            admin_view.FriendLinksModelView(mongo.db['friend_links'],
                                            '友链管理',
                                            category='推广管理'))
        admin.add_view(
            admin_view.PagesModelView(mongo.db['pages'],
                                      '页面管理',
                                      category='推广管理'))
        admin.add_view(
            admin_view.FooterLinksModelView(mongo.db['footer_links'],
                                            '底部链接',
                                            category='推广管理'))
        admin.add_view(
            admin_view.AdsModelView(mongo.db['ads'], '广告管理', category='推广管理'))
        admin.add_view(admin_view.OptionsModelView(mongo.db['options'],
                                                   '系统设置'))

        # Initialize the Whoosh index
        chinese_analyzer = ChineseAnalyzer()
        post_schema = Schema(obj_id=ID(unique=True, stored=True),
                             title=TEXT(stored=True,
                                        analyzer=chinese_analyzer),
                             content=TEXT(stored=True,
                                          analyzer=chinese_analyzer),
                             create_at=DATETIME(stored=True),
                             catalog_id=ID(stored=True),
                             user_id=ID(stored=True))
        whoosh_searcher.add_index('posts', post_schema)
Example 4
 def schema(self):
     my_analyzer = RegexTokenizer("[a-zA-Z_]+") | LowercaseFilter() | StopFilter()
     schema = Schema(
         h=TEXT(stored=True, analyzer=my_analyzer),
         gnx=ID(stored=True), b=TEXT(analyzer=my_analyzer),
         parent=ID(stored=True),
         doc=ID(stored=True),
     )
     return schema
Example 5
 def _get_generic_schema(self):
     """ Returns whoosh's generic schema of the partition document. """
     schema = Schema(
         vid=ID(stored=True, unique=True),
         dataset_vid=ID(stored=True),  # dataset_vid? Convert if so.
         title=NGRAMWORDS(),
         keywords=KEYWORD,
         doc=TEXT)  # Generated document for the core of the topic search
     return schema
Example 6
 def __init__(self):
     self.file_index = None
     self.schema = Schema(floor_id=ID(stored=True),
                          user_id=ID(stored=True),
                          user_name=ID(stored=True),
                          floor_content=TEXT(stored=True, analyzer=ChineseAnalyzer()),
                          tie_id=ID(stored=True))
     # Initialize file_index right away
     self.open_index()
Example 7
class FileSchema(SchemaClass):
    "remote files"
    path = ID(stored=True)  # without tree_path
    checksum = ID(stored=True)
    size = NUMERIC(bits=64, signed=False, stored=True)
    tree = ID(stored=True)
    tree_path = ID(stored=True)
    mtime = NUMERIC(stored=True)
    pubkey = ID(stored=True)  # only needed for plants
Example 8
    def prepare_indices(self, build_index, path):
        if build_index:
            print("Indexing corpus...")
            schema = None
            if self.lang == "ja":
                schema = Schema(path=ID(stored=True),
                                content=NGRAM(stored=True))
            else:
                ana = analysis.StandardAnalyzer(stoplist=None, minsize=0)
                schema = Schema(path=ID(stored=True),
                                content=TEXT(analyzer=ana))
            index_directory = os.path.dirname(path) + "/tmp/indices/indexdir"
            if not os.path.exists(index_directory):
                os.makedirs(index_directory)
            self.ix = create_in(index_directory, schema)
            with self.ix.writer(limitmb=2048, multisegment=True) as writer:
                i = 0
                for utterance in log_progress(self.utterances):
                    writer.add_document(path=str(i), content=utterance.text)
                    i += 1

            print("Indexing corpus by lemma...")
            if self.lang == "ja":
                schema = Schema(path=ID(stored=True),
                                content=NGRAM(stored=True))
            else:
                ana = analysis.StandardAnalyzer(stoplist=None, minsize=0)
                schema = Schema(path=ID(stored=True),
                                content=TEXT(analyzer=ana))
            lemma_index_directory = os.path.dirname(path) + \
                "/tmp/indices/lemmaindexdir"
            if not os.path.exists(lemma_index_directory):
                os.makedirs(lemma_index_directory)
            self.ix_lemma = create_in(lemma_index_directory, schema)
            with self.ix_lemma.writer(limitmb=2048,
                                      multisegment=True) as writer:
                i = 0
                for utterance in log_progress(self.utterances):
                    lemmas = [token.lemma_ for token in utterance.spacy]
                    writer.add_document(path=str(i), content=" ".join(lemmas))
                    i += 1
        else:
            print("Loading indices...")
            index_directory = os.path.dirname(path) + "/tmp/indices/indexdir"
            if not os.path.exists(index_directory):
                raise IOError('No existing indices! You should build ' +
                              'indices before trying to load them.')
            self.ix = open_dir(index_directory)

            print("Loading lemma indices...")
            index_directory = os.path.dirname(path) + \
                "/tmp/indices/lemmaindexdir"
            if not os.path.exists(index_directory):
                raise IOError('No existing indices! You should build ' +
                              'indices before trying to load them.')
            self.ix_lemma = open_dir(index_directory)
Example 9
def _get_schema():
    analyzer = StemmingAnalyzer() | CharsetFilter(
        accent_map
    )  # WARN: stemming is english specific; character folding is for western languages
    schema = Schema(
        code=ID(unique=True, stored=True),
        slug=ID(unique=False, stored=True),
        title=TEXT(analyzer=analyzer, stored=True),
        content=TEXT(analyzer=analyzer),
    )
    return schema
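
As a quick sanity check, an illustrative sketch (assumed, not from the original source) of what this stemming-plus-accent-folding pipeline does to input text; accent_map is the standard folding table from whoosh.support.charset:

from whoosh.analysis import StemmingAnalyzer, CharsetFilter
from whoosh.support.charset import accent_map

analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)

# Tokens come out lowercased, stemmed, and with accents folded,
# so a query for "cafe" can match a document containing "Café".
print([token.text for token in analyzer("Café running")])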
Example 10
 def test_2_1(self):
     source = Schema(
         checksum=ID(stored=True, unique=True),
         source=TEXT(),
         context=TEXT(),
     )
     target = Schema(
         checksum=ID(stored=True, unique=True),
         target=TEXT(),
     )
     self.do_test(source, target)
Example 11
class NoteSchema(SchemaClass):
    id = ID(stored=True, unique=True)
    created = DATETIME()
    title = TEXT(analyzer=custom_analyzer, spelling=True)
    contents = TEXT(spelling=True)
    public = BOOLEAN()
    draft = BOOLEAN()
    short_code = ID(stored=True, unique=True)
    disable = BOOLEAN()
    reported = BOOLEAN()
    tags = TEXT(analyzer=tag_analyzer, spelling=True)
    user = ID()
Example 12
def get_schema(schema):
    kwargs = {}
    for key, value in schema.items():
        if value == "indexed":
            kwargs[key] = NGRAMWORDS(minsize=2, sortable=True)
        elif value == "id_stored":
            kwargs[key] = ID(stored=True)
        elif value == "unique_id_stored":
            kwargs[key] = ID(unique=True, stored=True)
        elif value == "boolean":
            kwargs[key] = BOOLEAN(stored=True)
    return Schema(**kwargs)
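
A hypothetical call illustrating the mapping above; the field names and type labels in the dict are invented for the example:

schema = get_schema({
    "doc_id": "unique_id_stored",  # ID(unique=True, stored=True)
    "category": "id_stored",       # ID(stored=True)
    "title": "indexed",            # NGRAMWORDS(minsize=2, sortable=True)
    "published": "boolean",        # BOOLEAN(stored=True)
})
print(schema.names())  # field names, sorted alphabetically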
Example 13
    def __init__(self, articles_path):
        """Attempt to initialize a folder with Markdown articles. If a git
        repo, create a search index and populate.

        Markdown Extension References
        * http://facelessuser.github.io/pymdown-extensions
        * https://pythonhosted.org/Markdown/extensions
        """
        self.article_repo = Repo(articles_path)
        self.articles_path = articles_path
        self.markdown_extensions = [
            'markdown.extensions.abbr',
            'markdown.extensions.attr_list',
            'markdown.extensions.def_list',
            'markdown.extensions.fenced_code',
            'markdown.extensions.footnotes',
            'markdown.extensions.tables',
            'markdown.extensions.smart_strong',
            'markdown.extensions.admonition',
            'markdown.extensions.codehilite',
            'markdown.extensions.headerid',
            'markdown.extensions.sane_lists',
            'markdown.extensions.smarty',
            'markdown.extensions.toc',
            'markdown.extensions.wikilinks',
            'pymdownx.betterem',
            'pymdownx.caret',
            'pymdownx.githubemoji',
            'pymdownx.headeranchor',
            'pymdownx.magiclink',
            'pymdownx.mark',
            'pymdownx.smartsymbols',
            'pymdownx.tasklist',
            'pymdownx.tilde',
            'pymdownx.critic',
        ]
        self.markdown_extensions_config = {
            'markdown.extensions.codehilite': {
                'css_class': 'code-highlight'
            }
        }
        self.__search_schema = Schema(
            title=ID(stored=True, unique=True),
            path=ID(stored=True),
            content=TEXT,
        )
        self.__search_parser = MultifieldParser(
            ['title', 'content'],
            schema=self.__search_schema,
        )
        self.__search_parser.add_plugin(FuzzyTermPlugin())
        self.__search_index = self.create_search_index()
        self.populate_search_index()
Example 14
def build_schema_and_corpus():
    schema = Schema(
        id=ID(stored=True),
        filename=ID(stored=True),
        story=TEXT(analyzer=StemmingAnalyzer(), stored=True, lang="en"),
    )
    file = os.environ["HOME"] + "/data/QA/coqa/" + "coqa-train-v1.0.json"
    data = ({
        "id": d["id"],
        "filename": d["filename"],
        "story": d["story"]
    } for d in data_io.read_json(file)["data"])
    return schema, data
Example 15
 def __get_index_schema(self):
     """
     :return: organization index schema
     """
     return Schema(id=NUMERIC(stored=True),
                   url=ID(stored=True),
                   external_id=ID(stored=True),
                   name=ID(stored=True),
                   domain_names=KEYWORD(stored=True, commas=True),
                   created_at=ID(stored=True),
                   details=ID(stored=True),
                   shared_tickets=BOOLEAN(stored=True),
                   tags=KEYWORD(stored=True, commas=True))
Example 16
 def _get_schema(self):
     stem_ana = StemmingAnalyzer()
     return Schema(
         list_name=ID(stored=True),
         message_id=ID(stored=True),
         sender=TEXT(field_boost=1.5),
         user_id=TEXT,
         subject=TEXT(field_boost=2.0, analyzer=stem_ana),
         content=TEXT(analyzer=stem_ana),
         date=DATETIME(),
         attachments=TEXT,
         tags=KEYWORD(commas=True, scorable=True),
     )
Example 17
 def _get_schema(self, language):
     lang_analyzer = LanguageAnalyzer(language)
     return Schema(
         key=ID(stored=True, unique=True),
         assignee=ID(stored=True),
         reporter=ID(stored=True),
         status=ID(stored=True),
         summary=TEXT(analyzer=lang_analyzer, field_boost=2.0),
         description=TEXT(analyzer=lang_analyzer),
         comments_str=TEXT(analyzer=lang_analyzer),
         labels=KEYWORD(stored=True, lowercase=True),
         components=KEYWORD(stored=True, lowercase=True),
     )
Example 18
 def _mail_schema(self):
     return Schema(
         ident=ID(stored=True, unique=True),
         sender=ID(stored=False),
         to=KEYWORD(stored=False, commas=True),
         cc=KEYWORD(stored=False, commas=True),
         bcc=KEYWORD(stored=False, commas=True),
         bounced=KEYWORD(stored=False, commas=True),
         subject=TEXT(stored=False),
         date=NUMERIC(stored=False, sortable=True, bits=64, signed=False),
         body=TEXT(stored=False),
         tag=KEYWORD(stored=True, commas=True),
         flags=KEYWORD(stored=True, commas=True),
         raw=TEXT(stored=False))
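
Because date is declared sortable, results can be ordered by it at search time. A hedged sketch, assuming an index handle ix built with this schema and a placeholder query:

from whoosh.qparser import QueryParser

with ix.searcher() as searcher:
    query = QueryParser("body", ix.schema).parse("invoice")
    # 'sortedby' uses the sortable numeric 'date' column; newest first
    for hit in searcher.search(query, sortedby="date", reverse=True):
        print(hit["ident"], hit["tag"])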
Example 19
def main():
    indexdir = 'indexdir'
    if os.path.exists(indexdir):
        index = open_dir(indexdir)
    else:
        schema = Schema(doc_id=ID(unique=True, stored=True),
                        url=ID(unique=True, stored=True),
                        title=TEXT(stored=True),
                        body=TEXT())

        os.mkdir(indexdir)
        index = create_in(indexdir, schema)

    incremental_index_msmacro(index)
Example 20
class RecipeSchema(SchemaClass):
    id = ID(stored=True, unique=True)
    created = DATETIME()
    title = TEXT(analyzer=custom_analyzer)
    description = TEXT(analyzer=custom_analyzer)
    public = BOOLEAN()
    deleted = BOOLEAN()
    reported = BOOLEAN()
    short_code = ID(stored=True, unique=True)
    tags = TEXT(analyzer=tag_analyzer)
    user = ID()
    steps = TEXT(analyzer=custom_analyzer)
    ingredients = TEXT(analyzer=custom_analyzer)
    country = TEXT()
Example 21
def CreateSchemaInitIndex():
    print("Creating schema")
    my_schema = Schema(id=ID(unique=True, stored=True),
                       path=ID(stored=True),
                       source=ID(stored=True),
                       author=TEXT(stored=True),
                       title=TEXT(stored=True),
                       year=TEXT(stored=True),
                       text=TEXT)
    print(my_schema)

    if not os.path.exists("gutenbergindex"):
        os.mkdir("gutenbergindex")
    index = create_in("gutenbergindex", my_schema)
Example 22
def mkSchema():
    schema = Schema(
        date=ID(stored=True),
        city=TEXT(stored=True),
        state=TEXT(stored=True),
        country=TEXT(stored=True),
        shape=TEXT(stored=True),
        durationSecs=NUMERIC(stored=True),
        # REMOVED durationHoursMins=TEXT(stored=True),  # may be dropped, since seconds are more precise
        comments=TEXT(analyzer=StemmingAnalyzer(),
                      stored=True),  # or StandardAnalyzer
        datePosted=ID(stored=True),
        latitude=NUMERIC(float, stored=True),
        longitude=NUMERIC(float, stored=True))
    return schema
Example 23
def get_cache_schema():
    schema = Schema(
        key=ID(unique=True, stored=True),  # Copied from Zotero.
        version=NUMERIC(stored=True),  # Copied from Zotero.
        parentItem=ID(stored=True),  # Kerko addition.
        itemType=ID(stored=True),  # Kerko addition.
        library=STORED,  # Copied from Zotero & JSON-encoded.
        links=STORED,  # Copied from Zotero & JSON-encoded.
        meta=STORED,  # Copied from Zotero & JSON-encoded.
        data=STORED,  # Copied from Zotero & JSON-encoded.
        fulltext=STORED,  # Kerko addition.
    )
    for format_ in get_formats():
        schema.add(format_, STORED)
    return schema
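
The schema.add(...) loop extends the schema at run time with one STORED field per export format. A hypothetical stand-in for get_formats() (invented here for illustration) shows the effect:

def get_formats():  # hypothetical stub, for illustration only
    return ["bibtex", "ris"]

schema = get_cache_schema()
print(schema.names())  # now includes 'bibtex' and 'ris' as STORED fields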
Example 24
def populate_whoosh(text_dir, whoosh_dir):
    loaded = 0

    ## Create analyzer used for tokenizing and normalizing tokens
    my_analyzer = (analysis.RegexTokenizer() | analysis.LowercaseFilter()
                   | analysis.StopFilter())

    # Create schema
    schema = Schema(url=ID(stored=True),
                    body=TEXT(stored=True, analyzer=my_analyzer))

    # Setup index
    os.makedirs(whoosh_dir, exist_ok=True)
    ix = index.create_in(whoosh_dir, schema)

    # Clear index
    writer = ix.writer()
    writer.commit(mergetype=writing.CLEAR)

    # Index documents
    writer = ix.writer()
    for root, dirs, files in os.walk(text_dir, topdown=False):
        for name in files:
            text_file = os.path.join(root, name)
            with open(text_file, encoding="utf8") as tf:
                body = tf.read()
                url = text_file.replace(text_dir, "")
                writer.add_document(url=url, body=body)
                print("Added", url)
                loaded += 1

    writer.commit()
    print("\n\nLoaded", loaded, "documents")
Example 25
 def build_indexes(index, hord):
     if not os.path.exists(index):
         os.mkdir(index)
     index = create_in(
         index,
         Schema(
             quote=TEXT(stored=True),
             id=ID(stored=True),
             submitter=STORED,
             submitted=STORED,
         ),
     )
     corpus = []
     with index.writer() as writer:
         LOGGER.info("Building Whoosh index and markov model from hord.")
         for row in hord.get_rows():
             corpus.append(row.quote)
             if row.submitted:
                 submitted = row.submitted.strftime("%b %d %Y %H:%M:%S")
             else:
                 submitted = None
             writer.update_document(
                 quote=row.quote,
                 id=str(row.id),
                 submitter=row.submitter,
                 submitted=submitted,
             )
     LOGGER.info(f"Index built. {index.doc_count()} documents indexed.")
     if len(corpus) > 0:
         model = markovify.NewlineText("\n".join(corpus))
     else:
         model = None
     LOGGER.info(f"Markov model built.")
     return index, model
Example 26
def get_schema():
    """ Return a schema used for indexing document """
    analyzer = MyVietnameseTokenizer() | LowercaseFilter() | StopFilter(get_stopword_list())
    return Schema(title=TEXT(analyzer=analyzer, stored=True, field_boost=1.5),
                  path=ID(unique=True, stored=True),
                  time=STORED,
                  content=TEXT(analyzer=analyzer, stored=True))
Example 27
class TargetSchema(SchemaClass):
    '''
    Fulltext index schema for target strings.
    '''
    checksum = ID(stored=True, unique=True)
    target = TEXT()
    comment = TEXT()
Example 28
 def __init__(self, IndexDir="../index"):
     self.IndexDir = IndexDir
     from whoosh.fields import Schema, TEXT, ID
     self.schema = Schema(title=TEXT(stored=True),
                          path=ID(stored=True),
                          content=TEXT)
     self.ix = index.open_dir(IndexDir)
Example 29
    def fields_map(self, field_type):
        if field_type == "primary":
            return ID(stored=True, unique=True)
        type_map = {
            'date': types.Date,
            'datetime': types.DateTime,
            'boolean': types.Boolean,
            'integer': types.Integer,
            'float': types.Float
        }
        if isinstance(field_type, str):
            field_type = type_map.get(field_type, types.Text)

        if not isinstance(field_type, type):
            field_type = field_type.__class__

        if issubclass(field_type, (types.DateTime, types.Date)):
            return DATETIME(stored=True, sortable=True)
        elif issubclass(field_type, types.Integer):
            return NUMERIC(stored=True, numtype=int)
        elif issubclass(field_type, types.Float):
            return NUMERIC(stored=True, numtype=float)
        elif issubclass(field_type, types.Boolean):
            return BOOLEAN(stored=True)
        return TEXT(stored=True, analyzer=self.analyzer, sortable=False)
Example 30
class SourceSchema(SchemaClass):
    '''
    Fulltext index schema for source and context strings.
    '''
    checksum = ID(stored=True, unique=True)
    source = TEXT()
    context = TEXT()
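
Declarative SchemaClass definitions like this one are interchangeable with inline Schema(...) instances; a minimal usage sketch (the directory name is a placeholder):

import os
from whoosh.index import create_in

if not os.path.exists("source_index"):
    os.mkdir("source_index")
ix = create_in("source_index", SourceSchema)  # Whoosh accepts the class itself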