Example #1
class NewsSchema(SchemaClass):
    newsAgency = TEXT
    title = TEXT(stored=True)
    summary = TEXT(stored=True)
    url = ID(stored=True)
    content = TEXT
Example #2
class SourceSchema(SchemaClass):
    """Fultext index schema for source and context strings."""
    pk = NUMERIC(stored=True, unique=True)
    source = TEXT()
    context = TEXT()
    location = TEXT()
def get_schema():
    return Schema(remitente=TEXT(stored=True),
                  destinatarios=KEYWORD(stored=True),
                  asunto=TEXT(stored=True),
                  contenido=TEXT(stored=True))
import os.path

from whoosh.fields import Schema, TEXT
from whoosh.index import create_in, open_dir
from .modeles.data_db import data as db

# It was determined that indexing with a full-text search engine would only be useful for searching
# by city name.

# Definition of the search-engine schema. The country code is indexed, and the stored content
# is returned based on the indexed word. "City" is used to index city names.
# "Name" holds the type of representation. The content of "Content" is used to build the map marker.
# Whoosh cannot index UTF-8-encoded byte strings; the text must be passed as unicode.
schema = Schema(city=TEXT, name=TEXT(stored=True), content=TEXT(stored=True))

# Indexing is only run if the "index" folder does not exist (the Whoosh documentation recommends
# storing the index in a folder like this). Indexing is therefore run only once in the lifecycle of the
# locally installed application. Re-indexing on every application launch would waste resources.
villes = []
if not os.path.exists("index"):
    # This print should only appear when the index is written, i.e. on the first launch of the application,
    # or when the user decides to change the data. Logically, it also shows up when the tests are run.
    print("Creating the 'index' folder.")
    os.mkdir("index")
    index = create_in("index", schema)
    # Open the empty index (which now has a schema) in order to add what we want to index.
    index = open_dir("index")
    writer = index.writer()
    # Add the documents indexed by city. The content is the name of the diplomatic representation concerned.
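    # Hypothetical completion (the original excerpt stops here): iterate over the city
    # data and index each entry. The (city, name, content) structure of "villes" is an
    # assumption, not taken from the original project.
    for city, name, content in villes:
        writer.add_document(city=city, name=name, content=content)
    # Commit so that later launches can reuse the index via open_dir("index").
    writer.commit()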
Example #5
"""File name is deals_ + datetime with YearMonthDay"""
daystr = datetime.date.today().strftime('%Y%m%d')
filename = 'deals_' + daystr + '.jl'
rootFolder = 'C:\\crawlData\\'
oldFiles = Path(rootFolder).files('*.jl')
if oldFiles:
    oldFiles[0].remove()
    print('Removed {} file'.format(oldFiles[0]))
fullFilePath = rootFolder + filename
"""Download deals from S3"""
session = boto3.Session(profile_name='indexingProf')
s3_client = session.client('s3')
s3_client.download_file('home-deals', 'deals/' + filename, fullFilePath)

# ixDirectory = 'indexed_'+daystr
ixDirectory = rootFolder + 'indexed'
dealSchema = Schema(title=TEXT(stored=True),
                    img=ID(stored=True),
                    link=TEXT(stored=True),
                    price=ID(stored=True))
if not os.path.exists(ixDirectory):
    os.mkdir(ixDirectory)
ix = create_in(ixDirectory, dealSchema)
writer = ix.writer()
"""Configuration for indexing full-text search by whoosh"""
with open(fullFilePath) as file:
    for line in file:
        try:
            lineData = json.loads(line)
            title = lineData['title']
            img = lineData['img']
            link = lineData['link']
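            # The excerpt stops above. A hypothetical completion of the indexing loop:
            # 'price' is assumed to be present in the JSON lines (it appears in dealSchema)
            # but is not shown in the original snippet.
            price = lineData.get('price', u'')
            writer.add_document(title=title, img=img, link=link, price=price)
        except (KeyError, ValueError):
            continue  # skip malformed lines instead of aborting the run
writer.commit()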
import os

from whoosh.fields import Schema
from whoosh.fields import ID, TEXT
from whoosh.index import open_dir, create_in
from whoosh.analysis import StopFilter
from whoosh.analysis import RegexTokenizer
from whoosh.qparser import QueryParser
from collections import Counter

#=============Input===========


#=============UAT Indexing===========
my_schema = Schema(id = ID(unique=True, stored=True), 
                   path = ID(stored=True), 
                   source = ID(stored=True),
                   author = TEXT(stored=True),
                   title = TEXT(stored=True),
                   text = TEXT)


ix = create_in("index", my_schema)
index = open_dir("index")
writer = index.writer()

import io
writer.add_document(id = u'uat_voc',
                    path = u'sample/uat_voc.txt',
                    source = u'uat_voc.txt',
                    title = u'uat_voc',
                    text = io.open('uat_voc.txt', encoding='utf-8').read())
writer.commit()
Example #7
 def schema_type(self):
     return TEXT(stored=True, analyzer=SimpleAnalyzer())
Example #8
class BmarkSchema(SchemaClass):
    bid = ID(unique=True, stored=True)
    description = TEXT
    extended = TEXT
    tags = KEYWORD
    readable = TEXT(analyzer=StemmingAnalyzer())
Example #9
import logging

from whoosh.fields import Schema, TEXT, ID, KEYWORD
from whoosh.query import Variations

from whoosh.support.charset import accent_map
from whoosh.analysis import RegexTokenizer
from whoosh.analysis import CharsetFilter, LowercaseFilter, StopFilter
from newebe.lib.stopwords import stoplists

from newebe.config import CONFIG

logger = logging.getLogger("newebe.lib")

chfilter = CharsetFilter(accent_map)
stoplist = stoplists["en"].union(stoplists["fr"])
analyzer = RegexTokenizer() | LowercaseFilter() | \
           StopFilter(stoplist=stoplist) | chfilter
schema = Schema(content=TEXT(analyzer=analyzer),
                docType=TEXT,
                docId=ID(stored=True),
                tags=KEYWORD)


class Indexer():
    """
    Indexer simplifies object indexing and search with the Whoosh API.
    """
    def __init__(self):
        """
        Set up the index, creating it if it does not exist.
        """

        if CONFIG.main.debug:
Example #10
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer
from whoosh.filedb.filestore import FileStorage

schema = Schema(teorArtigo=TEXT(analyzer=StemmingAnalyzer()),
                numArtigo=TEXT(analyzer=StemmingAnalyzer()),
                pergunta=TEXT(analyzer=StemmingAnalyzer()),
                idResposta=ID(stored=True))

storage = FileStorage("index")
ix = storage.create_index(schema)

ix = storage.open_index()
writer = ix.writer()
Example #11
    with open(file) as f:
        for line_no, line in enumerate(f):
            line = line.strip().split("\t")
            if line_no == 0:
                header = line
            else:
                data.append(dict(zip(header, line)))
    return header, data


FIELDS, DATA = parse()
PAGELEN = 100

if automatic:
    schema = Schema(**{
        header: TEXT(stored=True)
        for header in FIELDS
    })
else:
    schema = Schema()


def create_index(data=DATA, _schema=schema):
    if not os.path.exists(_indexdir):
        os.mkdir(_indexdir)
    ix = index.create_in(_indexdir, _schema)

    writer = ix.writer()

    for elem in data:
        writer.add_document(**elem)
import sys
import json
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer
from whoosh import index
from whoosh.qparser import QueryParser
from whoosh import scoring
import os, os.path

#####################################
#Create the schema
#####################################
schema = Schema(filename=ID(stored=True),
                cell_no=TEXT(stored=True),
                content=TEXT(analyzer=StemmingAnalyzer()))

#####################################
# Create the index and initialize a `writer`
#####################################

# Note, this clears the existing index in the directory
ix = index.create_in("notebooks", schema)

# Get a writer from the created index
writer = ix.writer()


def visibleTextFromNB(filename):
    '''
    This function pulls all the non-output visible cells from
    a JupyterNotebook and concatenates it all into a block of
Example #13
class MySchema(SchemaClass):
    path = ID(stored=True)
    title = TEXT(stored=True)
    content = TEXT
    tags = KEYWORD
Example #14
# -*- coding: utf-8 -*-

#http://blog.csdn.net/twsxtd/article/details/8308893

I have recently been wanting to build a search engine, so naturally I looked at the famous Lucene. It really is outstanding, but its Python implementation, PyLucene, is disappointing: it is not a pure Python implementation but a wrapper that still runs Java underneath, it depends on the JDK, and its installation procedure is extremely tedious. There are plenty of Chinese word-segmentation dictionaries usable with Lucene, but because of that glue layer many of them cannot be used, so in the end I gave it up; to be fair, used from Java it is quite complete. There are also Sphinx and coreseek (built on it specifically for Chinese), but those appear to be SQL-based, which has little to do with what I want to do. There is also the C++ framework Xapian, which gets excellent reviews for both speed and accuracy, but in the end I settled on Whoosh, a pure Python implementation. For Python users it is extremely simple: it is just a module, one easy_install away. However, there is very little material about it in Chinese, so there is nothing for it but to translate its documentation myself, starting today.

Quick Start
    Whoosh is a library for indexing text and searching that text; it provides search functionality for you. For example, if you are building blogging software, you can use Whoosh to add a search feature so that users can search for blog entries.
Below is a short example:
from whoosh.index import create_in
from whoosh.fields import *
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
ix = create_in("/home/gswewf/百科/indexer", schema)  # ("indexer" here is actually a directory, so following this step as written will fail; you need to create the directory first. Translator's note)
writer = ix.writer()
writer.add_document(title=u"First document",path=u"/a",
                    content = u"this is the first document we've add!")
writer.add_document(title=u"Second document", path=u"/b",
                        ...                     content=u"The second one is even more interesting!")
writer.commit()
from whoosh.qparser import QueryParser
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("first")
    results = searcher.search(query)
    print(results[0])
    # {"title": u"First document", "path": u"/a"}

Index and Schema objects
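The translation breaks off at this heading. As a minimal sketch of what this section of the Whoosh documentation presumably covers: a Schema declares the fields each document can contain, and an Index stores documents for that schema in a directory.
import os.path
from whoosh import index
from whoosh.fields import Schema, ID, TEXT

# A Schema lists each document field and whether it is indexed and/or stored.
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)

# An Index lives in a directory: create it once, then reopen it on later runs.
if not os.path.exists("indexdir"):
    os.mkdir("indexdir")
    ix = index.create_in("indexdir", schema)
else:
    ix = index.open_dir("indexdir")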
import os
import sys
import json

from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID
from whoosh.analysis import StemmingAnalyzer

# four fields: youtube id, video title, video description, topic
stem_analyzer = StemmingAnalyzer()
schema = Schema(id=ID(stored=True),
                title=TEXT(stored=True),
                description=TEXT(analyzer=stem_analyzer, stored=True),
                topic=ID(stored=True))

# create a folder to store the index; build the index the first time, otherwise reuse it
if not os.path.exists("indexdirectory"):
    os.mkdir("indexdirectory")
    ix = create_in("indexdirectory", schema)
else:
    ix = open_dir("indexdirectory")

# create index writer
writer = ix.writer()

with open('data_for_indexing3.json') as f:
    youtube_array = json.load(f)
    for item in youtube_array:
        writer.add_document(id=item['id'],
                            title=item['title'],
                            description=item['description'],
                            topic=item['topic'])
Example #16
def get_more_search_result():
    query = request.form['query']
    q = []
    q.append(query)
    page_offset = int(request.form['page_offset'])
    index_name = request.form['index_name']
    num_elem_to_get = 50

    # select correct index
    if index_name is None or index_name == "0":
        selected_index = get_current_index()
    else:
        selected_index = os.path.join(baseindexpath, index_name)

    path_array = []
    preview_array = []
    date_array = []
    size_array = []
    list_tags = []

    schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)

    ix = index.open_dir(selected_index)
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(" ".join(q))
        results = searcher.search_page(query, page_offset, num_elem_to_get)
        for x in results:
            path = x.items()[0][1]
            path = path.replace(PASTES_FOLDER, '', 1)
            path_array.append(path)
            paste = Paste.Paste(path)
            content = paste.get_p_content()
            content_range = max_preview_char if len(content)>max_preview_char else len(content)-1
            preview_array.append(content[0:content_range])
            curr_date = str(paste._get_p_date())
            curr_date = curr_date[0:4]+'/'+curr_date[4:6]+'/'+curr_date[6:]
            date_array.append(curr_date)
            size_array.append(paste._get_p_size())
            p_tags = r_serv_metadata.smembers('tag:'+path)
            l_tags = []
            for tag in p_tags:
                complete_tag = tag
                tag = tag.split('=')
                if len(tag) > 1:
                    if tag[1] != '':
                        tag = tag[1][1:-1]
                    # no value
                    else:
                        tag = tag[0][1:-1]
                # use for custom tags
                else:
                    tag = tag[0]

                l_tags.append( (tag, complete_tag) )
            list_tags.append(l_tags)

        to_return = {}
        to_return["path_array"] = path_array
        to_return["preview_array"] = preview_array
        to_return["date_array"] = date_array
        to_return["size_array"] = size_array
        to_return["list_tags"] = list_tags
        to_return["bootstrap_label"] = bootstrap_label
        if len(path_array) < num_elem_to_get: #pagelength
            to_return["moreData"] = False
        else:
            to_return["moreData"] = True

    return jsonify(to_return)
Example #17
def get_schema():
    return Schema(nome=TEXT(stored=True),
                  id=ID(stored=True),
                  lat=NUMERIC(stored=True),
                  lon=NUMERIC(stored=True))
Example #18
def search():
    query = request.form['query']
    q = []
    q.append(query)
    r = [] #complete path
    c = [] #preview of the paste content
    paste_date = []
    paste_size = []
    paste_tags = []
    index_name = request.form['index_name']
    num_elem_to_get = 50

    # select correct index
    if index_name is None or index_name == "0":
        selected_index = get_current_index()
    else:
        selected_index = os.path.join(baseindexpath, index_name)

    ''' temporarily disabled
    # Search filename
    for path in r_serv_pasteName.smembers(q[0]):
        r.append(path)
        paste = Paste.Paste(path)
        content = paste.get_p_content()
        content_range = max_preview_char if len(content)>max_preview_char else len(content)-1
        c.append(content[0:content_range])
        curr_date = str(paste._get_p_date())
        curr_date = curr_date[0:4]+'/'+curr_date[4:6]+'/'+curr_date[6:]
        paste_date.append(curr_date)
        paste_size.append(paste._get_p_size())
    '''

    # Search full line
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)

    ix = index.open_dir(selected_index)
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse("".join(q))
        results = searcher.search_page(query, 1, pagelen=num_elem_to_get)
        for x in results:
            r.append(x.items()[0][1].replace(PASTES_FOLDER, '', 1))
            path = x.items()[0][1].replace(PASTES_FOLDER, '', 1)
            paste = Paste.Paste(path)
            content = paste.get_p_content()
            content_range = max_preview_char if len(content)>max_preview_char else len(content)-1
            c.append(content[0:content_range])
            curr_date = str(paste._get_p_date())
            curr_date = curr_date[0:4]+'/'+curr_date[4:6]+'/'+curr_date[6:]
            paste_date.append(curr_date)
            paste_size.append(paste._get_p_size())
            p_tags = r_serv_metadata.smembers('tag:'+path)
            l_tags = []
            for tag in p_tags:
                complete_tag = tag
                tag = tag.split('=')
                if len(tag) > 1:
                    if tag[1] != '':
                        tag = tag[1][1:-1]
                    # no value
                    else:
                        tag = tag[0][1:-1]
                # use for custom tags
                else:
                    tag = tag[0]

                l_tags.append( (tag, complete_tag) )

            paste_tags.append(l_tags)
        results = searcher.search(query)
        num_res = len(results)

    index_list = get_index_list()

    index_min = 1
    index_max = len(index_list)

    return render_template("search.html", r=r, c=c,
            query=request.form['query'], paste_date=paste_date,
            paste_size=paste_size, char_to_display=max_preview_modal,
            num_res=num_res, index_min=index_min, index_max=index_max,
            bootstrap_label=bootstrap_label,
            paste_tags=paste_tags,
            index_list=index_list
           )
Example #19
    def _build_doc_attrs(self, Model, schema):
        mapper = sa.inspect(Model)

        args = self.doc_attrs
        # any field not in the schema will be stored here. After all fields have been
        # discovered, we add the missing ones
        field_definitions = dict()

        def setup_field(attr_name, field_name):
            field_def = False
            if not isinstance(field_name, string_types):
                field_name, field_def = field_name

            if field_name not in schema:
                if (field_name not in field_definitions
                        or field_definitions[field_name] is False):
                    field_definitions[field_name] = field_def

            # attrgetter offers dotted name support. Useful for attributes on
            # related objects.
            args.setdefault(field_name, {})[attr_name] = attrgetter(attr_name)

        # model level definitions
        for name, field_names in self.index_args.get("index_to", ()):
            if isinstance(field_names, string_types):
                field_names = (field_names, )
            for field_name in field_names:
                setup_field(name, field_name)

        # per column definitions
        for col in mapper.columns:
            name = col.name
            info = col.info

            if not info.get("searchable"):
                continue

            index_to = info.get("index_to", (name, ))
            if isinstance(index_to, string_types):
                index_to = (index_to, )

            for field_name in index_to:
                setup_field(name, field_name)

        # add missing fields to schema
        for field_name, field_def in field_definitions.items():
            if field_name in schema:
                continue

            if field_def is False:
                field_def = TEXT(stored=True, analyzer=accent_folder)

            logger.debug(
                "Adding field to schema:\n"
                "  Model: %s\n"
                '  Field: "%s" %s',
                Model._object_type(),
                field_name,
                field_def,
            )
            schema.add(field_name, field_def)
Example #20
        .map(lambda row: (row.title, get_from(row.title, sec_redirs), row.revision.text._VALUE)) \
        .filter(lambda row: row[1] != [])

texts = texts_rdd.collect()

############
############ SEARCH
############

from whoosh import index
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer
from whoosh.qparser import QueryParser

schema = Schema(title=ID(stored=True),
                r_from=TEXT(stored=True),
                text=TEXT(analyzer=StemmingAnalyzer()),
                tags=KEYWORD)

print("creating reversed index...")

if not os.path.exists("indexdir"):
    os.mkdir("indexdir")

ix = index.create_in("indexdir", schema)
writer = ix.writer()

for text in texts:
    writer.add_document(title=text[0],
                        r_from=', '.join(map(str, text[1])),
                        text=text[2],
Example #21
'''

import pandas as pd
import time
import xml.dom.minidom
from xml.dom.minidom import parse
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
import os, os.path
from whoosh import index



if not os.path.exists("indexFolder"):
    os.mkdir("indexFolder")

schema = Schema(docid=ID(stored=True), title=TEXT(stored=True), body=TEXT(stored=True))
ix = index.create_in("indexFolder", schema)
ix = index.open_dir("indexFolder")
writer = ix.writer()

start_time = time.time()
last_time = start_time

indexLog = []
fw = open('indexLog.txt','w')

for i in range(462):
    fiName = "WIR\WebIR-%03d.xml"
    if not os.path.exists(fiName % i):
        continue
    dom_data = xml.dom.minidom.parse(fiName % i)
Example #22
    def build_schema(self, fields):
        # Copied from https://github.com/django-haystack/django-haystack/blob/v2.8.1/haystack/backends/whoosh_backend.py
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ""

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = WHOOSH_ID(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost)
            elif field_class.field_type in ["date", "datetime"]:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored, sortable=True)
            elif field_class.field_type == "integer":
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=int,
                    field_boost=field_class.boost)
            elif field_class.field_type == "float":
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=float,
                    field_boost=field_class.boost)
            elif field_class.field_type == "boolean":
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == "ngram":
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            elif field_class.field_type == "edge_ngram":
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at="start",
                    stored=field_class.stored,
                    field_boost=field_class.boost,
                )
            else:
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True,
                    analyzer=getattr(field_class, "analyzer",
                                     StemmingAnalyzer()),
                    field_boost=field_class.boost,
                    sortable=True,
                )
                schema_fields[
                    field_class.index_fieldname].field_name = field_name

            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))
Example #23
from tempfile import TemporaryDirectory

from whoosh import qparser
from whoosh.analysis import RegexTokenizer
from whoosh.util.text import rcompile

tokenizer = RegexTokenizer(expression=rcompile(r"[\w/.]+"))
for token in tokenizer(u"Hello there templates/app1/test.html!"):
    print(repr(token.text))

from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in

tmp_dir = TemporaryDirectory()

schema = Schema(title=TEXT(stored=True),
                path=ID(stored=True),
                content=TEXT(analyzer=RegexTokenizer(expression=rcompile(r"[\w/.]+"))))
ix = create_in(tmp_dir.name, schema)
writer = ix.writer()
writer.add_document(title=u"First document", path=u"/a",
                    content=u"this/is/a/test.html")
writer.add_document(title=u"Second document", path=u"/b",
                    content=u"this/is/a/hello.html   hello a yup")
writer.add_document(title=u"Second document", path=u"/b",
                    content=u"this is a hello.html   hello a yup")
writer.commit()
from whoosh.qparser import QueryParser

with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema)
Example #24
    try:
        if ':' not in duration:
            seconds = int(duration)

            m, s = divmod(seconds, 60)
            h, m = divmod(m, 60)
            if h > 0:
                return "%02d:%02d:%02d" % (h, m, s)
            else:
                return "%02d:%02d" % (m, s)
    except ValueError:
        logger.warning(f"转换时间失败:`{duration}")

    return dest


# Search objects
whoosh_site_schema = Schema(
    id=ID(stored=True, unique=True),
    cname=TEXT(field_boost=5.0),
    author=TEXT(field_boost=3.0),
    brief=TEXT(),
)
whoosh_article_schema = Schema(
    uindex=ID(stored=True, unique=True),
    title=TEXT(field_boost=5.0),
    author=TEXT(field_boost=3.0),
    content=TEXT(),
)
Example #25
class TargetSchema(SchemaClass):
    """Fultext index schema for target strings."""
    pk = NUMERIC(stored=True, unique=True)
    target = TEXT()
    comment = TEXT()
Example #26
import os
import os.path

from whoosh import index, analysis, searching
from whoosh.fields import TEXT, Schema
from whoosh.qparser import QueryParser
from whoosh.reading import TermNotFound

import aai.query as query

SCHEMA = Schema(content=TEXT(stored=True, spelling=True))
INDEX_DIR = 'aai/indices'


def get_indices():
    if not os.path.exists(INDEX_DIR):
        os.mkdir(INDEX_DIR)
    if index.exists_in(INDEX_DIR):
        return index.open_dir(INDEX_DIR)
    else:
        return full_index()


def full_index():
    idx = index.create_in(INDEX_DIR, SCHEMA)
    writer = idx.writer()

    data = query.RDFQueries().artist_names()
    for item in data:
        writer.add_document(content=item)
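    # The excerpt stops above; presumably the writer is committed and the index is
    # returned so that get_indices() can hand it back (a sketch, not the original code):
    writer.commit()
    return idx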
Example #27
    def build_schema(self, fields):
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored, sortable=True)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=int,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=float,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at='start',
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True,
                    analyzer=StemmingAnalyzer(),
                    field_boost=field_class.boost,
                    sortable=True)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))
Example #28
 def get_schema(self):
     return Schema(title=TEXT(stored=True),
                   path=ID(stored=True),
                   content=TEXT(stored=True))
Example #29
"""Module for searching the toolshed repositories"""
from galaxy import exceptions
from galaxy.exceptions import ObjectNotFound
import logging
log = logging.getLogger( __name__ )

import whoosh.index
from whoosh import scoring
from whoosh.fields import Schema, STORED, TEXT
from whoosh.qparser import MultifieldParser

schema = Schema(
    id=STORED,
    name=TEXT( field_boost=1.7, stored=True ),
    description=TEXT( field_boost=1.5, stored=True ),
    long_description=TEXT( stored=True ),
    homepage_url=TEXT( stored=True ),
    remote_repository_url=TEXT( stored=True ),
    repo_owner_username=TEXT( stored=True ),
    times_downloaded=STORED,
    approved=STORED,
    last_updated=STORED,
    full_last_updated=STORED )


class RepoWeighting( scoring.BM25F ):
    """
    Affect the BM25F scoring model through the final method.
    source: https://groups.google.com/forum/#!msg/whoosh/1AKNbW8R_l8/XySW0OecH6gJ
    """
    use_final = True
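    # The example is cut off here. With use_final = True, Whoosh calls
    # final(searcher, docnum, score) for every matching document, which is where a
    # subclass can adjust the BM25F score. A hedged, illustrative override; the boost
    # formula below is invented for this sketch and is not Galaxy's actual weighting.
    def final(self, searcher, docnum, score):
        times_downloaded = searcher.stored_fields(docnum).get("times_downloaded", 0)
        # mildly favour repositories that are downloaded more often
        return score * (1.0 + min(times_downloaded, 1000) / 1000.0)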
Example #30
# initialize a new index, using an HNSW index on cosine similarity
index_nms = nmslib.init(method='hnsw', space='cosinesimil')
index_nms.addDataPointBatch(data)
index_nms.createIndex({'post': 2}, print_progress=True)

print("Indexing whoosh...")

# http://jaympatel.com/2020/08/how-to-do-full-text-searching-in-python-using-whoosh-library/
from whoosh.fields import Schema, TEXT, ID
from whoosh import index
import os, os.path
from whoosh import index
from whoosh import qparser
from whoosh.qparser import QueryParser

schema = Schema(path=ID(stored=True), content=TEXT(stored=True))

# Now, we will use the schema to initialize a Whoosh index in the "models" directory.
ix = index.create_in("models", schema)
writer = ix.writer()

#Lastly, let us fill this index with the data from the dataframe.
for i in range(len(df_docs)):
    writer.add_document(content=str(df_docs.text.iloc[i]),
                        path=str(df_docs.id.iloc[i]))
writer.commit()


# https://stackoverflow.com/questions/19477319/whoosh-accessing-search-page-result-items-throws-readerclosed-exception
# http://annamarbut.blogspot.com/2018/08/whoosh-pandas-and-redshift-implementing.html
# https://ai.intelligentonlinetools.com/ml/search-text-documents-whoosh/
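# The excerpt stops before any querying. A minimal search sketch against the index
# built above, using the QueryParser already imported (the query string is illustrative):
with ix.searcher() as searcher:
    parsed_query = QueryParser("content", ix.schema).parse(u"search terms")
    results = searcher.search(parsed_query, limit=10)
    for hit in results:
        print(hit["path"], hit.score)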