def typeahead_whoosh_index(query):
    """Print and return typeahead suggestions for each token of *query*.

    Tokenizes *query* with a SimpleAnalyzer and asks the index's
    "typeahead" corrector for up to 10 suggestions per token.

    :param query: unicode search string to complete.
    :returns: list of per-token suggestion lists (callers that ignored the
        previous ``None`` return are unaffected).
    """
    ix = get_whoosh_index()
    analyzer = analysis.SimpleAnalyzer()
    suggestions = []
    with ix.searcher() as searcher:
        corrector = searcher.corrector("typeahead")
        # FIX: the original analyzed the hard-coded literal
        # u'photographie de p' instead of the caller's query.
        for token in (t.text for t in analyzer(query)):
            suggestion_list = corrector.suggest(token, limit=10)
            # FIX: Python-2-only `print x` statement -> print() call
            # (works identically on py2 with a single argument).
            print(suggestion_list)
            suggestions.append(suggestion_list)
    return suggestions
def test_url():
    """URL-aware analyzers must keep whole URLs/URNs as single tokens."""
    sample = u("Visit http://bitbucket.org/mchaput/whoosh or " +
               "urn:isbn:5930502 or http://www.apple.com/.")
    expected = [u('visit'), u('http://bitbucket.org/mchaput/whoosh'),
                u('or'), u('urn:isbn:5930502'), u('or'),
                u('http://www.apple.com/')]
    analyzers = (analysis.SimpleAnalyzer(analysis.url_pattern),
                 analysis.StandardAnalyzer(analysis.url_pattern,
                                           stoplist=None))
    for analyzer in analyzers:
        tokens = [token.text for token in analyzer(sample)]
        assert tokens == expected
def test_nonexistant_fieldnames():
    # Need an analyzer that won't mangle a URL
    url_safe_analyzer = analysis.SimpleAnalyzer("\\S+")
    schema = fields.Schema(id=fields.ID,
                           text=fields.TEXT(analyzer=url_safe_analyzer))
    parser = default.QueryParser("text", schema)

    parsed = parser.parse(u("id:/code http://localhost/"))
    assert parsed.__class__ == query.And

    # First clause: the recognized `id:` field keeps its raw value.
    first = parsed[0]
    assert first.__class__ == query.Term
    assert first.fieldname == "id"
    assert first.text == "/code"

    # Second clause: the URL falls through to the default "text" field.
    second = parsed[1]
    assert second.__class__ == query.Term
    assert second.fieldname == "text"
    assert second.text == "http://localhost/"
def __init__(self, toolbox, index_help=True):
    """ Create a searcher for `toolbox`. """
    self.toolbox = toolbox
    self.rex = analysis.RegexTokenizer()
    # Tool names get a SimpleAnalyzer so punctuation-heavy names
    # still tokenize; other text fields use the schema defaults.
    self.schema = Schema(
        id=STORED,
        stub=KEYWORD,
        name=TEXT(analyzer=analysis.SimpleAnalyzer()),
        description=TEXT,
        section=TEXT,
        help=TEXT,
        labels=KEYWORD,
    )
    self.build_index(index_help)
def __init__(self, toolbox, panel_view_id: str, index_dir: str, index_help: bool = True):
    """Create a searcher for `toolbox` backed by an on-disk index.

    `index_dir` must be recorded before `_index_setup()` runs, since the
    index is opened/created there.
    """
    self.schema = Schema(
        id=ID(stored=True, unique=True),
        old_id=ID,
        stub=KEYWORD,
        name=TEXT(analyzer=analysis.SimpleAnalyzer()),
        description=TEXT,
        section=TEXT,
        help=TEXT,
        labels=KEYWORD,
    )
    self.rex = analysis.RegexTokenizer()
    self.toolbox = toolbox
    self.panel_view_id = panel_view_id
    self.index_dir = index_dir
    self.index = self._index_setup()
def test_span_near2():
    """A SpanNear query can itself be a clause of another SpanNear."""
    schema = fields.Schema(
        text=fields.TEXT(analyzer=analysis.SimpleAnalyzer(), stored=True))
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    writer.add_document(
        text=u("The Lucene library is by Doug Cutting and Whoosh " +
               "was made by Matt Chaput"))
    writer.commit()

    inner = spans.SpanNear(Term("text", "lucene"), Term("text", "doug"),
                           slop=5)
    outer = spans.SpanNear(inner, Term("text", "whoosh"), slop=4)

    with ix.searcher() as searcher:
        matcher = outer.matcher(searcher)
        assert matcher.spans() == [spans.Span(1, 8)]
def __init__(self, toolbox, index_help=True):
    """Create a searcher for `toolbox` and set up its storage/index."""
    self.toolbox = toolbox
    self.rex = analysis.RegexTokenizer()
    # Schema must exist before _index_setup() opens/creates the index.
    self.schema = Schema(
        id=STORED,
        stub=KEYWORD,
        name=TEXT(analyzer=analysis.SimpleAnalyzer()),
        description=TEXT,
        section=TEXT,
        help=TEXT,
        labels=KEYWORD,
    )
    self.storage, self.index = self._index_setup()
    # We keep track of how many times the tool index has been rebuilt.
    # We start at -1, so that after the first index the count is at 0,
    # which is the same as the toolbox reload count. This way we can skip
    # reindexing if the index count is equal to the toolbox reload count.
    self.index_count = -1
def test_phrase_order():
    """Phrase matches require the terms in order, not just co-occurring."""
    schema = fields.Schema(
        text=fields.TEXT(stored=True, analyzer=analysis.SimpleAnalyzer()))
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    # Index every ordering of the four words.
    for words in permutations(["ape", "bay", "can", "day"], 4):
        writer.add_document(text=u(" ").join(words))
    writer.commit()

    with ix.searcher() as searcher:
        def result(q):
            hits = searcher.search(q, limit=None, sortedby=None)
            return sorted(hit['text'] for hit in hits)

        q = Phrase("text", ["bay", "can", "day"])
        # Only the two permutations containing the contiguous, ordered
        # run "bay can day" should match.
        assert_equal(result(q), [u('ape bay can day'), u('bay can day ape')])
def get_index():
    """Return the module-level cached index, building it on first use."""
    global _ix
    if _ix is None:
        char_field = fields.FieldType(formats.Characters(),
                                      analysis.SimpleAnalyzer(),
                                      scorable=True, stored=True)
        _ix = RamStorage().create_index(fields.Schema(text=char_field))
        writer = _ix.writer()
        # One document per 4-word permutation of the shared `domain` words;
        # the raw tuple is kept as the stored value.
        for combo in permutations(domain, 4):
            writer.add_document(text=u(" ").join(combo), _stored_text=combo)
        writer.commit()
    return _ix
def test_pos_scorer():
    """FunctionWeighting scoring by first term position ranks earlier
    occurrences of "1" higher, across two segments (commit(merge=False))."""
    ana = analysis.SimpleAnalyzer()
    schema = fields.Schema(id=fields.STORED, key=fields.TEXT(analyzer=ana))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.add_document(id=0, key=u("0 0 1 0 0 0"))
    w.add_document(id=1, key=u("0 0 0 1 0 0"))
    w.add_document(id=2, key=u("0 1 0 0 0 0"))
    w.commit()
    # Second segment, kept separate so scoring spans multiple readers.
    w = ix.writer()
    w.add_document(id=3, key=u("0 0 0 0 0 1"))
    w.add_document(id=4, key=u("1 0 0 0 0 0"))
    w.add_document(id=5, key=u("0 0 0 0 1 0"))
    w.commit(merge=False)

    def pos_score_fn(searcher, fieldname, text, matcher):
        # Score is the reciprocal of (first position + 1): earlier is better.
        poses = matcher.value_as("positions")
        return 1.0 / (poses[0] + 1)

    pos_weighting = scoring.FunctionWeighting(pos_score_fn)
    # FIX: the original leaked the searcher; close it via a context manager.
    with ix.searcher(weighting=pos_weighting) as s:
        r = s.search(query.Term("key", "1"))
        assert_equal([hit["id"] for hit in r], [4, 2, 0, 1, 5, 3])
]

# Directory that will contain index
# NOTE(review): absolute, user-specific Windows paths — this script only
# runs on the original author's machine; consider making these configurable.
dir_index_cran = r'C:\Users\Luca\Desktop\-\Università\Magistrale\Primo anno\Secondo semestre\DMT\Project\Index Cranfield'
dir_index_time = r'C:\Users\Luca\Desktop\-\Università\Magistrale\Primo anno\Secondo semestre\DMT\Project\Index Time'

# For each analyzer (schema_type iterates the per-analyzer directory names)
for schema_type in dir_specific:
    # Prefix with a backslash so it can be appended to the base dirs below.
    schema_type = '\\' + schema_type
    print(schema_type)
    # In this case we'll boost the title field by 1.5
    # NOTE(review): comment says 1.5 but the code uses field_boost=2 —
    # confirm which value is intended.
    if schema_type == '\\Field Booster':
        selected_analyzer = analysis.SimpleAnalyzer()
        # Create a Schema
        schema = Schema(id=ID(stored=True),
                        title=TEXT(stored=False, analyzer=selected_analyzer,
                                   field_boost=2),
                        content=TEXT(stored=False,
                                     analyzer=selected_analyzer))
        # Create an empty-Index (fails if the directory already exists)
        os.mkdir(dir_index_cran + schema_type)
        temp_dir_cran = dir_index_cran + schema_type
        create_in(temp_dir_cran, schema)
        # --> in this case we won't create any schema for Time dataset since we won't be using 'title'
class WhooshBackend(Component):
    """
    Implements Whoosh SearchBackend interface
    """
    implements(ISearchBackend, ISystemInfoProvider)

    index_dir_setting = Option(
        BHSEARCH_CONFIG_SECTION,
        'whoosh_index_dir',
        default='whoosh_index',
        doc="""Relative path is resolved relatively to the directory of the environment.""",
        doc_domain='bhsearch')

    advanced_security = Option(
        BHSEARCH_CONFIG_SECTION,
        'advanced_security',
        default=False,
        doc="Check view permission for each document when retrieving results.",
        doc_domain='bhsearch')

    max_fragment_size = IntOption(
        BHSEARCH_CONFIG_SECTION,
        'max_fragment_size',
        default=240,
        doc="The maximum number of characters allowed in a fragment.",
        doc_domain='bhsearch')

    fragment_surround = IntOption(
        BHSEARCH_CONFIG_SECTION,
        'fragment_surround',
        default=60,
        doc="""The number of extra characters of context to add both before the first matched term and after the last matched term.""",
        doc_domain='bhsearch')

    #This is schema prototype. It will be changed later
    #TODO: add other fields support, add dynamic field support.
    #Schema must be driven by index participants
    SCHEMA = Schema(
        unique_id=ID(stored=True, unique=True),
        id=ID(stored=True),
        type=ID(stored=True),
        product=ID(stored=True),
        milestone=ID(stored=True),
        time=DATETIME(stored=True),
        due=DATETIME(stored=True),
        completed=DATETIME(stored=True),
        author=ID(stored=True),
        component=ID(stored=True),
        status=ID(stored=True),
        resolution=ID(stored=True),
        keywords=KEYWORD(scorable=True),
        summary=TEXT(stored=True,
                     analyzer=analysis.StandardAnalyzer(stoplist=None)),
        content=TEXT(stored=True,
                     analyzer=analysis.StandardAnalyzer(stoplist=None)),
        changes=TEXT(analyzer=analysis.StandardAnalyzer(stoplist=None)),
        owner=TEXT(stored=True, analyzer=analysis.SimpleAnalyzer()),
        repository=TEXT(stored=True, analyzer=analysis.SimpleAnalyzer()),
        revision=TEXT(stored=True, analyzer=analysis.SimpleAnalyzer()),
        message=TEXT(stored=True, analyzer=analysis.SimpleAnalyzer()),
        required_permission=ID(),
        name=TEXT(stored=True, analyzer=analysis.SimpleAnalyzer()),
        query_suggestion_basket=TEXT(analyzer=analysis.SimpleAnalyzer(),
                                     spelling=True),
        relations=KEYWORD(lowercase=True, commas=True),
    )

    def __init__(self):
        # Resolve the configured index dir relative to the global
        # environment path when it is not absolute.
        self.index_dir = self.index_dir_setting
        if not os.path.isabs(self.index_dir):
            self.index_dir = os.path.join(
                get_global_env(self.env).path, self.index_dir)
        if index.exists_in(self.index_dir):
            self.index = index.open_dir(self.index_dir)
        else:
            # No index yet; recreate_index() must be called before use.
            self.index = None

    # ISystemInfoProvider methods
    def get_system_info(self):
        yield 'Whoosh', whoosh.versionstring()

    # ISearchBackend methods
    def start_operation(self):
        return self._create_writer()

    def _create_writer(self):
        return AsyncWriter(self.index)

    def add_doc(self, doc, operation_context=None):
        """Add any type of document index.

        The contents should be a dict with fields matching the search schema.
        The only required fields are type and id, everything else is optional.
        """
        # Reuse the caller's writer when given; otherwise commit/cancel a
        # local one ourselves.
        writer = operation_context
        is_local_writer = False
        if writer is None:
            is_local_writer = True
            writer = self._create_writer()

        self._reformat_doc(doc)
        doc[UNIQUE_ID] = self._create_unique_id(doc.get("product", ''),
                                                doc["type"],
                                                doc["id"])
        self.log.debug("Doc to index: %s", doc)
        try:
            writer.update_document(**doc)
            if is_local_writer:
                writer.commit()
        except:
            if is_local_writer:
                writer.cancel()
            raise

    def _reformat_doc(self, doc):
        """
        Strings must be converted unicode format accepted by Whoosh.
        """
        # Drop None keys, None values and empty strings; convert the rest.
        for key, value in doc.items():
            if key is None:
                del doc[None]
            elif value is None:
                del doc[key]
            elif isinstance(value, basestring) and value == "":
                del doc[key]
            else:
                doc[key] = self._to_whoosh_format(value)

    def delete_doc(self, product, doc_type, doc_id, operation_context=None):
        """Remove the document identified by (product, type, id)."""
        unique_id = self._create_unique_id(product, doc_type, doc_id)
        self.log.debug('Removing document from the index: %s', unique_id)
        writer = operation_context
        is_local_writer = False
        if writer is None:
            is_local_writer = True
            writer = self._create_writer()
        try:
            writer.delete_by_term(UNIQUE_ID, unique_id)
            if is_local_writer:
                writer.commit()
        except:
            if is_local_writer:
                writer.cancel()
            raise

    def optimize(self):
        writer = AsyncWriter(self.index)
        writer.commit(optimize=True)

    def is_index_outdated(self):
        return self.index is None or not self.index.schema == self.SCHEMA

    def recreate_index(self):
        """Create a fresh index (discarding any existing one) and return it."""
        self.log.info('Creating Whoosh index in %s' % self.index_dir)
        self._make_dir_if_not_exists()
        self.index = index.create_in(self.index_dir, schema=self.SCHEMA)
        return self.index

    def query(self, query, query_string=None, sort=None, fields=None,
              filter=None, facets=None, pagenum=1, pagelen=20,
              highlight=False, highlight_fields=None, context=None):
        """Run *query* and return a populated QueryResult page."""
        # pylint: disable=too-many-locals
        with self.index.searcher() as searcher:
            self._apply_advanced_security(searcher, context)

            highlight_fields = self._prepare_highlight_fields(
                highlight, highlight_fields)

            sortedby = self._prepare_sortedby(sort)

            #TODO: investigate how faceting is applied to multi-value fields
            #e.g. keywords. For now, just pass facets lit to Whoosh API
            #groupedby = self._prepare_groupedby(facets)
            groupedby = facets

            query_parameters = dict(
                query=query,
                pagenum=pagenum,
                pagelen=pagelen,
                sortedby=sortedby,
                groupedby=groupedby,
                maptype=whoosh.sorting.Count,
                filter=filter,
            )
            self.env.log.debug("Whoosh query to execute: %s",
                               query_parameters)
            raw_page = searcher.search_page(**query_parameters)
            results = self._process_results(raw_page,
                                            fields,
                                            highlight_fields,
                                            query_parameters)
            if query_string is not None:
                c = searcher.correct_query(query, query_string)
                results.query_suggestion = c.string
            try:
                actual_query = unicode(query.simplify(searcher))
                results.debug['actual_query'] = actual_query
            # pylint: disable=bare-except
            except:
                # Simplify has a bug that causes it to fail sometimes.
                pass
            return results

    def _apply_advanced_security(self, searcher, context=None):
        """Wrap the searcher's collector so each hit is permission-checked."""
        if not self.advanced_security:
            return

        old_collector = searcher.collector
        security_processor = SecurityPreprocessor(self.env)

        def check_permission(doc):
            return security_processor.check_permission(doc, context)

        def collector(*args, **kwargs):
            c = old_collector(*args, **kwargs)
            if isinstance(c, FilterCollector):
                c = AdvancedFilterCollector(
                    c.child, c.allow, c.restrict, check_permission)
            else:
                c = AdvancedFilterCollector(
                    c, None, None, check_permission)
            return c
        searcher.collector = collector

    def _create_unique_id(self, product, doc_type, doc_id):
        if product:
            return u"%s:%s:%s" % (product, doc_type, doc_id)
        else:
            return u"%s:%s" % (doc_type, doc_id)

    def _to_whoosh_format(self, value):
        if isinstance(value, basestring):
            value = unicode(value)
        elif isinstance(value, datetime):
            value = self._convert_date_to_tz_naive_utc(value)
        return value

    def _convert_date_to_tz_naive_utc(self, value):
        """Convert datetime to naive utc datetime

        Whoosh can not read from index datetime values passed from Trac
        with tzinfo=trac.util.datefmt.FixedOffset because of non-empty
        constructor of FixedOffset"""
        if value.tzinfo:
            utc_time = value.astimezone(utc)
            value = utc_time.replace(tzinfo=None)
        return value

    def _from_whoosh_format(self, value):
        if isinstance(value, datetime):
            value = utc.localize(value)
        return value

    def _prepare_groupedby(self, facets):
        if not facets:
            return None
        groupedby = whoosh.sorting.Facets()
        for facet_name in facets:
            # FIX: was `whoosh.sortingwhoosh.Count`, which raises
            # AttributeError if this method is ever re-enabled.
            groupedby.add_field(facet_name, allow_overlap=True,
                                maptype=whoosh.sorting.Count)
        return groupedby

    def _prepare_sortedby(self, sort):
        """Translate sort instructions into Whoosh facet objects."""
        if not sort:
            return None
        sortedby = []
        for sort_instruction in sort:
            field = sort_instruction.field
            order = sort_instruction.order
            if field.lower() == SCORE:
                if self._is_desc(order):
                    #We can implement tis later by our own ScoreFacet with
                    # "score DESC" support
                    raise TracError(
                        "Whoosh does not support DESC score ordering.")
                sort_condition = whoosh.sorting.ScoreFacet()
            else:
                sort_condition = whoosh.sorting.FieldFacet(
                    field, reverse=self._is_desc(order))
            sortedby.append(sort_condition)
        return sortedby

    def _prepare_highlight_fields(self, highlight, highlight_fields):
        if not highlight:
            return ()
        if not highlight_fields:
            highlight_fields = self._all_highlightable_fields()
        return highlight_fields

    def _all_highlightable_fields(self):
        return [name for name, field in self.SCHEMA.items()
                if self._is_highlightable(field)]

    def _is_highlightable(self, field):
        # Only stored, non-datetime fields can be excerpted.
        return not isinstance(field, whoosh.fields.DATETIME) and field.stored

    def _is_desc(self, order):
        return (order.lower() == DESC)

    def _process_results(self, page, fields, highlight_fields,
                         search_parameters=None):
        # It's important to grab the hits first before slicing. Otherwise,
        # this can cause pagination failures.
        """
        :type fields: iterator
        :type page: ResultsPage
        """
        results = QueryResult()
        results.hits = page.total
        results.total_page_count = page.pagecount
        results.page_number = page.pagenum
        results.offset = page.offset
        results.facets = self._load_facets(page)

        docs = []
        highlighting = []
        for retrieved_record in page:
            result_doc = self._process_record(fields, retrieved_record)
            docs.append(result_doc)

            result_highlights = self._create_highlights(highlight_fields,
                                                        retrieved_record)
            highlighting.append(result_highlights)
        results.docs = docs
        results.highlighting = highlighting
        results.debug["search_parameters"] = search_parameters
        return results

    def _process_record(self, fields, retrieved_record):
        result_doc = dict()
        #add score field by default
        if not fields or SCORE in fields:
            score = retrieved_record.score
            result_doc[SCORE] = score

        if fields:
            for field in fields:
                if field in retrieved_record:
                    result_doc[field] = retrieved_record[field]
        else:
            for key, value in retrieved_record.items():
                result_doc[key] = value

        for key, value in result_doc.iteritems():
            result_doc[key] = self._from_whoosh_format(value)
        return result_doc

    def _load_facets(self, page):
        """This method can be also used by unit-tests"""
        non_paged_results = page.results
        facet_names = non_paged_results.facet_names()
        if not facet_names:
            return None
        facets_result = dict()
        for name in facet_names:
            facets_result[name] = non_paged_results.groups(name)
        return facets_result

    def _make_dir_if_not_exists(self):
        if not os.path.exists(self.index_dir):
            os.mkdir(self.index_dir)

        if not os.access(self.index_dir, os.W_OK):
            # FIX: the message previously used a backslash line-continuation
            # *inside* the string literal, embedding raw indentation into
            # the user-facing error text.
            raise TracError(
                "The path to Whoosh index '%s' is not writable for the "
                "current user." % self.index_dir)

    def _create_highlights(self, fields, record):
        """Return {field: highlighted_html_or_''} for each requested field."""
        result_highlights = dict()
        fragmenter = whoosh.highlight.ContextFragmenter(
            self.max_fragment_size,
            self.fragment_surround,
        )
        highlighter = whoosh.highlight.Highlighter(
            formatter=WhooshEmFormatter(),
            fragmenter=fragmenter)

        for field in fields:
            if field in record:
                highlighted = highlighter.highlight_hit(record, field)
            else:
                highlighted = ''
            result_highlights[field] = highlighted
        return result_highlights