Ejemplo n.º 1
0
def FancyAnalyzer(expression=r"\s+",
                  stoplist=STOP_WORDS,
                  minsize=2,
                  maxsize=None,
                  gaps=True,
                  splitwords=True,
                  splitnums=True,
                  mergewords=False,
                  mergenums=False):
    """Composes a RegexTokenizer with an IntraWordFilter, LowercaseFilter, and
    StopFilter.

    >>> ana = FancyAnalyzer()
    >>> [token.text for token in ana("Should I call getInt or get_real?")]
    ["should", "call", "getInt", "get", "int", "get_real", "get", "real"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer that this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param splitwords: Passed to IntraWordFilter; whether to split on
        intra-word transitions (e.g. case changes).
    :param splitnums: Passed to IntraWordFilter; whether to split on
        letter-number transitions.
    :param mergewords: Passed to IntraWordFilter; whether to merge adjacent
        word sub-tokens.
    :param mergenums: Passed to IntraWordFilter; whether to merge adjacent
        number sub-tokens.
    """

    # Fix: maxsize was accepted and documented but never forwarded to
    # StopFilter, so overly long tokens were not removed. Forwarding it makes
    # this analyzer consistent with StandardAnalyzer/StemmingAnalyzer.
    return (RegexTokenizer(expression=expression, gaps=gaps)
            | IntraWordFilter(splitwords=splitwords,
                              splitnums=splitnums,
                              mergewords=mergewords,
                              mergenums=mergenums)
            | LowercaseFilter()
            | StopFilter(stoplist=stoplist, minsize=minsize, maxsize=maxsize))
Ejemplo n.º 2
0
def StandardAnalyzer(expression=default_pattern,
                     stoplist=STOP_WORDS,
                     minsize=2,
                     maxsize=None,
                     gaps=False):
    """Builds a RegexTokenizer piped through a LowercaseFilter, appending a
    StopFilter unless the stop list is disabled.

    >>> ana = StandardAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["testing", "testing", "testing"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer that this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    pipeline = (RegexTokenizer(expression=expression, gaps=gaps)
                | LowercaseFilter())
    if stoplist is None:
        # Stop-word filtering explicitly disabled by the caller.
        return pipeline
    return pipeline | StopFilter(stoplist=stoplist, minsize=minsize,
                                 maxsize=maxsize)
Ejemplo n.º 3
0
def StemmingAnalyzer(expression=default_pattern,
                     stoplist=STOP_WORDS,
                     minsize=2,
                     maxsize=None,
                     gaps=False,
                     stemfn=stem,
                     ignore=None,
                     cachesize=50000):
    """Builds a tokenize -> lowercase -> (optional) stop-word -> stem pipeline.

    >>> ana = StemmingAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["test", "test", "test"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer that this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param stemfn: the stemming function applied by the final StemFilter.
    :param ignore: a set of words to not stem.
    :param cachesize: the maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use. Use None for no cache, or -1 for an unbounded cache.
    """

    pipeline = (RegexTokenizer(expression=expression, gaps=gaps)
                | LowercaseFilter())
    if stoplist is not None:
        # Stop-word filtering is skipped entirely when stoplist is None.
        pipeline = pipeline | StopFilter(stoplist=stoplist, minsize=minsize,
                                         maxsize=maxsize)
    return pipeline | StemFilter(stemfn=stemfn, ignore=ignore,
                                 cachesize=cachesize)
Ejemplo n.º 4
0
def SimpleAnalyzer(expression=default_pattern, gaps=False):
    """Builds the minimal analyzer: a RegexTokenizer piped into a
    LowercaseFilter.

    >>> ana = SimpleAnalyzer()
    >>> [token.text for token in ana("Hello there, this is a TEST")]
    ["hello", "there", "this", "is", "a", "test"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    tokenizer = RegexTokenizer(expression=expression, gaps=gaps)
    return tokenizer | LowercaseFilter()
Ejemplo n.º 5
0
def LanguageAnalyzer(lang,
                     expression=default_pattern,
                     gaps=False,
                     cachesize=50000):
    """Configures a simple analyzer for the given language, with a
    LowercaseFilter, StopFilter, and StemFilter.

    >>> ana = LanguageAnalyzer("es")
    >>> [token.text for token in ana("Por el mar corren las liebres")]
    ['mar', 'corr', 'liebr']

    The list of available languages is in `whoosh.lang.languages`.
    You can use :func:`whoosh.lang.has_stemmer` and
    :func:`whoosh.lang.has_stopwords` to check if a given language has a
    stemming function and/or stop word list available.

    :param lang: the language code to configure stop words and stemming for.
    :param expression: The regular expression pattern to use to extract tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param cachesize: the maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use.
    """

    from whoosh.lang import NoStemmer, NoStopWords

    # Base pipeline shared by every language.
    analyzer = (RegexTokenizer(expression=expression, gaps=gaps)
                | LowercaseFilter())

    # Stop words and stemming are both optional per-language features;
    # silently skip whichever this language does not provide.
    try:
        analyzer = analyzer | StopFilter(lang=lang)
    except NoStopWords:
        pass

    try:
        analyzer = analyzer | StemFilter(lang=lang, cachesize=cachesize)
    except NoStemmer:
        pass

    return analyzer
Ejemplo n.º 6
0
"""Text utilities."""

# Instead of adding more dependencies, let's take advantage of Whoosh's text
# processing utilities.
from whoosh.analysis.filters import CharsetFilter, LowercaseFilter
from whoosh.analysis.tokenizers import IDTokenizer, RegexTokenizer
from whoosh.support.charset import accent_map

# Reusable analysis chains (tokenizer piped into filters). All three strip
# accents with CharsetFilter(accent_map) and lowercase tokens; they differ
# only in how the input text is tokenized.
# IDTokenizer treats the whole input as one token.
ID_NORMALIZATION_CHAIN = IDTokenizer() | CharsetFilter(
    accent_map) | LowercaseFilter()
# RegexTokenizer() with its default pattern splits the input into word tokens.
SORT_NORMALIZATION_CHAIN = RegexTokenizer() | CharsetFilter(
    accent_map) | LowercaseFilter()
# [^_\W]+ matches alphanumeric runs (excluding underscore) as slug components.
SLUGIFY_CHAIN = RegexTokenizer(r"[^_\W]+") | CharsetFilter(
    accent_map) | LowercaseFilter()


def id_normalize(text):
    """Run *text* through the ID normalization chain (accent stripping plus
    lowercasing) and return the resulting tokens joined by single spaces."""
    tokens = (token.text for token in ID_NORMALIZATION_CHAIN(text))
    return ' '.join(tokens)


def sort_normalize(text):
    """Run *text* through the sort normalization chain (word tokenization,
    accent stripping, lowercasing) and join the tokens with spaces."""
    tokens = (token.text for token in SORT_NORMALIZATION_CHAIN(text))
    return ' '.join(tokens)


def slugify(text, delimiter='-'):
    """Build a slug for *text*: alphanumeric runs, accent-stripped and
    lowercased, joined with *delimiter* (hyphen by default)."""
    parts = [token.text for token in SLUGIFY_CHAIN(text)]
    return delimiter.join(parts)
Ejemplo n.º 7
0
def RegexAnalyzer(expression=r"\w+(\.?\w+)*", gaps=False):
    """Deprecated, just use a RegexTokenizer directly.
    """

    # Kept only for backward compatibility; delegates straight to the
    # tokenizer with no extra filters.
    tokenizer = RegexTokenizer(expression=expression, gaps=gaps)
    return tokenizer
Ejemplo n.º 8
0
class ExamplesIndex(ExamplesStore):
    """An ExampleStore that also provides interfaces such as querying for nearest
    examples of a certain type.
    It effectively provides a higher level interface around an IndexBackendABC specifically
    related to the domain of AInix examples."""

    # TODO (DNGros): this tokenizer shouldn't really be here. Should not depend on whoosh
    # Chain used to split and lowercase x query strings before building
    # search terms in get_nearest_examples().
    x_tokenizer = RegexTokenizer() | LowercaseFilter()

    def __init__(self, type_context: TypeContext, backend: IndexBackendABC = None):
        """
        :param type_context: type context used when parsing y values into ASTs.
        :param backend: index backend to store documents in; defaults to a new
            WhooshIndexBackend built from this class's scheme.
        """
        super().__init__(type_context)
        scheme = self.get_scheme()
        self.parser = StringParser(type_context)
        self.backend = backend or WhooshIndexBackend(scheme)
        # Running count of added examples; also used as the id assigned to
        # the next example created in add_yset().
        self.example_count = 0

    @staticmethod
    def get_scheme() -> 'IndexBackendScheme':
        """Returns the field scheme used to index example documents."""
        return IndexBackendScheme(
            example_id=IndexBackendFields.NUMBER,
            xquery=IndexBackendFields.TEXT,
            ytext=IndexBackendFields.TEXT,
            xtype=IndexBackendFields.ID,
            ytype=IndexBackendFields.ID,
            yindexable=IndexBackendFields.SPACE_STORED_TEXT,
            y_set_id=IndexBackendFields.ID,
            weight=IndexBackendFields.ONE_INTERVAL_NUM,
            split=IndexBackendFields.ID
        )

    @staticmethod
    def get_default_ram_backend() -> 'IndexBackendABC':
        """Gets a default backend that does not touch any files and just
        keeps data in RAM"""
        return ainix_kernel.indexing.whooshbackend.WhooshIndexBackend(
            ExamplesIndex.get_scheme(), ram_only=True)

    def get_num_x_values(self) -> int:
        """Returns the number of documents currently stored in the backend."""
        return self.backend.get_doc_count()

    def _get_yparsed_rep(self, y_string: str, y_type: str) -> str:
        """Parses ``y_string`` as ``y_type`` and returns an indexable string
        representation of the resulting parse tree."""
        ast = self.parser.create_parse_tree(y_string, y_type)
        return ast.indexable_repr()

    def add_example(self, example: XValue) -> None:
        """Adds a single example document to the backend index and bumps
        the running example count."""
        self.backend.add_documents([attr.asdict(example)])
        self.example_count += 1

    def add_yset(
        self,
        x_values: List[str],
        y_values: List[str],
        x_type: str,
        y_type: str,
        y_preferences: List[float],
        splitter: DataSplits = DEFAULT_SPLITTER
    ) -> None:
        """Adds an example for every (x, y) pair in the cross product of
        ``x_values`` and ``y_values``, all sharing one generated y-set id.

        :param x_values: x query strings to index.
        :param y_values: candidate y strings, parallel to ``y_preferences``.
        :param x_type: type name of the x values.
        :param y_type: type name of the y values.
        :param y_preferences: weight for each y value (zipped with y_values).
        :param splitter: decides which data split each x value belongs to.
        """
        # All examples added here share one group id so they can be fetched
        # together later via get_y_values_for_y_set().
        y_group = id_generator(size=10)
        for x in x_values:
            split = splitter.get_split_from_example(x, y_type)
            for y, weight in zip(y_values, y_preferences):
                new_example = XValue(self.example_count, x, y, x_type, y_type, weight, y_group,
                                     split=split.value,
                                     yindexable=self._get_yparsed_rep(y, y_type))
                self.add_example(new_example)

    def _dict_to_example(self, doc: Dict) -> XValue:
        """Takes the dictionary form of an object and returns an example object"""
        # make a copy of the dict so we can alter its keys without
        # mutating the input dict (this might be overkill....)
        doc_copy = copy.deepcopy(doc)
        # Coerce numeric fields, which the backend may return as strings.
        doc_copy['weight'] = float(doc_copy['weight'])
        doc_copy['example_id'] = int(doc_copy['example_id'])
        doc_copy['split'] = int(doc_copy['split'])
        return XValue(**doc_copy)

    def get_example_by_id(self, id: int) -> XValue:
        """Fetches the single example whose example_id equals ``id``."""
        query = Term("example_id", id)
        hits = list(self.backend.query(query))
        # example_id is expected to be unique across the index.
        assert len(hits) == 1
        return self._dict_to_example(hits[0].doc)

    # This code is not very relevant anymore.
    def get_nearest_examples(
        self,
        x_value: str,
        choose_type_name: str = None,
        filter_splits=None,
        max_results=10
    ) -> Generator[XValue, None, None]:
        """
        Args:
            x_value: a string to look for the most similar example to
            choose_type_name: By optionally specifying this value you may require
                that a specific type choice appears in the example. You could for example
                only look for the nearest examples where the example features a choice
                between a Program type.
            filter_splits: A tuple of DataSplits. If provided, only examples in one
                of the splits in the provided tuple will be returned
            max_results: The max number of examples to return

        Returns:
            A list of all examples that are potentially near the example, sorted
            in order where the 0th item is predicted to be nearest.
        """
        # OR together the individual tokens of the query so any shared term
        # can produce a match; the backend handles relevance ranking.
        tokenized_x_value = (tok.text for tok in self.x_tokenizer(x_value))
        query = Or([Term("xquery", term,) for term in tokenized_x_value])
        if choose_type_name:
            y_type_indexable_rep = ast_components.indexable_repr_classify_type(choose_type_name)
            query &= Term("yindexable", y_type_indexable_rep)
        if filter_splits:
            query &= Or([Term("split", str(split.value)) for split in filter_splits])
        query_result = self.backend.query(query, max_results)
        yield from (self._dict_to_example(hit.doc) for hit in query_result)

    def get_all_x_values(
        self,
        filter_splits: Tuple[DataSplits, ...] = None
    ) -> Generator[XValue, None, None]:
        """Yields all examples in the index"""
        if filter_splits is None or len(filter_splits) == 0:
            # No split filter requested: match every document.
            query = Every()
        else:
            query = Or([Term("split", str(split.value)) for split in filter_splits])
        yield from (self._dict_to_example(hit.doc)
                    for hit in self.backend.query(query, max_results=None, score=False))

    def get_y_values_for_y_set(self, y_set_id: str) -> List[XValue]:
        """Returns every example that was added with the given y-set id."""
        query = Term("y_set_id", y_set_id)
        return [self._dict_to_example(hit.doc)
                for hit in self.backend.query(query, None, False)]
Ejemplo n.º 9
0
def NgramWordAnalyzer(minsize, maxsize=None, tokenizer=None, **kwargs):
    """Tokenizes the input (RegexTokenizer by default), lowercases each
    token, then expands tokens into n-grams via NgramFilter.

    :param minsize: minimum n-gram size, passed to NgramFilter.
    :param maxsize: maximum n-gram size, passed to NgramFilter.
    :param tokenizer: tokenizer to use instead of a default RegexTokenizer.
    :param kwargs: extra keyword arguments forwarded to NgramFilter.
    """
    if not tokenizer:
        tokenizer = RegexTokenizer()
    ngrams = NgramFilter(minsize, maxsize, **kwargs)
    return tokenizer | LowercaseFilter() | ngrams