Example #1
from whoosh.analysis import NgramWordAnalyzer
from whoosh.fields import SchemaClass, TEXT


class CitySchema(SchemaClass):
    """Whoosh schema for the city search index."""

    city = TEXT(stored=True)
    state = TEXT(stored=True)
    country = TEXT(stored=True)
    content = TEXT(analyzer=NgramWordAnalyzer(minsize=2), phrase=False)
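
Since content is indexed as 2-character N-grams (and phrase=False), partial words can match at query time. A minimal usage sketch; the index directory and document values below are made up:

import os
from whoosh import index
from whoosh.qparser import QueryParser

if not os.path.exists("city_index"):  # hypothetical index directory
    os.mkdir("city_index")
ix = index.create_in("city_index", CitySchema)
writer = ix.writer()
writer.add_document(city=u"Toronto", state=u"Ontario", country=u"Canada",
                    content=u"Toronto Ontario Canada")
writer.commit()

with ix.searcher() as searcher:
    # the query text is N-grammed too, so "tor" finds "Toronto"
    results = searcher.search(QueryParser("content", ix.schema).parse(u"tor"))
    print([hit["city"] for hit in results])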
Example #2
    # Constructor of an N-gram field type (the signature matches
    # whoosh.fields.NGRAMWORDS); assumes `formats` and `NgramWordAnalyzer`
    # are imported from whoosh at module level.
    def __init__(self,
                 minsize=2,
                 maxsize=4,
                 stored=False,
                 field_boost=1.0,
                 tokenizer=None,
                 at=None,
                 queryor=False):
        """
        :param minsize: The minimum length of the N-grams.
        :param maxsize: The maximum length of the N-grams.
        :param stored: Whether to store the value of this field with the
            document. Since this field type generally contains a lot of text,
            you should avoid storing it with the document unless you need to,
            for example to allow fast excerpts in the search results.
        :param field_boost: A boost factor applied to the score of terms
            found in this field.
        :param tokenizer: an instance of :class:`whoosh.analysis.Tokenizer`
            used to break the text into words.
        :param at: if 'start', only takes N-grams from the start of the word.
            If 'end', only takes N-grams from the end. Otherwise the default
            is to take all N-grams from each word.
        :param queryor: if True, combine the N-grams with an Or query. The
            default is to combine N-grams with an And query.
        """

        self.analyzer = NgramWordAnalyzer(minsize, maxsize, tokenizer, at=at)
        self.format = formats.Frequency(field_boost=field_boost)
        self.stored = stored
        self.queryor = queryor
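
This constructor matches whoosh's built-in NGRAMWORDS field type. A small sketch of what the queryor flag changes when a query is parsed against such a field; the field name is made up:

from whoosh.fields import Schema, NGRAMWORDS
from whoosh.qparser import QueryParser

and_schema = Schema(name=NGRAMWORDS(minsize=2, maxsize=4))               # default: And
or_schema = Schema(name=NGRAMWORDS(minsize=2, maxsize=4, queryor=True))  # opt-in: Or

print(QueryParser("name", and_schema).parse(u"whoosh"))  # every N-gram must match
print(QueryParser("name", or_schema).parse(u"whoosh"))   # any N-gram may match, ranked by overlap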
Example #3
def get_whoosh_index(force_create=False):
  import os
  from django.conf import settings  # assumption: a Django settings module defines WHOOSH_ROOT
  from whoosh.index import create_in, exists_in, open_dir
  from whoosh.fields import Schema, TEXT, KEYWORD, ID
  from whoosh.analysis import CharsetFilter, StemmingAnalyzer, NgramWordAnalyzer
  from whoosh.support.charset import accent_map

  analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
  ngramAnalyzer = NgramWordAnalyzer(minsize=2, maxsize=4)  # defined but unused in this schema

  schema = Schema(
    title     = TEXT(analyzer=analyzer, spelling=True, stored=True, field_boost=3.0), 
    abstract  = TEXT(analyzer=analyzer, stored=True, field_boost=2.0), 
    path      = ID(unique=True, stored=True), 
    authors   = TEXT(analyzer=analyzer, sortable=True, field_boost=1.5), 
    content   = TEXT(analyzer=analyzer, stored=True), 
    tags      = KEYWORD(sortable=True, commas=True, field_boost=1.5, lowercase=True), 
    status    = KEYWORD,
    classname = KEYWORD,
    typeahead = TEXT(spelling=True, stored=True, phrase=False)
  )
    
  if not os.path.exists(settings.WHOOSH_ROOT):
    os.mkdir(settings.WHOOSH_ROOT)
  
  if not exists_in(settings.WHOOSH_ROOT) or force_create:
    index = create_in(settings.WHOOSH_ROOT, schema)
  else:
    index = open_dir(settings.WHOOSH_ROOT)
  return index
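
A sketch of how this helper might be called; the document values below are invented:

ix = get_whoosh_index(force_create=True)  # rebuild the index from scratch
writer = ix.writer()
writer.add_document(
  title=u"On Search",
  abstract=u"A short abstract",
  path=u"/docs/on-search",
  authors=u"Jane Doe",
  content=u"Full text of the document goes here.",
  tags=u"search,whoosh",
  status=u"published",
  classname=u"article",
  typeahead=u"On Search Jane Doe",
)
writer.commit()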
Example #4
import csv
import os

from whoosh import index, writing
from whoosh.analysis import NgramWordAnalyzer, RegexTokenizer
from whoosh.fields import Schema, TEXT


def main():
    parser = get_parser()
    args = parser.parse_args()
    NgramMin = args.nMin
    NgramMax = args.nMax
    if NgramMax < NgramMin:
        NgramMax = NgramMin
        print('nMax cannot be less than nMin. Setting nMax to nMin')
    MSID_index_dir = args.outfolder
    Searchable = args.s
    MSID_CSV_fnames = args.infiles

    #   TBD: add cmdline flag to set/use a particular index
    if not os.path.exists(MSID_index_dir):
        print("Directory %s does not exist; creating it" % MSID_index_dir)
        os.mkdir(MSID_index_dir)

    ## Indicate which fields to NGRAM-ize
    ## Create Empty Schema
    schema = Schema()
    ix = index.create_in(MSID_index_dir, schema)
    writer = ix.writer()
    writer.commit(mergetype=writing.CLEAR,
                  optimize=True)  # Erase the index to start from scratch
    writer = ix.writer()
    for cur_file in MSID_CSV_fnames:
        with open(cur_file, newline='', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            fieldnames = reader.fieldnames
            for field in fieldnames:
                if field not in ix.schema.names():
                    if field in Searchable:  # NGRAM-ize the field
                        writer.add_field(
                            field,
                            TEXT(stored=True,
                                 analyzer=NgramWordAnalyzer(
                                     NgramMin,
                                     maxsize=NgramMax,
                                     tokenizer=RegexTokenizer()),
                                 phrase=False)
                        )  # May need to adjust size to allow for description
                    else:  # Just store raw text
                        writer.add_field(field, TEXT(stored=True))
        writer.commit(optimize=True)
        writer = ix.writer()
    idx_cnt = 0
    for cur_file in MSID_CSV_fnames:
        print('Indexing %s' % cur_file)
        with open(cur_file, newline='', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                idx_cnt += 1
                writer.add_document(**row)
                last_row = row
                if idx_cnt % 1000 == 0:
                    print(idx_cnt)
    print('Indexing Done, committing changes to disk')
    writer.commit()
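
Any column listed in args.s was N-gram-ized above, so it supports substring-style lookups. A hypothetical follow-up query; the directory and field name are invented:

from whoosh import index
from whoosh.qparser import QueryParser

ix = index.open_dir("msid_index")  # whatever directory args.outfolder pointed at
with ix.searcher() as searcher:
    query = QueryParser("MSID", ix.schema).parse(u"aopc")  # hypothetical column name
    for hit in searcher.search(query, limit=10):
        print(hit.fields())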
Example #5
    # Assumes module-level imports: os, contextlib, whoosh.index,
    # Schema/TEXT/KEYWORD from whoosh.fields, and NgramWordAnalyzer /
    # StemmingAnalyzer from whoosh.analysis.
    def __init__(self, path, index):
        """Initializes the search engine.

        Args:
            path: Path to document root to index
            index: Path to where the index will be placed.
        """
        self.path = path
        self.index = index
        analyzer = NgramWordAnalyzer(2, 4)

        try:
            ix = whoosh.index.open_dir(self.index)
            ix.close()
            create_index = False  # index seems to be working fine
        except whoosh.index.EmptyIndexError:
            create_index = True

        if create_index:
            schema = Schema(
                name=TEXT(stored=True, analyzer=StemmingAnalyzer()),
                link=TEXT(stored=True),
                category=KEYWORD(stored=True,
                                 scorable=True,
                                 commas=True,
                                 analyzer=analyzer),
                description=TEXT(stored=True),
            )

            if not os.path.isdir(self.index):
                os.mkdir(self.index)

            print("Creating index %s" % os.path.relpath(self.index))
            with contextlib.closing(whoosh.index.create_in(self.index,
                                                           schema)) as ix:
                self._index(ix, self.path)

        print("Opening index %s" % self.index)
        self.ix = whoosh.index.open_dir(self.index)
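
Because category is analyzed into 2-4 character N-grams, partial category words can match. A hypothetical companion method for the same class:

    def search_category(self, text):
        """Hypothetical helper: partial-word category lookup via the N-grams."""
        from whoosh.qparser import QueryParser
        with self.ix.searcher() as searcher:
            query = QueryParser("category", self.ix.schema).parse(text)
            return [(hit["name"], hit["link"]) for hit in searcher.search(query)]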
Example #6
from whoosh import fields
from whoosh.analysis import NgramWordAnalyzer


def parse_int(value):
    '''a failsafe integer parser'''
    if not value:
        return None
    elif isinstance(value, int):
        return value
    try:
        return int(value)
    except Exception:
        return None


ngram_analyzer = NgramWordAnalyzer(minsize=3)


class Organization(fields.SchemaClass):
    # numero_de_da: activity declaration number (Numéro de Déclaration d'Activité)
    numero_de_da = fields.ID(stored=True, unique=True)
    # form_total: number of trainers
    form_total = fields.NUMERIC(stored=True)
    # da_siren: SIREN number of the organization
    da_siren = fields.ID(stored=True, unique=True)
    # da_no_etab: establishment number of the organization
    da_no_etab = fields.ID(stored=True)
    # da_raison_sociale: registered company name (raison sociale)
    da_raison_sociale = fields.TEXT(stored=True,
                                    analyzer=ngram_analyzer,
                                    phrase=False)
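
To see what ngram_analyzer emits for a field like da_raison_sociale, call it directly; the company name below is invented, and note that with no maxsize given, whoosh appears to fall back to fixed 3-character grams:

for token in ngram_analyzer(u"Formation Dupont"):
    print(token.text)
# expected: for, orm, rma, mat, ati, tio, ion, dup, upo, pon, ont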