Example #1
0
    def new_field(self, field_name: str, field_data):
        """
        Add a new field to the document currently being built.

        If the schema has not been marked as defined yet, the field is also
        registered with the writer as a stored KEYWORD field with term-
        frequency vectors, so the schema grows dynamically.

        Args:
            field_name (str): Name of the new field
            field_data: Data to put into the field
        """
        # NOTE(review): __schema_defined is only read here; presumably it is
        # flipped to True elsewhere once the schema is frozen -- confirm.
        if not self.__schema_defined:
            self.__writer.add_field(field_name,
                                    KEYWORD(stored=True, vector=Frequency()))
        self.__doc[field_name] = field_data
Example #2
0
def get_schema():
    """Return the Whoosh schema used for indexing documents."""
    fields = {
        'id': NUMERIC(stored=True, unique=True, numtype=int),
        'title': TEXT(stored=True),
        'content': TEXT(),
        'correspondent': TEXT(stored=True),
        'tag': KEYWORD(stored=True, commas=True, scorable=True, lowercase=True),
        'type': TEXT(stored=True),
        'created': DATETIME(stored=True, sortable=True),
        'modified': DATETIME(stored=True, sortable=True),
        'added': DATETIME(stored=True, sortable=True),
    }
    return Schema(**fields)
Example #3
0
    def build_schema(self, fields):
        """Translate Haystack field definitions into a Whoosh schema.

        Args:
            fields: mapping of field name -> Haystack field instance.

        Returns:
            Tuple ``(content_field_name, Schema)`` where
            ``content_field_name`` is the index name of the field declared
            with ``document=True`` ('' if none was seen).

        Raises:
            SearchBackendError: if no fields beyond the hard-coded Haystack
                keys were produced.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True,
                                                                         field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored, sortable=True)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=int,
                                                                     field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=float,
                                                                     field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored,
                                                                   field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, at='start',
                                                                        stored=field_class.stored,
                                                                        field_boost=field_class.boost)
            else:
                # schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost, sortable=True)
                # Chinese word segmentation (replaces the stock StemmingAnalyzer above)
                schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=ChineseAnalyzer(),
                                                                  field_boost=field_class.boost, sortable=True)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                # Spelling suggestions are enabled only on the main content field.
                schema_fields[field_class.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))
Example #4
0
 def create_index(self):
     """Create the on-disk index directory and initialise its schema."""
     self.out("Creating directory %s" % self.INDEX)
     os.mkdir(self.INDEX)
     fields = dict(
         id=ID(stored=True, unique=True),
         title=TEXT(stored=True),  # Title to show
         card=STORED,  # Object card
         content=TEXT,  # Searchable content
         tags=KEYWORD(stored=True, commas=True, scorable=True),
         url=STORED,
     )
     self.out("Creating index directory")
     create_in(self.INDEX, Schema(**fields))
Example #5
0
def almacenar_datos():
    """Scrape player data and index it into a fresh Whoosh index at "Index"."""

    # Define the schema for the player information.
    schem = Schema(nombre=TEXT(stored=True),
                   edad=NUMERIC(stored=True),
                   altura=NUMERIC(stored=True),
                   nacionalidad=KEYWORD(stored=True, commas=True),
                   pie=TEXT(stored=True),
                   posicion_principal=KEYWORD(stored=True, commas=True),
                   posicion_secundaria=KEYWORD(stored=True, commas=True),
                   valor=NUMERIC(stored=True),
                   equipo=TEXT(stored=True),
                   contrato=DATETIME(stored=True))

    # Remove the index directory if it already exists, then recreate it.
    if os.path.exists("Index"):
        shutil.rmtree("Index")
    os.mkdir("Index")

    # Create the index.
    ix = create_in("Index", schema=schem)
    # Create a writer in order to add documents to the index.
    writer = ix.writer()
    i = 0
    lista = almacenar_datos_bs()
    for jugador in lista:
        # Add each player tuple from the scraped list to the index.
        writer.add_document(nombre=str(jugador[0]),
                            edad=jugador[1],
                            altura=float(jugador[2]),
                            nacionalidad=str(jugador[3]),
                            pie=str(jugador[4]),
                            posicion_principal=str(jugador[5]),
                            posicion_secundaria=str(jugador[6]),
                            valor=float(jugador[7]),
                            equipo=str(jugador[8]),
                            contrato=jugador[9])
        i += 1
    writer.commit()
    print("Se han indexado " + str(i) + " jugadores")
Example #6
0
 def _get_schema(self):
     """Build the schema for the mailing-list message index."""
     stemmer = StemmingAnalyzer()
     fields = {
         "list_name": ID(stored=True),
         "message_id": ID(stored=True),
         "sender": TEXT(field_boost=1.5),
         "user_id": TEXT,
         "subject": TEXT(field_boost=2.0, analyzer=stemmer),
         "content": TEXT(analyzer=stemmer),
         "date": DATETIME(),
         "attachments": TEXT,
         "tags": KEYWORD(commas=True, scorable=True),
     }
     return Schema(**fields)
Example #7
0
class Schema(SchemaClass):
    """Whoosh schema describing one indexed job posting."""

    #: The id of the job.
    id = ID(stored=True, unique=True)

    #: The title of the job.
    title = TEXT(analyzer=stemming_analyzer)

    #: The name of the company.
    company = TEXT(analyzer=stemming_analyzer)

    #: Location as a comma-separated string of city and country.
    location = KEYWORD(lowercase=True, scorable=True, commas=True)

    #: The type of job.
    job_type = TEXT(analyzer=stemming_analyzer)

    #: The job tags as a comma-separated string of tag slugs.
    tags = KEYWORD(lowercase=True, scorable=True, commas=True)

    #: When was this job created?
    created = DATETIME(sortable=True)
Example #8
0
def get_index(dirpath, verbose=False):
    try:
        return open_dir(dirpath)
    except:
        pass
    if verbose:
        print 'Creating new index in', dirpath
    if not os.path.exists(dirpath):
        os.mkdir(dirpath)
    schema = Schema(name=ID(stored=True, unique=True),
                    doc=TEXT(stored=True),
                    modulepath=KEYWORD(commas=True))
    return create_in(dirpath, schema)
Example #9
0
def index(corpusPath, name, tweetTime = None, stored = False, overwrite = True, procs = PROC_NUM, limitmb = MEM_SIZE):#, featureExtractor):
    """Index the status text of tweets found under *corpusPath*.

    Args:
        corpusPath: directory containing tweet corpus files.
        name: index name, used to derive the index path.
        tweetTime: if given, tweets whose id exceeds it are skipped.
        stored: whether date/status/hashtags values are stored in the index.
        overwrite: if False and the index path already exists, do nothing.
        procs: number of writer processes.
        limitmb: writer memory limit in MB.
    """
    dirList = os.listdir(corpusPath)

    schema = Schema(id = ID(stored = True, unique = True),
                    user = ID,
                    http = NUMERIC, # http state
                    date = DATETIME(stored = stored), # tweet date
                    status = TEXT(stored = stored), # status text of the tweet #TODO use a proper analyzer
                    hashtags = KEYWORD(stored = stored) # list of hashtags in the status
                    #replies = KEYWORD, # list of user replies in the status, as users
                    #vector = STORED
                    #score = NUMERIC(stored = True), # static score for ranking
                    #retweets = NUMERIC(type = type(1.), stored = True) # number of retweets of this tweet
                    ## next fields to fill on a second indexer pass ##
                    #retweets = KEYWORD, # list of retweets in the status, as tweet ids
                    #retweeteds = KEYWORD # list of tweets which retweet this tweet, as tweet ids
                    )

    indexPath = getIndexPath(name, tweetTime)
    if not os.path.exists(indexPath):
        os.makedirs(indexPath)
    else:
        if not overwrite:
            return
        shutil.rmtree(indexPath)
        os.makedirs(indexPath)
    ix = whoosh.index.create_in(indexPath, schema)
    # FIX: previously the writer used the module constants PROC_NUM/MEM_SIZE,
    # silently ignoring the ``procs``/``limitmb`` parameters (which default to
    # those very constants, so existing callers are unaffected).
    writer = ix.writer(procs = procs, limitmb = limitmb)

    for fName in dirList:
        #if tweetTime and dateFromFileName(fName) > tweetTime:
        #    continue
        #print fName
        for tweet in iterTweets(os.path.join(corpusPath, fName)):
            if tweetTime and int(tweet[0]) > tweetTime:
                continue
            if tweet[2] != '302': #and not 'RT @' in tweet[4]: # FIXME retweet filtering
                #v = featureExtractor(tweet[4].encode('ascii', 'replace'))
                writer.add_document(id = tweet[0],
                                    user = tweet[1],
                                    http = int(tweet[2]),
                                    date = tweet[3],
                                    status = tweet[4],
                                    hashtags = u' '.join(tweet[5])
                                    #replies = u' '.join(tweet[6]),
                                    #vector = repr(v)
                                    )

    writer.commit()
Example #10
0
    def __init__(self):
        """Set up the Whoosh schema/index and the doctypes schema store."""

        # Accent-folding, lowercasing, EN+FR stopword-stripping analyzer.
        chfilter = CharsetFilter(accent_map)
        stoplist = stoplists["en"].union(stoplists["fr"])
        analyzer = RegexTokenizer() | LowercaseFilter() | \
                   StopFilter(stoplist=stoplist) | chfilter

        # defines the schema
        # see http://pythonhosted.org/Whoosh/schema.html for reference
        keywordType = KEYWORD(lowercase=True, scorable=True)
        self.schema = Schema(content=TEXT(analyzer=analyzer),
                             docType=TEXT,
                             docId=ID(stored=True, unique=True),
                             tags=keywordType)

        # Adds dynamic fields so each documents can index its fields in the
        # same Whoosh index
        self.schema.add('*_string', TEXT(analyzer=analyzer), glob=True)
        self.schema.add('*_date', DATETIME, glob=True)
        self.schema.add('*_number', NUMERIC, glob=True)
        self.schema.add('*_boolean', BOOLEAN, glob=True)

        # Creates the index folder and Whoosh index files if it doesn't exist
        # And loads the index in any case
        # NOTE(review): if "indexes" exists but holds no index files,
        # open_dir will fail -- confirm that case cannot occur.
        if not os.path.exists("indexes"):
            os.mkdir("indexes")
            self.index = index.create_in("indexes", self.schema)
        else:
            self.index = index.open_dir("indexes")

        # Creates the doctypes folder if it doesn't exist
        if not os.path.exists("doctypes"):
            os.mkdir("doctypes")

        # Creates the doctypes default schema file if it doesn't exist
        if not os.path.exists('doctypes/doctypes_schema.json'):
            with open('doctypes/doctypes_schema.json', 'w') as defaultFile:
                defaultFile.write("{}")
        '''
        Loads the doctypes schema if it's valid, otherwise recreates it
        Doctypes schema is a dictionary of doctypes with their fields created
        and updated when a document is indexed.
        That way, we can tell Whoosh which fields to search by default, because
        there is apparently no way to say "search in all fields".
        '''
        # NOTE(review): on ValueError the write() below appends "{}" at the
        # current file position without truncating, leaving the invalid
        # content in place -- looks like a latent bug; confirm.
        with open('doctypes/doctypes_schema.json', 'r+') as rawJSON:
            try:
                self.doctypesSchema = json.load(rawJSON)
            except ValueError:
                rawJSON.write("{}")
                self.doctypesSchema = {}
Example #11
0
    def test_update_tags(self, create_dir, pk, tags):
        """Verify TagManager.update_tags persists *tags* for offering *pk*.

        When *create_dir* is set, a fresh index containing one document
        (tags 'test1 test2') is created at self._path first; otherwise an
        existing index there is reused.  Afterwards the index is queried by
        id and its stored tags must match *tags* exactly.
        """
        if create_dir:
            os.makedirs(self._path)
            # Create schema
            schema = Schema(id=ID(stored=True, unique=True),
                            tags=KEYWORD(stored=True),
                            named_tags=KEYWORD(stored=True))
            # Create index
            index = create_in(self._path, schema)
            index_writer = index.writer()
            index_writer.add_document(id=unicode(pk),
                                      tags=unicode('test1 test2'),
                                      named_tags=unicode('test1 test2'))
            index_writer.commit()

        # Mocked offering object: only pk and save() are touched.
        offering = MagicMock()
        offering.pk = pk
        offering.save = MagicMock()

        tag_man = tag_manager.TagManager(index_path=self._path)
        tag_man.update_tags(offering, tags)

        self.assertEquals(offering.tags, tags)

        # Query the index
        index = open_dir(self._path)
        with index.searcher() as searcher:
            query = QueryParser('id', index.schema).parse(unicode(pk))
            val = searcher.search(query)
            self.assertEquals(len(val), 1)
            self.assertEquals(val[0]['id'], unicode(pk))
            ret_tags = val[0]['tags'].split(' ')
            self.assertEquals(len(tags), len(ret_tags))

            for t in tags:
                self.assertTrue(t in ret_tags)
Example #12
0
class _DefaultSearchSchema(SchemaClass):
    """General search schema."""

    # Identity: unique object key plus numeric pk, type name and ownership.
    object_key = ID(stored=True, unique=True)
    id = NUMERIC(bits=64, signed=False, stored=True)
    object_type = ID(stored=True)
    creator = ID(stored=True)
    owner = ID(stored=True)

    #: security index: lists the roles and user/group ids allowed to *see*
    #: this content
    allowed_roles_and_users = KEYWORD(stored=True)

    #: tags indexing
    tag_ids = KEYWORD(stored=True)
    tag_text = TEXT(analyzer=accent_folder)

    # hierarchical index of ids path ('/' is the separator)
    parent_ids = FieldType(format=Existence(), analyzer=PathTokenizer(), stored=True)

    # Free-text fields run through the accent-folding analyzer.
    name = TEXT(stored=True, analyzer=accent_folder)
    slug = ID(stored=True)
    description = TEXT(stored=True, analyzer=accent_folder)
    text = TEXT(analyzer=accent_folder)
Example #13
0
def get_schema():
    """Build the Whoosh schema for Gutenberg records.

    Field names deliberately match the record keys used in
    gutenberg_rdf_parser.  Columns with ``spelling=True`` feed the
    query-correction ("did you mean") suggestions; analyzers are avoided
    here because they pollute the match stream those suggestions draw
    from, making them meaningless.
    """
    fields = dict(
        textId=ID(unique=True, stored=True),
        title=TEXT(stored=True, spelling=True),
        creator=TEXT(stored=True, spelling=True),
        contributor=TEXT(stored=True, spelling=True),
        subject=KEYWORD,
        language=KEYWORD(stored=True),
        friendlytitle=TEXT,
        category=STORED,
    )
    return wf.Schema(**fields)
Example #14
0
def createIndexs(dirName):
    """Build a Whoosh index under *dirName* from Test.csv and result_bs.csv.

    Test.csv maps id -> content (tab separated); result_bs.csv supplies
    (id, views, sentiment) rows.  One document is added per result row.
    """
    # 'semtiment' (sic) is kept: it is the established field name.
    schema = Schema(id=NUMERIC(sortable=True),
                    views=KEYWORD(stored=True),
                    semtiment=TEXT(stored=True),
                    content=TEXT(stored=True, analyzer=analyzer))

    if not os.path.exists(dirName):
        os.mkdir(dirName)
    ix = create_in(dirName, schema)

    # id -> content lookup.  Split on the first tab only (content may itself
    # contain tabs) and strip the trailing newline so it is not indexed.
    # Files are now closed deterministically via context managers.
    contents = {}
    with open('Test.csv') as src:
        for line in src:
            doc_id, content = line.rstrip('\n').split('\t', 1)
            contents[doc_id] = content

    writer = ix.writer()
    with open('result_bs.csv') as res:
        for doc_id, view, sem in csv.reader(res):
            writer.add_document(id=doc_id, views=view, semtiment=sem,
                                content=contents[doc_id])
    writer.commit()
Example #15
0
    def __init__(self, model=None):
        """Derive a Whoosh schema from a Django model's field names.

        Args:
            model: optional Django model; when omitted, ``self.fields`` is
                presumably supplied as a class attribute -- confirm.
        """
        if model:
            self.fields = model._meta.get_all_field_names()
            self.model = model

        # NOTE(review): ``exclude``, ``pk``, ``keywords`` and ``stored`` are
        # read but never assigned here; they appear to be class-level
        # configuration on the (unseen) enclosing class.
        self.fields = set(self.fields) - set(self.exclude)
        schema_options = {}

        for field in self.fields:
            if field == self.pk:
                schema_options[field] = ID(stored=True, unique=True)
            elif field in self.keywords:
                schema_options[field] = KEYWORD(stored=field in self.stored)
            else:
                schema_options[field] = TEXT(stored=field in self.stored)
        self.schema = Schema(**schema_options)
    def build_schema(self, fields):
        """Translate Haystack field definitions into a Whoosh schema.

        Returns a tuple ``(content_field_name, Schema)``; raises
        SearchBackendError when no fields beyond the hard-coded Haystack
        keys were produced.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True, commas=True, scorable=True)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored)
            elif field_class.field_type == 'integer':
                # NOTE(review): recent Whoosh releases spell this keyword
                # ``numtype=`` (cf. the other build_schema in this file);
                # ``type=`` only works on old versions -- confirm the pinned
                # Whoosh version.
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored, type=int)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored, type=float)
            elif field_class.field_type == 'boolean':
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True, analyzer=StemmingAnalyzer())

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))
Example #17
0
class HiveJobListing(SchemaClass):
  '''Class to store the details associated with each Hive job.

  The class attributes double as the Whoosh index schema; instances are
  used as plain value holders for a single job's fields.
  '''

  job_url = ID(stored=True)
  title = TEXT(stored=True,analyzer=QUERY_ANALYZER)
  owner = KEYWORD(stored=True)
  completion_time = DATETIME(stored=True)
  query = TEXT(stored=True,analyzer=QUERY_ANALYZER)

  def __init__(self):
    self.job_url = None
    self.title = None
    self.owner = None
    self.completion_time = None
    self.query = None

  def __str__(self):
    # FIX: slicing ``None[0:10]`` raised TypeError for a freshly-created
    # listing whose query was never populated.
    snippet = self.query[0:10] if self.query is not None else ''
    return 'Url: %s, Title: %s, Owner: %s, Time: %s, Query: %s...' % (
      self.job_url, self.title, self.owner, self.completion_time, snippet)
Example #18
0
 def _mail_schema(self):
     """Schema for the mail index; only ident, tag and flags are stored."""
     # All recipient-list fields share the same unstored comma-KEYWORD shape.
     recipient = dict(stored=False, commas=True)
     return Schema(
         ident=ID(stored=True, unique=True),
         sender=ID(stored=False),
         to=KEYWORD(**recipient),
         cc=KEYWORD(**recipient),
         bcc=KEYWORD(**recipient),
         bounced=KEYWORD(**recipient),
         subject=TEXT(stored=False),
         date=NUMERIC(stored=False, sortable=True, bits=64, signed=False),
         body=TEXT(stored=False),
         tag=KEYWORD(stored=True, commas=True),
         flags=KEYWORD(stored=True, commas=True),
         raw=TEXT(stored=False))
Example #19
0
def create_indexer(doc_directory, index_directory):
    """Index every JSON article found under *doc_directory*.

    Walks the tree, loads each file with load_json and adds one document
    per file to a freshly-created Whoosh index at *index_directory*,
    printing per-directory counts and timing statistics.
    """
    my_analyzer = RegexTokenizer() | LowercaseFilter()
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=True, analyzer=my_analyzer),
                    summary=TEXT,
                    article=TEXT(analyzer=my_analyzer),
                    keywords=KEYWORD(stored=True, analyzer=my_analyzer),
                    date=DATETIME(stored=True),
                    path=TEXT(stored=True))

    if not os.path.exists(index_directory):
        os.mkdir(index_directory)
    ix = create_in(index_directory, schema)
    writer = ix.writer()

    nt = 0
    print("==============================")
    # FIX: time.clock() was deprecated since 3.3 and removed in Python 3.8;
    # perf_counter() is the documented replacement for interval timing.
    t1 = time.perf_counter()
    for dirname, subdirs, files in os.walk(doc_directory):
        if files:  # idiomatic truthiness check instead of ``files != []``
            n = 0
            for filename in files:
                filename = os.path.join(dirname, filename)
                obj = load_json(filename)
                writer.add_document(id=obj['id'],
                                    title=obj['title'],
                                    summary=obj['summary'],
                                    article=obj['article'],
                                    keywords=obj['keywords'],
                                    date=obj['date'],
                                    path=filename)
                n += 1
            print("{}: {}".format(dirname, n))
            nt += n
    t2 = time.perf_counter()
    print("==============================")
    print("Docs: {}, Time: {:.2f}s".format(nt, (t2 - t1)))
    print("Writing index...")
    writer.commit()
    t3 = time.perf_counter()
    print("Total time: {:.2f}s".format(t3 - t1))
    print("==============================")
Example #20
0
 def __get_index_schema(self):
     """
     :return: ticket index schema
     """
     # Most columns are exact-match ID fields; group them to avoid repetition.
     id_fields = ('status', 'via', 'description', 'url', 'external_id',
                  'created_at', 'priority', 'due_at', 'id', 'type',
                  'subject')
     numeric_fields = ('assignee_id', 'submitter_id', 'organization_id')
     fields = {name: ID(stored=True) for name in id_fields}
     fields.update({name: NUMERIC(stored=True) for name in numeric_fields})
     fields['tags'] = KEYWORD(stored=True, commas=True)
     fields['has_incidents'] = BOOLEAN(stored=True)
     return Schema(**fields)
Example #21
0
 def _setup(self):
     """Initialise the redis handle, Whoosh schema/index and backing stores."""
     self._redis = getattr(self, '_redis', None)
     if not self._redis:
         self._redis = redis(
         )  # XXX test cases won't get correctly unpicked because of this
     # Free-text content plus three unique exact-match identifier fields.
     self.schema = Schema(content=NGRAMWORDS(stored=False))
     self.schema.add("object_id", ID(stored=True, unique=True))
     self.schema.add("entity_id", ID(stored=True, unique=True))
     self.schema.add('sha1', ID(stored=True, unique=True))
     # One keyword field per known attribute name.
     for a in list(ATTRS.keys()):
         self.schema.add(a, KEYWORD())
     self.objects = self.xml_dict('objects')
     self.parts = self.json_dict('parts')
     self.storage = FileStorage(os.path.join(self._dir, self._name))
     try:
         self.index = self.storage.open_index(schema=self.schema)
     except BaseException as ex:
         # Index missing or unreadable: recreate the storage, build a fresh
         # index and repopulate it.
         log.warn(ex)
         self.storage.create()
         self.index = self.storage.create_index(self.schema)
         self._reindex()
Example #22
0
def get_schema():
    """Build the Whoosh schema for forum posts.

    Uses a stemming analyzer with a custom stoplist for the free-text
    fields; ``sortable=True`` columns back result ordering.
    """
    analyzer = StemmingAnalyzer(stoplist=STOP)
    schema = Schema(title=TEXT(stored=True, analyzer=analyzer, sortable=True),
                    url=ID(stored=True),
                    content_length=NUMERIC(stored=True, sortable=True),
                    thread_votecount=NUMERIC(stored=True, sortable=True),
                    vote_count=NUMERIC(stored=True, sortable=True),
                    content=TEXT(stored=True, analyzer=analyzer, sortable=True),
                    tags=KEYWORD(stored=True, commas=True),
                    is_toplevel=BOOLEAN(stored=True),
                    lastedit_date=NUMERIC(stored=True, sortable=True),
                    rank=NUMERIC(stored=True, sortable=True),
                    author=TEXT(stored=True),
                    author_score=NUMERIC(stored=True, sortable=True),
                    author_handle=TEXT(stored=True),
                    author_uid=ID(stored=True),
                    author_url=ID(stored=True),
                    uid=ID(stored=True),
                    type=NUMERIC(stored=True, sortable=True),
                    type_display=TEXT(stored=True))
    return schema
Example #23
0
    def index_graph_description(self, index_name='graphs'):
        """(Re)build the Whoosh index of graph descriptions.

        One document per Graph that has at least one graph component:
        its id plus the serialised description keyword string.
        """
        from whoosh.fields import TEXT, ID, NGRAM, NUMERIC, KEYWORD
        from whoosh.analysis import StemmingAnalyzer, SimpleAnalyzer, IDAnalyzer
        from whoosh.analysis.filters import LowercaseFilter
        print 'Building %s index...' % index_name

        # build a single schema from the fields exposed by the different search
        # types
        print '\tSchema:'
        fields = {
            'gid': ID(stored=True),
            'description': KEYWORD(lowercase=True, scorable=True)
        }
        #fields = {'gid': ID(stored=True), 'description': TEXT(analyzer=SimpleAnalyzer(ur'[.\s]', True))}

        from whoosh.fields import Schema
        schema = Schema(**fields)

        # Create the index schema
        index = self.recreate_index(index_name, schema)

        # Add documents to the index
        print '\tWrite indexes:'
        writer = index.writer()
        c = 0
        from digipal.models import Graph
        # prefetch_related avoids one query per graph when serialising the
        # component/feature description below.
        for graph in Graph.objects.filter(
                graph_components__isnull=False).prefetch_related(
                    'graph_components', 'graph_components__component',
                    'graph_components__features').distinct():
            c += 1
            doc = {
                'gid': unicode(graph.id),
                'description': graph.get_serialised_description()
            }
            writer.add_document(**doc)

        print '\t\tIndex %d graphs' % c

        writer.commit()
Example #24
0
    def __init__(self, path, index):
        """Initializes the search engine.

        Opens the Whoosh index at *index*; if it is empty/missing, creates
        the schema and directory and indexes the document root first.

        Args:
            path: Path to document root to index
            index: Path to where the index will be placed.
        """
        self.path = path
        self.index = index
        # NOTE(review): this 2-4 ngram analyzer is applied only to the
        # 'category' field; 'name' uses a StemmingAnalyzer instead --
        # confirm that split is intentional.
        analyzer = NgramWordAnalyzer(2, 4)

        try:
            # Probe: if the index opens cleanly we do not need to rebuild it.
            ix = whoosh.index.open_dir(self.index)
            ix.close()
            create_index = False  # index seems to be working fine
        except whoosh.index.EmptyIndexError:
            create_index = True

        if create_index:
            schema = Schema(
                name=TEXT(stored=True, analyzer=StemmingAnalyzer()),
                link=TEXT(stored=True),
                category=KEYWORD(stored=True,
                                 scorable=True,
                                 commas=True,
                                 analyzer=analyzer),
                description=TEXT(stored=True),
            )

            if not os.path.isdir(self.index):
                os.mkdir(self.index)

            print("Creating index %s" % os.path.relpath(self.index))
            with contextlib.closing(whoosh.index.create_in(self.index,
                                                           schema)) as ix:
                self._index(ix, self.path)

        print("Opening index %s" % self.index)
        self.ix = whoosh.index.open_dir(self.index)
Example #25
0
    def handle(self, *args, **kwargs):
        """ Creates the index iterating over all the pages of the site """
        schema = Schema(pk=NUMERIC(unique=True, stored=True),
                        title=TEXT,
                        summary=TEXT,
                        tags=KEYWORD(commas=True, scorable=True),
                        pub_date=DATETIME(sortable=True))

        if not os.path.exists(settings.INDEX):
            os.mkdir(settings.INDEX)

        ix = create_in(settings.INDEX, schema)
        writer = ix.writer()
        for page in Page.objects.all():
            # Collapse the page's tag titles into the comma-separated form
            # the KEYWORD field expects.
            tag_titles = [tag.title for tag in page.tags.all()]
            writer.add_document(title=page.title,
                                summary=page.summary,
                                tags=",".join(tag_titles),
                                pk=page.pk,
                                pub_date=page.pub_date)
        writer.commit()
Example #26
0
 def __init__(self):
     """Open (or create) the "nkai" crawl index under ./indexfile."""
     self.indexDir = "./indexfile"
     if not os.path.exists(self.indexDir):
         os.mkdir(self.indexDir)
     # NOTE(review): NUMERIC(int, 32, ...) passes numtype/bits positionally
     # -- confirm against the Whoosh version in use.
     self.schema = Schema(url=TEXT(stored=True,
                                   analyzer=StemmingAnalyzer()),
                          title=TEXT(stored=True,
                                     analyzer=ChineseAnalyzer()),
                          content=TEXT(stored=True,
                                       analyzer=ChineseAnalyzer()),
                          anchors=KEYWORD(stored=True, commas=True),
                          pageRank=NUMERIC(int,
                                           32,
                                           sortable=True,
                                           stored=True))
     self.exists = index.exists_in(self.indexDir, indexname="nkai")
     if self.exists:
         self.index = index.open_dir(self.indexDir, indexname="nkai")
     else:
         self.index = index.create_in(self.indexDir,
                                      schema=self.schema,
                                      indexname="nkai")
Example #27
0
def get_index(api, recreate=False, must_exist=False):
    """Open (or create) the full-text search index for *api*.

    Args:
        api: object exposing ``ftsindex``, the index directory path.
        recreate: delete any existing index so a fresh one is built.
        must_exist: raise instead of creating when no index is present.

    Raises:
        ValueError: if *must_exist* is set and no index directory exists.
    """
    index_dir = api.ftsindex
    if index_dir.exists():
        if recreate:
            rmtree(index_dir)  # pragma: no cover
    elif must_exist:
        raise ValueError('No whoosh index found at {0}.'.format(index_dir))

    if not index_dir.exists():
        index_dir.mkdir()
        schema = Schema(id=ID(stored=True),
                        provider=KEYWORD(stored=True),
                        authoryear=TEXT(stored=True),
                        title=TEXT(analyzer=StemmingAnalyzer(), stored=True),
                        author=TEXT(stored=True),
                        year=TEXT(stored=True),
                        doctype=TEXT(stored=True),
                        lgcode=TEXT(stored=True),
                        body=TEXT(),
                        tags=KEYWORD)
        return index.create_in(index_dir.as_posix(), schema)
    return index.open_dir(index_dir.as_posix())
Example #28
0
def create_schema():
    """Return the Whoosh schema for proteomics result posts.

    STORED-only fields (condition, abun_t1..t5) are kept for display but
    are not searchable; text fields share the custom ``myanalyzer``.
    """
    schema = Schema(post_id=ID(stored=True),
                    condition=STORED,
                    accession=ID(stored=True, analyzer=myanalyzer),
                    description=TEXT(stored=True, analyzer=myanalyzer),
                    gene=KEYWORD(stored=True,
                                 scorable=True,
                                 commas=True,
                                 analyzer=myanalyzer),
                    fdr=TEXT(stored=True),
                    species=TEXT(stored=True),
                    mw=NUMERIC,
                    peptides=NUMERIC,
                    psm=NUMERIC,
                    uniq_peptides=NUMERIC,
                    abun_t1=STORED,
                    abun_t2=STORED,
                    abun_t3=STORED,
                    abun_t4=STORED,
                    abun_t5=STORED,
                    q_value=NUMERIC,
                    pep=NUMERIC)
    return schema
Example #29
0
 def __get_index_schema(self):
     """
     :return: user index schema
     """
     # Most columns are exact-match ID fields; group them to avoid repetition.
     id_fields = ('url', 'external_id', 'name', 'alias', 'created_at',
                  'locale', 'timezone', 'last_login_at', 'email', 'phone',
                  'signature', 'role')
     bool_fields = ('active', 'verified', 'shared', 'suspended')
     fields = {name: ID(stored=True) for name in id_fields}
     fields.update({name: BOOLEAN(stored=True) for name in bool_fields})
     fields['id'] = NUMERIC(stored=True)
     fields['organization_id'] = NUMERIC(stored=True)
     fields['tags'] = KEYWORD(stored=True, commas=True)
     return Schema(**fields)
Example #30
0
def cargar_correos():
    """Index every mail file found in ``dircorr`` into a Whoosh index.

    Returns:
        The number of documents indexed, or 0 when the source directory
        does not exist.
    """
    if not os.path.exists(dircorr):
        # FIX: previously this only printed the error and fell through,
        # crashing on os.listdir below; bail out early instead.
        print "Error: no existe el directorio de documentos: " + dircorr
        return 0
    if not os.path.exists(dirindexC):
        os.mkdir(dirindexC)

    schema = Schema(remitente=TEXT(stored=True),
                    destinatarios=KEYWORD(stored=True),
                    fecha=DATETIME,
                    asunto=TEXT(stored=True),
                    contenido=TEXT,
                    file=TEXT(stored=True))
    ixc = create_in(dirindexC, schema)
    writer = ixc.writer()
    count = 0
    for docname in os.listdir(dircorr):
        # Skip sub-directories; index only plain mail files.
        if not os.path.isdir(dircorr + docname):
            add_doc(writer, dircorr, docname)
            count += 1
    writer.commit()

    return count