Example #1
import os
from datetime import datetime

from whoosh import fields, index
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import Schema, TEXT, ID


def whooshFunction(dirdocs):
    crearTxt(dirdocs)  # helper defined elsewhere
    schema = Schema(categoria=TEXT(stored=True),
                    titulo=TEXT(stored=True),
                    enlace=ID(stored=True),
                    descripcion=TEXT(analyzer=StemmingAnalyzer()),
                    fecha=fields.DATETIME(stored=True))

    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")

    ix = index.create_in("indexdir", schema)
    writer = ix.writer()
    for docname in os.listdir(dirdocs):
        docpath = os.path.join(dirdocs, docname)
        if not os.path.isdir(docpath):
            with open(docpath, "r") as fileobj:
                cat = fileobj.readline().strip()
                tit = fileobj.readline().strip()
                enlc = fileobj.readline().strip()
                f = fileobj.readline().strip()
                fech = datetime.strptime(f, '%Y-%m-%d %H:%M:%S')
                descrp = fileobj.readline().strip()

            writer.add_document(categoria=cat,
                                titulo=tit,
                                enlace=enlc,
                                descripcion=descrp,
                                fecha=fech)
    writer.commit()
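
Note that index.create_in always builds a fresh index, discarding whatever was previously stored in "indexdir"; to reuse the index later without re-indexing, open it instead. A minimal sketch:

from whoosh import index

ix = index.open_dir("indexdir")
print(ix.doc_count())  # number of documents indexed by whooshFunction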
Example #2
def test_pickle_schema():
    from whoosh import analysis
    from whoosh.support.charset import accent_map
    from whoosh.compat import dumps

    freetext_analyzer = (analysis.StemmingAnalyzer()
                         | analysis.CharsetFilter(accent_map))

    schema = fields.Schema(path=fields.ID(stored=True, unique=True),
                           file_mtime=fields.DATETIME(stored=True),
                           name=fields.TEXT(stored=False, field_boost=2.0),
                           description=fields.TEXT(stored=False,
                                                   field_boost=1.5,
                                                   analyzer=freetext_analyzer),
                           content=fields.TEXT(analyzer=freetext_analyzer))

    # Try to make some sentences that will require stemming
    docs = [
        u"The rain in spain falls mainly in the plain",
        u"Plainly sitting on the plain",
        u"Imagine a greatly improved sentence here"
    ]

    with TempIndex(schema) as ix:
        with ix.writer() as w:
            for doc in docs:
                w.add_document(description=doc, content=doc)

        assert dumps(schema, 2)

        with ix.reader() as r:
            assert dumps(r.schema, 2)
Example #3
def test_highlight_daterange():
    from datetime import datetime

    schema = fields.Schema(id=fields.ID(unique=True, stored=True),
                           title=fields.TEXT(stored=True),
                           content=fields.TEXT(stored=True),
                           released=fields.DATETIME(stored=True))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.update_document(
        id=u('1'),
        title=u('Life Aquatic'),
        content=u('A nautic film crew sets out to kill a gigantic shark.'),
        released=datetime(2004, 12, 25)
    )
    w.update_document(
        id=u('2'),
        title=u('Darjeeling Limited'),
        content=u('Three brothers meet in India for a life changing train ' +
                  'journey.'),
        released=datetime(2007, 10, 27)
    )
    w.commit()

    s = ix.searcher()
    r = s.search(Term('content', u('train')), terms=True)
    assert len(r) == 1
    assert r[0]["id"] == "2"
    assert (r[0].highlights("content")
            == 'for a life changing <b class="match term0">train</b> journey')

    r = s.search(DateRange('released', datetime(2007, 1, 1), None))
    assert len(r) == 1
    assert r[0].highlights("content") == ''
Example #4
import os
import shutil
import datetime

from whoosh import fields
from whoosh.fields import Schema, KEYWORD, TEXT
from whoosh.index import create_in, open_dir


def borraCreaIndex():
    # Python 2 code: uses the built-in unicode()
    if os.path.exists("index"):
        shutil.rmtree("index")
    schema = Schema(remitente=KEYWORD(stored=True),
                    destinatarios=KEYWORD(stored=True),
                    fecha=fields.DATETIME(stored=True),
                    asunto=KEYWORD(stored=True),
                    cuerpo=TEXT(stored=True),
                    nombreFichero=KEYWORD(stored=True))
    if not os.path.exists("index"):
        os.mkdir("index")
        ix = create_in("index", schema)
    else:
        ix = open_dir("index")

    quedanFicheros = True
    numeroFichero = 1
    writer = ix.writer()
    while quedanFicheros:
        try:
            fichero = open("Correos/" + str(numeroFichero) + ".txt")
            texto = fichero.read()
            fichero.close()
            textoPorPartes = texto.split("\n", 4)
            fechaFormat = datetime.datetime.strptime(textoPorPartes[2].strip(), "%Y%m%d")

            writer.add_document(remitente=unicode(textoPorPartes[0]),
                                destinatarios=unicode(textoPorPartes[1]),
                                fecha=fechaFormat,
                                asunto=unicode(textoPorPartes[3]),
                                cuerpo=unicode(textoPorPartes[4]),
                                nombreFichero=unicode(str(numeroFichero) + ".txt"))
            numeroFichero = numeroFichero + 1
        except IOError:
            print("No more files")
            writer.commit()
            quedanFicheros = False
Example #5
    def create_index(self):
        if not os.path.exists("twitter_index"):
            os.mkdir("twitter_index")

        schema = fields.Schema(tweet_id=fields.TEXT(stored=True),
                               batch=fields.NUMERIC(stored=True),
                               content=fields.TEXT(stored=True),
                               posted=fields.DATETIME(stored=True),
                               owner_sn=fields.TEXT(stored=True),
                               owner_id=fields.TEXT(stored=True),
                               owner_name=fields.TEXT(stored=True),
                               isRT=fields.BOOLEAN(stored=True),
                               timesRT=fields.NUMERIC(stored=True),
                               timesFav=fields.NUMERIC(stored=True),
                               orig_timesRT=fields.NUMERIC(stored=True),
                               orig_timesFav=fields.NUMERIC(stored=True),
                               hashtags=fields.KEYWORD(stored=True),
                               orgnlTweet=fields.TEXT(stored=True),
                               mentions=fields.KEYWORD(stored=True),
                               media=fields.TEXT(stored=True),
                               url=fields.TEXT(stored=True),
                               liwc=fields.TEXT(stored=True))

        self.INDEX = index.create_in("twitter_index", schema, indexname="TWTTR")
        print("New searching index successfully created")

        return self.INDEX
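
Because create_index registers the index under the name "TWTTR", reopening it later must pass the same indexname. A minimal sketch, with hypothetical sample values:

from datetime import datetime
from whoosh import index

ix = index.open_dir("twitter_index", indexname="TWTTR")
with ix.writer() as writer:  # the context manager commits on exit
    writer.add_document(tweet_id=u"12345",
                        content=u"hello whoosh",
                        posted=datetime(2017, 9, 22),
                        isRT=False)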
Example #6
def test_datetime():
    dtf = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=dtf)
    st = RamStorage()
    ix = st.create_index(schema)

    w = ix.writer()
    for month in xrange(1, 12):
        for day in xrange(1, 28):
            w.add_document(id=u("%s-%s") % (month, day),
                           date=datetime(2010, month, day, 14, 0, 0))
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        r = s.search(qp.parse("date:20100523"))
        assert len(r) == 1
        assert r[0]["id"] == "5-23"
        assert r[0]["date"].__class__ is datetime
        assert r[0]["date"].month == 5
        assert r[0]["date"].day == 23

        r = s.search(qp.parse("date:'2010 02'"))
        assert len(r) == 27

        q = qp.parse(u("date:[2010-05 to 2010-08]"))
        startdt = datetime(2010, 5, 1, 0, 0, 0, 0)
        enddt = datetime(2010, 8, 31, 23, 59, 59, 999999)
        assert q.__class__ is query.NumericRange
        assert q.start == times.datetime_to_long(startdt)
        assert q.end == times.datetime_to_long(enddt)
Example #7
def whooshFunction(dirdocs):
    crearTxt(dirdocs)  # helper defined elsewhere
    schema = Schema(titulo=TEXT(stored=True),
                    fecha=fields.DATETIME(stored=True),
                    enlace=TEXT(stored=True),
                    resumen=TEXT(stored=True))

    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")

    ix = index.create_in("indexdir", schema)
    writer = ix.writer()
    for docname in os.listdir(dirdocs):
        docpath = os.path.join(dirdocs, docname)
        if not os.path.isdir(docpath):
            with open(docpath, "r") as fileobj:
                tit = fileobj.readline().strip()
                f = fileobj.readline().strip()
                fech = datetime.strptime(f, '%d/%m/%Y - %H:%M')
                enl = fileobj.readline().strip()
                res = fileobj.readline().strip()

            writer.add_document(titulo=tit,
                                fecha=fech,
                                enlace=enl,
                                resumen=res)
    writer.commit()
Example #8
def get_schema(model, analyzer):
    schema = {}
    primary = None
    searchable = set(getattr(model, '__searchable__', []))

    for field in model.__table__.columns:
        # primary key id
        if field.primary_key:
            schema[field.name] = whoosh_fields.ID(stored=True,
                                                  unique=True,
                                                  sortable=True)
            primary = field.name

        if field.name not in searchable:
            continue

        # text types
        if isinstance(field.type, TEXT_TYPES):
            schema[field.name] = whoosh_fields.TEXT(analyzer=analyzer)

        elif isinstance(field.type, DATE_TYPES):
            is_unique = getattr(field, 'unique', False)
            schema[field.name] = whoosh_fields.DATETIME(unique=is_unique)

        elif isinstance(field.type, sql_types.Boolean):
            schema[field.name] = whoosh_fields.BOOLEAN()

        else:
            raise WhooshAlchemyError('cannot index column of type %s' %
                                     field.type)

    return whoosh_fields.Schema(**schema), primary
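
get_schema relies on module-level TEXT_TYPES and DATE_TYPES tuples of SQLAlchemy column types. A plausible definition, plus a hypothetical model to illustrate the call:

import sqlalchemy.types as sql_types

TEXT_TYPES = (sql_types.String, sql_types.Text, sql_types.Unicode)
DATE_TYPES = (sql_types.DateTime, sql_types.Date)

# Hypothetical usage with a declarative model:
#   class Post(Base):
#       __tablename__ = 'posts'
#       __searchable__ = ['title', 'published']
#       id = Column(Integer, primary_key=True)
#       title = Column(String(200))
#       published = Column(DateTime)
#
#   schema, pk = get_schema(Post, StemmingAnalyzer())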
Example #9
def test_bigsort():
    times = 30000
    dirname = "testindex"
    
    df = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=df)
    
    if os.path.exists(dirname):
        shutil.rmtree(dirname)
    os.mkdir(dirname)
    ix = index.create_in(dirname, schema)
    
    print("Writing...")
    t = now()
    w = ix.writer(limitmb=512)
    for i in xrange(times):
        dt = datetime.fromtimestamp(random.randint(15839593, 1294102139))
        w.add_document(id=text_type(i), date=dt)
    w.commit()
    print("Writing took ", now() - t)
    
    ix = index.open_dir(dirname)
    s = ix.searcher()
    q = query.Wildcard("id", "1?2*")
    
    t = now()
    x = list(df.sortable_values(s.reader(), "date"))
    print(now() - t, len(x))
    
    t = now()
    for y in x:
        p = list(s.postings("date", y).all_ids())
    print(now() - t)

    t = now()
    r = s.search(q, limit=25, sortedby="date", reverse=True)
    print("Search 1 took", now() - t)
    print("len=", r.scored_length())
    
    t = now()
    r = s.search(q, limit=25, sortedby="date")
    print("Search 2 took", now() - t)
    
    t = now()
    r = s.search(q, limit=25, sortedby="date")  # same query repeated
    print("Search 2 (repeat) took", now() - t)
    
    from heapq import nlargest
    t = now()
    sf = s.stored_fields
    gen = ((sf(n)["date"], n) for n in q.docs(s))
    r = nlargest(25, gen)
    print(now() - t)
Example #10
def _init_schema():
    schema = fields.Schema()
    schema.add("id", fields.ID(unique=True, stored=True))
    schema.add("short_id", fields.ID(stored=True))
    schema.add("status", fields.ID(stored=True))
    schema.add("started", fields.DATETIME(stored=True))
    schema.add("stopped", fields.DATETIME(stored=True))
    schema.add("pkg_type", fields.ID(stored=True))
    schema.add("pkg_name", fields.ID(stored=True))
    schema.add("pkg_version", fields.ID(stored=True))
    schema.add("model_name", fields.ID(stored=True))
    schema.add("op_name", fields.ID(stored=True))
    schema.add("label", fields.TEXT(stored=True))
    schema.add("scalar_*", fields.NUMERIC(float, stored=True), glob=True)
    schema.add("flagi_*", fields.NUMERIC(int, stored=True), glob=True)
    schema.add("flagf_*", fields.NUMERIC(float, stored=True), glob=True)
    schema.add("flagb_*", fields.BOOLEAN(stored=True), glob=True)
    schema.add("flags_*", fields.ID(stored=True), glob=True)
    schema.add("priv_*", fields.STORED, glob=True)
    return schema
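
The glob=True entries declare dynamic fields: any field name matching the pattern is accepted at index time. A minimal sketch of writing to such a schema (the values are hypothetical):

from whoosh.filedb.filestore import RamStorage

ix = RamStorage().create_index(_init_schema())
with ix.writer() as w:
    w.add_document(id=u"run-1",
                   label=u"baseline",
                   scalar_loss=0.42,    # matched by the scalar_* glob
                   flagb_verbose=True)  # matched by the flagb_* glob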
Example #11
class WorkspaceSchema(fields.SchemaClass):

    id = fields.ID(stored=True, unique=True)
    owner = fields.TEXT(stored=True, spelling=True)
    name = fields.TEXT(stored=True, spelling=True)
    description = fields.NGRAM(stored=True, minsize=1, phrase=True)
    lastmodified = fields.DATETIME(stored=True)
    longdescription = fields.NGRAM(stored=True, minsize=1, phrase=True)
    public = fields.BOOLEAN(stored=True)
    users = fields.KEYWORD(commas=True)
    groups = fields.KEYWORD(commas=True)
    shared = fields.BOOLEAN(stored=True)
Example #12
    def test_query_schema_is_setup_correctly(self):
        # Given
        p = Project(name='test', path=self.root)

        # When
        p.scan()

        # Then
        schema = p._query_parser.schema
        items = schema.items()
        from whoosh import fields
        self.assertIn(('path', fields.TEXT()), items)
        self.assertIn(('ctime', fields.DATETIME()), items)
        self.assertIn(('completed', fields.BOOLEAN()), items)
        self.assertIn(('size', INT), items)
Example #13
    def get_index(self):
        stem_ana = analysis.StemmingAnalyzer()
        schema = fields.Schema(
            id=fields.ID(unique=True),
            datetime=fields.DATETIME(sortable=True),
            reply=fields.BOOLEAN,
            retweet=fields.BOOLEAN,
            text=fields.TEXT(analyzer=stem_ana, stored=True)
        )
        index_dir = os.path.join(self.dir, "index")
        if os.path.exists(index_dir):
            self.index = index.open_dir(index_dir)
        else:
            os.mkdir(index_dir)
            self.index = index.create_in(index_dir, schema)
Example #14
def test_nontext_update():
    schema = fields.Schema(id=fields.STORED, num=fields.NUMERIC(unique=True),
                           date=fields.DATETIME(unique=True))
    ix = RamStorage().create_index(schema)

    dt = datetime.now()
    w = ix.writer()
    for i in xrange(10):
        w.add_document(id=i, num=i, date=dt + timedelta(days=i))
    w.commit()

    w = ix.writer()
    w.update_document(num=8, id="a")
    w.update_document(num=2, id="b")
    w.update_document(num=4, id="c")
    w.update_document(date=dt + timedelta(days=5), id="d")
    w.update_document(date=dt + timedelta(days=1), id="e")
    w.update_document(date=dt + timedelta(days=7), id="f")
    w.commit()
Example #15
class TweetSchema(fields.SchemaClass):
    id = fields.ID(stored=True, unique=True)
    url = fields.ID(stored=True, unique=True)

    text = fields.TEXT(stored=True)
    source = fields.TEXT(stored=True)

    reply = fields.BOOLEAN(stored=True)
    in_reply_to_id = fields.TEXT(stored=True)
    in_reply_to_name = fields.TEXT(stored=True)

    user_mentions = fields.KEYWORD(stored=True)
    hashtags = fields.KEYWORD(stored=True)
    urls = fields.KEYWORD(stored=True)

    geo = fields.BOOLEAN(stored=True)
    latitude = fields.NUMERIC(stored=True)
    longitude = fields.NUMERIC(stored=True)

    date = fields.DATETIME(stored=True)
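
Because TweetSchema subclasses fields.SchemaClass, the class itself can be passed to create_in, which instantiates it. A one-line sketch (the directory name is hypothetical and must already exist):

from whoosh import index

ix = index.create_in("tweets_index", TweetSchema)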
Example #16
    def _setup_index(self):
        schema = fields.Schema(path=fields.ID(stored=True),
                               content=fields.TEXT(stored=True),
                               date=fields.DATETIME(stored=True,
                                                    sortable=True))
        indexpath = os.path.join(fs.adirs.user_cache_dir, "index",
                                 self.channel)
        if not os.path.exists(indexpath):
            os.makedirs(indexpath)
        ix = create_in(indexpath, schema)
        writer = ix.writer(procs=self.indexer_procs)
        for name in os.listdir(self.log_dir):
            if name.startswith(self.channel + ".") and name.endswith(".yaml"):
                c, date = self._fields_from_yaml(name)
                writer.add_document(path=name, content=c, date=date)
        writer.commit()
        self.last_index_update = time.time()
        self.ix = ix
        lc = LoopingCall(self.update_index)
        reactor.callFromThread(lc.start, 30, now=False)
Example #17
def test_open_date_ranges():
    basedate = datetime(2011, 1, 24, 6, 25, 0, 0)
    domain = [basedate + timedelta(days=n) for n in xrange(-20, 20)]

    schema = fields.Schema(date=fields.DATETIME(stored=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    for d in domain:
        w.add_document(date=d)
    w.commit()

    with ix.searcher() as s:
        # Without date parser
        qp = qparser.QueryParser("date", schema)
        q = qp.parse("[2011-01-10 to]")
        r = [hit["date"] for hit in s.search(q, limit=None)]
        assert len(r) > 0
        target = [d for d in domain if d >= datetime(2011, 1, 10, 6, 25)]
        assert_equal(r, target)

        q = qp.parse("[to 2011-01-30]")
        r = [hit["date"] for hit in s.search(q, limit=None)]
        assert len(r) > 0
        target = [d for d in domain if d <= datetime(2011, 1, 30, 6, 25)]
        assert_equal(r, target)

        # With date parser
        from whoosh.qparser.dateparse import DateParserPlugin
        qp.add_plugin(DateParserPlugin(basedate))

        q = qp.parse("[10 jan 2011 to]")
        r = [hit["date"] for hit in s.search(q, limit=None)]
        assert len(r) > 0
        target = [d for d in domain if d >= datetime(2011, 1, 10, 6, 25)]
        assert_equal(r, target)

        q = qp.parse("[to 30 jan 2011]")
        r = [hit["date"] for hit in s.search(q, limit=None)]
        assert len(r) > 0
        target = [d for d in domain if d <= datetime(2011, 1, 30, 6, 25)]
        assert_equal(r, target)
Example #18
class PydocSchema(fields.SchemaClass):
    path = fields.STORED

    title = fields.TEXT(stored=True,
                        sortable=True,
                        spelling=True,
                        analyzer=ana)
    tgrams = fields.NGRAMWORDS

    content = fields.TEXT(spelling=True, analyzer=ana)

    chapter = fields.ID(sortable=True)

    size = fields.NUMERIC(sortable=True)
    rev = fields.NUMERIC(sortable=True)
    revised = fields.DATETIME(sortable=True)

    modref = fields.TEXT(analyzer=tech_ana, phrase=False)
    clsref = fields.TEXT(analyzer=tech_ana, phrase=False)
    funcref = fields.TEXT(analyzer=tech_ana, phrase=False)
    pep = fields.TEXT(analyzer=tech_ana, phrase=False)

    cls = fields.TEXT(analyzer=cls_ana)
    mod = fields.TEXT(analyzer=tech_ana, phrase=False)
Example #19
def get_schema():
    return fields.Schema(titulo=fields.TEXT(stored=True),
                         start=fields.DATETIME(stored=True),
                         end=fields.DATETIME(stored=True),
                         descripcion=fields.TEXT(stored=True),
                         categoria=fields.TEXT(stored=True))
Example #20
import hashlib

import whoosh.fields as F

# This schema defines the structure of a single knowhow snippet.
SCHEMA = F.Schema(
    # unique identifier
    id=F.ID(unique=True, stored=True),
    # a multi-valued analyzed field
    tag=F.KEYWORD(stored=True, field_boost=2.0),
    # the text content of the snippet
    content=F.TEXT(stored=True),
    # all searchable fields, for use as a default field
    text=F.TEXT(stored=False),
    # when the snippet was last modified
    updated=F.DATETIME(stored=True),
)

# Function to create a hasher object for generating id of a snippet.
IdGenerator = hashlib.sha256

# The number of hexadecimal characters in an id
ID_LENGTH = IdGenerator().digest_size * 2


def identifier(doc):
    """
    Generate a unique identifier based solely on the content of the document.

    This doesn't take tags or anything else into account, because the content
    is what really matters. This means that adding the same content with
    different tags yields the same identifier.
    """
    # Plausible body for the truncated original; assumes the snippet text
    # is stored under the 'content' key.
    hasher = IdGenerator()
    hasher.update(doc['content'].encode('utf-8'))
    return hasher.hexdigest()
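
Since id is unique in SCHEMA, pairing identifier() with update_document makes re-indexing a snippet idempotent. A hypothetical helper along those lines:

from datetime import datetime

def add_snippet(writer, content, tags=u''):
    # update_document replaces any existing document with the same unique id
    writer.update_document(id=u"%s" % identifier({'content': content}),
                           tag=tags,
                           content=content,
                           text=content,
                           updated=datetime.now())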
Example #21
    def open_index(self, index_folder, create_new=False):
        """
        Create a schema,
        and create/open a search index
        that lives on disk.
        """
        self.index_folder = index_folder
        if create_new:
            if os.path.exists(index_folder):
                shutil.rmtree(index_folder)
                print("deleted index folder: " + index_folder)

        if not os.path.exists(index_folder):
            os.mkdir(index_folder)

        exists = index.exists_in(index_folder)

        # Alternative analyzers: a bare StemmingAnalyzer(), or one that also
        # chains a StopFilter.
        stemming_analyzer = StemmingAnalyzer() | LowercaseFilter()

        # ------------------------------
        # This is where the search index's document schema
        # is defined.

        schema = Schema(
                id=fields.ID(stored=True, unique=True),
                kind=fields.ID(stored=True),

                created_time=fields.DATETIME(stored=True),
                modified_time=fields.DATETIME(stored=True),
                indexed_time=fields.DATETIME(stored=True),

                title=fields.TEXT(stored=True, field_boost=100.0),

                url=fields.ID(stored=True),

                mimetype=fields.TEXT(stored=True),

                owner_email=fields.ID(stored=True),
                owner_name=fields.TEXT(stored=True),

                # mainly for email threads, groups.io, hypothesis
                group=fields.ID(stored=True),

                repo_name=fields.TEXT(stored=True),
                repo_url=fields.ID(stored=True),
                github_user=fields.TEXT(stored=True),

                tags=fields.KEYWORD(commas=True,
                                    stored=True,
                                    lowercase=True),

                # comments only
                issue_title=fields.TEXT(stored=True, field_boost=100.0),
                issue_url=fields.ID(stored=True),

                content=fields.TEXT(stored=True, analyzer=stemming_analyzer)
        )


        # Now that we have a schema,
        # make an index!
        if not exists:
            self.ix = index.create_in(index_folder, schema)
        else:
            self.ix = index.open_dir(index_folder)
Example #22
import os
from datetime import datetime

from whoosh import fields, index
'''Equivalent declarative form:
class whooshSCHEMA(fields.SchemaClass):
    title = fields.TEXT(stored=True, sortable=True)
    content = fields.TEXT(spelling=True)
    date = fields.DATETIME(stored=True)
    summary = fields.STORED
    url = fields.ID(stored=True, unique=True)'''

WHOOSH_SCHEMA = fields.Schema(title=fields.TEXT(stored=True, sortable=True),
                              content=fields.TEXT(spelling=True),
                              date=fields.DATETIME(stored=True),
                              summary=fields.STORED,
                              url=fields.ID(stored=True, unique=True))

# To create an index you basically need a writer object;
# the index directory must exist before create_in is called.
if not os.path.exists("index"):
    os.mkdir("index")
ix = index.create_in("index", schema=WHOOSH_SCHEMA)
writer = ix.writer()

writer.add_document(title="pycones 2017",
                    content="python conference",
                    date=datetime(2017, 9, 22),
                    summary="discovering python search engine",
                    url="http://pycones.es")

writer.add_document(title="python 2017",
                    content="pycones2017",
                    date=datetime(2017, 9, 22),
                    summary="discovering python search engine",
                    url="http://pycones.es")