Example #1
    def handle(self, *args, **options):
        # Optionally rebuild indices from scratch
        if options['clean']:
            create_source_index()
            for lang in Language.objects.have_translation():
                create_target_index(lang=lang.code)

        # Open writer
        source_writer = BufferedWriter(get_source_index())
        target_writers = {}

        try:
            # Process all units
            for unit in self.iterate_units(*args, **options):
                # Update source index
                update_source_unit_index(source_writer, unit)
                # Update target index, lazily opening one writer per language
                if unit.translation:
                    lang = unit.translation.language.code
                    if lang not in target_writers:
                        target_writers[lang] = BufferedWriter(
                            get_target_index(lang)
                        )
                    update_target_unit_index(target_writers[lang], unit)

        finally:
            # Close all writers
            source_writer.close()
            for lang in target_writers:
                target_writers[lang].close()
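
Every example on this page follows the same lifecycle: open one BufferedWriter per index, feed it documents, and close it in a finally block so anything still buffered gets committed. A minimal, self-contained sketch of that pattern; the schema, field names, and "indexdir" directory are illustrative, not taken from the example above:

import os

from whoosh import fields, index
from whoosh.writing import BufferedWriter

# Illustrative schema; 'pk' mirrors the unique key used in the examples
schema = fields.Schema(pk=fields.ID(stored=True, unique=True),
                       text=fields.TEXT)
os.makedirs("indexdir", exist_ok=True)
ix = index.create_in("indexdir", schema)

writer = BufferedWriter(ix, limit=100)  # auto-commits every 100 documents
try:
    writer.update_document(pk=u"1", text=u"hello world")
finally:
    writer.close()  # commits anything still buffered and releases the lock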
Example #2
def update_index(units, source_units=None):
    '''
    Updates fulltext index for given set of units.
    '''
    languages = Language.objects.all()

    # Default to same set for both updates
    if source_units is None:
        source_units = units

    # Update source index
    index = get_source_index()
    writer = BufferedWriter(index)
    try:
        for unit in source_units.iterator():
            update_source_unit_index(writer, unit)
    finally:
        writer.close()

    # Update per language indices
    for lang in languages:
        index = get_target_index(lang.code)
        writer = BufferedWriter(index)
        try:
            language_units = units.filter(translation__language=lang).exclude(
                target='')

            for unit in language_units.iterator():
                update_target_unit_index(writer, unit)
        finally:
            writer.close()
Example #3
def update_index(units):
    """Update fulltext index for given set of units."""
    languages = Language.objects.have_translation()

    # Update source index
    if units.exists():
        index = get_source_index()
        writer = BufferedWriter(index)
        try:
            for unit in units.iterator():
                update_source_unit_index(writer, unit)
        finally:
            writer.close()

    # Update per language indices
    for lang in languages:
        language_units = units.filter(translation__language=lang).exclude(
            target='')

        if language_units.exists():
            index = get_target_index(lang.code)
            writer = BufferedWriter(index)
            try:
                for unit in language_units.iterator():
                    update_target_unit_index(writer, unit)
            finally:
                writer.close()
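
In this and the previous examples, units is a Django queryset (the code calls .filter() and .iterator() on it). A hedged sketch of a typical call site; the Unit model and the filter are hypothetical:

# Hypothetical queryset; only the update_index() call is from above
units = Unit.objects.filter(translation__subproject__project=project)
update_index(units)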
Example #4
def main():
    queue = HotQueue(main_config.INDEX_QUEUE,
                     host=main_config.REDIS_HOST,
                     port=main_config.REDIS_PORT)
    index = get_index(main_config.WHOOSH_INDEX_DIR)
    writer = BufferedWriter(index, limit=10)
    try:
        for doc_id in queue.consume():
            print("looking at {}".format(doc_id))
            doc = Document.query.get(doc_id)
            if doc:
                write_doc(doc, writer)
            else:
                print("no doc with doc_id {}".format(doc_id))
    finally:
        writer.close()
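
The consumer above blocks on queue.consume() until document ids arrive. A sketch of the matching producer side, reusing the main_config names from the example; put() is HotQueue's standard enqueue call:

from hotqueue import HotQueue

queue = HotQueue(main_config.INDEX_QUEUE,
                 host=main_config.REDIS_HOST,
                 port=main_config.REDIS_PORT)
queue.put(doc.id)  # doc is a hypothetical saved Document instance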
Example #5
def source_writer(self, buffered=True):
    '''
    Returns source index writer (by default buffered).
    '''
    if not buffered:
        return self.source().writer()
    if self._source_writer is None:
        self._source_writer = BufferedWriter(self.source())
    return self._source_writer
Example #6
def target_writer(self, lang, buffered=True):
    '''
    Returns target index writer (by default buffered) for given language.
    '''
    if not buffered:
        return self.target(lang).writer()
    if lang not in self._target_writer:
        self._target_writer[lang] = BufferedWriter(self.target(lang))
    return self._target_writer[lang]
Example #7
    def update_index(self, units):
        """Update fulltext index for given set of units."""

        # Update source index
        index = self.get_source_index()
        with BufferedWriter(index) as writer:
            for unit in units:
                self.update_source_unit_index(writer, unit)

        languages = {unit['language'] for unit in units}

        # Update per language indices
        for language in languages:
            index = self.get_target_index(language)
            with BufferedWriter(index) as writer:
                for unit in units:
                    if unit['language'] != language:
                        continue
                    self.update_target_unit_index(writer, unit)
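
Unlike the queryset-based variants, this version treats units as a sequence of mappings keyed by 'language'. A hedged sketch of the expected input shape; every field except 'language' is a guess:

units = [
    {'pk': 1, 'language': 'cs', 'source': 'Hello', 'target': 'Ahoj'},
    {'pk': 2, 'language': 'de', 'source': 'Hello', 'target': 'Hallo'},
]
indexer.update_index(units)  # indexer is a hypothetical instance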
Example #8
    def update_index(self, units):
        """Update fulltext index for given set of units."""
        languages = Language.objects.have_translation()

        # Update source index
        if units.exists():
            index = self.get_source_index()
            with BufferedWriter(index) as writer:
                for unit in units.iterator():
                    self.update_source_unit_index(writer, unit)

        # Update per language indices
        for lang in languages:
            language_units = units.filter(translation__language=lang).exclude(
                target='')

            if language_units.exists():
                index = self.get_target_index(lang.code)
                with BufferedWriter(index) as writer:
                    for unit in language_units.iterator():
                        self.update_target_unit_index(writer, unit)
Example #9
def update_index(units, source_units=None):
    '''
    Updates fulltext index for given set of units.
    '''
    languages = Language.objects.have_translation()

    # Default to same set for both updates
    if source_units is None:
        source_units = units

    # Update source index
    index = get_source_index()
    writer = BufferedWriter(index)
    try:
        for unit in source_units.iterator():
            update_source_unit_index(writer, unit)
    finally:
        writer.close()

    # Update per language indices
    for lang in languages:
        index = get_target_index(lang.code)
        writer = BufferedWriter(index)
        try:
            language_units = units.filter(
                translation__language=lang
            ).exclude(
                target=''
            )

            for unit in language_units.iterator():
                update_target_unit_index(writer, unit)
        finally:
            writer.close()
Example #10
def update_index(units):
    """Update fulltext index for given set of units."""
    languages = Language.objects.have_translation()

    # Update source index
    if units.exists():
        index = get_source_index()
        writer = BufferedWriter(index)
        try:
            for unit in units.iterator():
                update_source_unit_index(writer, unit)
        finally:
            writer.close()

    # Update per language indices
    for lang in languages:
        language_units = units.filter(
            translation__language=lang
        ).exclude(
            target=''
        )

        if language_units.exists():
            index = get_target_index(lang.code)
            writer = BufferedWriter(index)
            try:
                for unit in language_units.iterator():
                    update_target_unit_index(writer, unit)
            finally:
                writer.close()
Example #11
def __call__(self, pipeline):
    self._writer = BufferedWriter(self._indexer, period=10, limit=1000)
    try:
        self.flush_thread.start()
        for event in pipeline:
            self.count += 1
            self._writer.add_document(source=unicode(event["source"]),
                                      name=unicode(event["index"]),
                                      raw=unicode(event["_raw"]),
                                      time=int(event.time),
                                      hash=unicode(event.hash))
    finally:
        self.flush()
Example #12
def delete_search_units(source_units, languages):
    '''
    Delete fulltext index for given set of units.
    '''
    # Update source index
    if source_units:
        index = get_source_index()
        writer = BufferedWriter(index)
        try:
            for pk in source_units:
                writer.delete_by_term('pk', pk)
        finally:
            writer.close()

    for lang, units in languages.items():
        index = get_target_index(lang)
        writer = BufferedWriter(index)
        try:
            for pk in units:
                writer.delete_by_term('pk', pk)
        finally:
            writer.close()
Example #13
    def daemon(self):
        """
        Daemon function
        Continuosly watches self.directory for
        changes and adds them to the index.
        """
        ix = self.get_ix()
        writer = BufferedWriter(ix, limit=100)
        event_handler = IndexWriterEventHandler(writer, self, self.all,
                                                self.exclude, self.include,
                                                )
        observer = Observer()
        observer.schedule(event_handler, path=self.directory, recursive=True)
        observer.start()

        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            observer.stop()
            writer.commit()
            ix.close()
        observer.join()
Example #14
def bootstrap_index(dirname=None, indexname=None):
    """
    Create spam index and add one post from the
    """
    if dirname and indexname:
        ix = search.init_index(dirname=dirname, indexname=indexname, schema=spam_schema())
    else:
        ix = init_spam_index()

    writer = BufferedWriter(ix)
    # Write text to index.
    index_writer(writer=writer, title="Placeholder",
                 content_length=0, is_spam=True,
                 content='CONTENT', uid=STARTER_UID)
    writer.commit()
    writer.close()

    return ix
Example #15
def test_20000_buffered():
    from whoosh.writing import BufferedWriter

    sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(sc, "20000buffered") as ix:
        domain = ["alfa", "bravo", "charlie", "delta", "echo", "foxtrot",
                  "golf", "hotel", "india", "juliet", "kilo", "lima"]

        t = now()
        w = BufferedWriter(ix, limit=100, period=None)
        for i in xrange(20000):
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
        w.close()
        print("Write buffered:", now() - t)

        t = now()
        ix.optimize()
        print("Optimize buffered:", now() - t)
Example #16
class IndexWhoosh(Index):
    """Implements the whoosh engine as indexer."""

    query_hash = QueryParser("hash", schema=SCHEMA)

    def create(self):
        self._indexer = create_in(self.path, SCHEMA)
        self._index = self._indexer

    def open(self, ro=False):
        if os.path.isdir(self.path):
            self._index = open_dir(self.path)
            self._indexer = self._index
        else:
            os.mkdir(self.path)
            self.create()

        self._searcher = self._index.searcher()
        self._opened = True

    def set_metadata(self, name, value):
        with open(os.path.join(self.path, "metadata-%s" % name), "w") as f:
            f.write(pickle.dumps(value))

    def get_metadata(self, name, default=None):
        try:
            with open(os.path.join(self.path, "metadata-%s" % name)) as f:
                return pickle.loads(f.read())
        except Exception:
            return default

    def _timer(self):
        while True:
            self.flush()
            sleep(self.flush_time)

    def __init__(self, path, size=None, rows=None, flush_time=10, *args, **kwargs):
        self._opened = False
        self._index = None
        self._writer = None
        self.path = os.path.join(DEFAULT_INDEX_PATH, path)
        self.flush_time = flush_time
        self.flush_thread = threading.Thread(target=self._timer)
        self.open()
        self.count = 0

    def flush(self):
        if getattr(self, "callback_flush", None):
            self.callback_flush(self)

        if self._writer is not None:
            self._writer.commit()
            self.count = 0

    def is_indexed(self, hash):
        return self._searcher.search(self.query_hash.parse(unicode(hash))).estimated_length() > 0

    def __call__(self, pipeline):
        self._writer = BufferedWriter(self._indexer, period=10, limit=1000)
        try:
            self.flush_thread.start()
            for event in pipeline:
                self.count += 1
                self._writer.add_document(source=unicode(event["source"]),
                                          name=unicode(event["index"]),
                                          raw=unicode(event["_raw"]),
                                          time=int(event.time),
                                          hash=unicode(event.hash))
        finally:
            self.flush()

    def search(self, expr, limit=10000):
        with self._index.searcher() as searcher:
            query = QueryParser("raw", self._index.schema)
            query.add_plugin(FieldsPlugin())
            query.add_plugin(RangePlugin())
            query.add_plugin(GtLtPlugin())
            query.add_plugin(WildcardPlugin())
            query = query.parse(expr)
            for x in searcher.search(query, limit=limit):
                yield x

    def __iter__(self):
        for x in self.search(u"*", None):
            yield x["raw"]
Example #17
class Index(object):
    '''
    Class to manage index readers and writers.
    '''

    _source = None
    _target = {}
    _source_writer = None
    _target_writer = {}

    def source(self):
        '''
        Returns source index.
        '''
        if self._source is None:
            try:
                self._source = open_dir(
                    appsettings.WHOOSH_INDEX,
                    indexname='source'
                )
            except (whoosh.index.EmptyIndexError, IOError):
                # e.g. the index is empty or the path does not exist
                self._source = create_source_index()
        return self._source

    def target(self, lang):
        '''
        Returns target index for given language.
        '''
        if lang not in self._target:
            try:
                self._target[lang] = open_dir(
                    appsettings.WHOOSH_INDEX,
                    indexname='target-%s' % lang
                )
            except whoosh.index.EmptyIndexError:
                self._target[lang] = create_target_index(lang)
        return self._target[lang]

    def source_writer(self, buffered=True):
        '''
        Returns source index writer (by default buffered).
        '''
        if not buffered:
            return self.source().writer()
        if self._source_writer is None:
            self._source_writer = BufferedWriter(self.source())
        return self._source_writer

    def target_writer(self, lang, buffered=True):
        '''
        Returns target index writer (by default buffered) for given language.
        '''
        if not buffered:
            return self.target(lang).writer()
        if lang not in self._target_writer:
            self._target_writer[lang] = BufferedWriter(self.target(lang))
        return self._target_writer[lang]

    def source_searcher(self, buffered=True):
        '''
        Returns source index searcher (on buffered writer).
        '''
        if not buffered:
            return self.source().searcher()
        return self.source_writer(buffered).searcher()

    def target_searcher(self, lang, buffered=True):
        '''
        Returns target index searcher (on buffered writer) for given language.
        '''
        if not buffered:
            return self.target(lang).searcher()
        return self.target_writer(lang, buffered).searcher()

    def commit(self):
        '''
        Commits pending changes.
        '''
        if self._source_writer is not None:
            self._source_writer.commit()
        for lang in self._target_writer:
            self._target_writer[lang].commit()
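
A short usage sketch for the Index helper above; the unit object and the field names passed to update_document() are assumptions, while the method calls match the class as defined:

fulltext = Index()

writer = fulltext.source_writer()      # buffered, cached on the instance
writer.update_document(pk=unicode(unit.pk), source=unit.source)

# Searches go through the buffered writer, so uncommitted documents are visible
searcher = fulltext.source_searcher()

fulltext.commit()                      # flush pending changes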
Example #18
class IndexWhoosh(Index):
    """Implements the whoosh engine as indexer."""

    query_hash = QueryParser("hash", schema=SCHEMA)

    def create(self):
        self._indexer = create_in(self.path, SCHEMA)
        self._index = self._indexer

    def open(self, ro=False):
        if os.path.isdir(self.path):
            self._index = open_dir(self.path)
            self._indexer = self._index
        else:
            os.mkdir(self.path)
            self.create()

        self._searcher = self._index.searcher()
        self._opened = True

    def set_metadata(self, name, value):
        with open(os.path.join(self.path, "metadata-%s" % name), "w") as f:
            f.write(pickle.dumps(value))

    def get_metadata(self, name, default=None):
        try:
            with open(os.path.join(self.path, "metadata-%s" % name)) as f:
                return pickle.loads(f.read())
        except Exception:
            return default

    def _timer(self):
        while True:
            self.flush()
            sleep(self.flush_time)

    def __init__(self,
                 path,
                 size=None,
                 rows=None,
                 flush_time=10,
                 *args,
                 **kwargs):
        self._opened = False
        self._index = None
        self._writer = None
        self.path = os.path.join(DEFAULT_INDEX_PATH, path)
        self.flush_time = flush_time
        self.flush_thread = threading.Thread(target=self._timer)
        self.open()
        self.count = 0

    def flush(self):
        if getattr(self, "callback_flush", None):
            self.callback_flush(self)

        if self._writer is not None:
            self._writer.commit()
            self.count = 0

    def is_indexed(self, hash):
        return self._searcher.search(self.query_hash.parse(
            unicode(hash))).estimated_length() > 0

    def __call__(self, pipeline):
        self._writer = BufferedWriter(self._indexer, period=10, limit=1000)
        try:
            self.flush_thread.start()
            for event in pipeline:
                self.count += 1
                self._writer.add_document(source=unicode(event["source"]),
                                          name=unicode(event["index"]),
                                          raw=unicode(event["_raw"]),
                                          time=int(event.time),
                                          hash=unicode(event.hash))
        finally:
            self.flush()

    def search(self, expr, limit=10000):
        with self._index.searcher() as searcher:
            query = QueryParser("raw", self._index.schema)
            query.add_plugin(FieldsPlugin())
            query.add_plugin(RangePlugin())
            query.add_plugin(GtLtPlugin())
            query.add_plugin(WildcardPlugin())
            query = query.parse(expr)
            for x in searcher.search(query, limit=limit):
                yield x

    def __iter__(self):
        for x in self.search(u"*", None):
            yield x["raw"]
Example #19
def test_classify(threshold=None,
                  niter=100,
                  limitmb=1024,
                  size=100,
                  verbosity=0):

    if threshold is None:
        threshold = settings.SPAM_THRESHOLD

    # Posts flagged as spam or deleted make up the spam set.
    spam = Post.objects.filter(Q(spam=Post.SPAM) | Q(status=Post.DELETED))

    # Get the valid (ham) posts.
    ham = Post.objects.valid_posts(author__profile__score__lte=0,
                                   type__in=[Post.ANSWER, Post.COMMENT])

    # Get list of id's for both
    spam = list(spam.values_list("id", flat=True))
    ham = list(ham.values_list("id", flat=True))

    # tp = spam identified correctly.
    # tn = valid post identified correctly.
    # fn = spam post missed (classified as valid).
    # fp = valid post misidentified as spam.
    tp, tn, fn, fp = 0, 0, 0, 0
    seen_ham, seen_spam = 0, 0
    elapsed, progress = util.timer_func()

    for i in range(niter):
        # Remove previous index
        if os.path.exists(TRAIN_DIR):
            shutil.rmtree(TRAIN_DIR)

        ix = search.init_index(
            dirname=TRAIN_DIR,
            indexname=f"train_{util.get_uuid(8)}_{settings.SPAM_INDEX_NAME}",
            schema=spam_schema())
        writer = BufferedWriter(ix,
                                limit=int((niter / 2) + 1),
                                writerargs=dict(limitmb=limitmb,
                                                multisegment=True))

        index_writer(writer=writer,
                     title="Placeholder",
                     content_length=0,
                     is_spam=True,
                     content='CONTENT',
                     uid=STARTER_UID)

        # Take one spam post out of training set.
        one_out = one_out_train(spam=spam, writer=writer, size=size, ham=ham)
        writer.commit()
        writer.close()
        post_score = compute_score(post=one_out, ix=ix)

        predicted_spam = post_score >= threshold
        is_spam = one_out.is_spam or one_out.is_deleted
        is_ham = not is_spam

        seen_spam += 1 if is_spam else 0
        seen_ham += 1 if is_ham else 0

        detail(is_spam=is_spam,
               predict=predicted_spam,
               post=one_out,
               verb=verbosity,
               post_score=post_score)

        if predicted_spam:
            tp += 1 if is_spam else 0
            fp += 1 if is_ham else 0

        else:
            fn += 1 if is_spam else 0
            tn += 1 if is_ham else 0

        progress(i, step=5, msg=f"iterations. tp={tp} fp={fp} tn={tn} fn={fn}")

    train_spam = sizer(spam, size)
    train_ham = sizer(ham, size)
    print(f"... {train_ham + train_spam}\tSize of index ( per iteration )")
    print(f"... \t{train_spam}\tSPAM")
    print(f"... \t{train_ham}\tHAM")
    print(f"\n... {niter}\tNumber of iterations")
    report(nham=seen_ham, nspam=seen_spam, tn=tn, tp=tp, fp=fp, fn=fn)

    return
Example #20
            key = f'{eid}:{locale}:tags'
            for tag in tags['values']:
                storage.lpush(key, tag)


if __name__ == '__main__':
    print('-' * 30)
    print('Muzeeglot data ingestion')
    print('-' * 30)
    if exists(configuration.INGESTION_LOCK):
        print('WARN: ingestion lock detected, pass')
    else:
        print('INFO: evaluate tags corpus')
        tags_corpus = get_tags_corpus()
        print('INFO: create search index')
        if not exists(configuration.INDEX):
            makedirs(configuration.INDEX)
        schema = Schema(ngram=NGRAMWORDS(), name=STORED(), eid=STORED())
        index = create_in(configuration.INDEX, schema)
        writer = BufferedWriter(index, period=60, limit=200)
        ingest_languages(writer)
        ingest_tags(tags_corpus)
        ingest_entities(tags_corpus, writer)
        print('INFO: optimize and close index')
        writer.close()
        index.optimize()
        index.close()
        print('INFO: write ingestion lock')
        with open(configuration.INGESTION_LOCK, 'w') as stream:
            stream.write('ingested')
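
Once ingestion finishes, the NGRAMWORDS field supports matching on partial names. A hedged sketch of the query side, reusing the directory and field names from the ingestion code above:

from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir(configuration.INDEX)
with ix.searcher() as searcher:
    query = QueryParser('ngram', ix.schema).parse('beatl')  # partial match
    for hit in searcher.search(query, limit=10):
        print(hit['name'], hit['eid'])  # both fields are STORED()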
Example #21
def update(self, note):
    writer = BufferedWriter(self.index, period=10, limit=10)
    writer.update_document(note_id=note.id, notebook_id=note.notebook_id,
                           title=note.title, snippet=note.snippet)
    writer.close()
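
Creating and closing a BufferedWriter inside every update() call forfeits the buffering: each call pays a full commit anyway. A sketch of an alternative that keeps one writer alive for the lifetime of the object; the class name is hypothetical, the arguments follow the example:

from whoosh.writing import BufferedWriter

class NoteIndex(object):
    def __init__(self, index):
        self.index = index
        self.writer = BufferedWriter(index, period=10, limit=10)

    def update(self, note):
        # Buffered: committed automatically every 10 docs or every 10 seconds
        self.writer.update_document(note_id=note.id,
                                    notebook_id=note.notebook_id,
                                    title=note.title,
                                    snippet=note.snippet)

    def close(self):
        self.writer.close()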
Example #22
        md = pd['metadata']
        directories = [x['path'] for x in md['contents'] if x['is_dir']]
        files = [x for x in md['contents'] if not x['is_dir']]
        for f in files:
            file_path = f['path']
            dir_part, ext = os.path.splitext(file_path.lower())
            if ext in ignore_extensions:
                print('Ignoring file', file_path)
                continue

            index_file(f, file_path, os.path.basename(file_path))

        for d in directories:
            index_path(d)

    writer = BufferedWriter(ix)
    def index_file(file_md, f, title):
        if not fnmatch.fnmatch(f, '*.*'):
            return

        indexed = False
        last_modified = None
        modified = None

        indexed_data = ds.get_document(id, f)

        if indexed_data:
            last_modified = indexed_data.get('modified')
        
        try:
            modified = parse_date(file_md['modified'])
            stale = (not last_modified or
                     (modified - last_modified).total_seconds() > 0)
Example #23
def delete(self, note_id):
    writer = BufferedWriter(self.index)
    writer.delete_by_term('note_id', note_id)
    writer.close()
Example #24
def delete_search_units(source_units, languages):
    '''
    Delete fulltext index for given set of units.
    '''
    # Update source index
    if source_units:
        index = get_source_index()
        writer = BufferedWriter(index)
        try:
            for pk in source_units:
                writer.delete_by_term('pk', pk)
        finally:
            writer.close()

    for lang, units in languages.items():
        if units:
            index = get_target_index(lang)
            writer = BufferedWriter(index)
            try:
                for pk in units:
                    writer.delete_by_term('pk', pk)
            finally:
                writer.close()
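
A hedged usage sketch for the function above: source_units is an iterable of primary keys for the source index, and languages maps each language code to the pks to delete from that language's target index. The values are illustrative:

delete_search_units(
    source_units=[42, 43],
    languages={'cs': [42], 'de': [42, 43]},
)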