Exemple #1
0
def update_index(units):
    """Update the fulltext index for the given unit queryset.

    Refreshes the shared source index, then one target index per
    language that actually has non-empty translations in *units*.
    """
    languages = Language.objects.have_translation()

    # Refresh the source index first.
    if units.exists():
        writer = BufferedWriter(get_source_index())
        try:
            for unit in units.iterator():
                update_source_unit_index(writer, unit)
        finally:
            writer.close()

    # Refresh each per-language target index.
    for language in languages:
        translated = units.filter(
            translation__language=language
        ).exclude(target='')

        if not translated.exists():
            continue

        writer = BufferedWriter(get_target_index(language.code))
        try:
            for unit in translated.iterator():
                update_target_unit_index(writer, unit)
        finally:
            writer.close()
Exemple #2
0
def update_index(units, source_units=None):
    """Update the fulltext index for the given sets of units.

    *units* drives the per-language target indices; *source_units*
    (defaulting to *units*) drives the shared source index.
    """
    # Default to the same set for both updates.
    if source_units is None:
        source_units = units

    languages = Language.objects.have_translation()

    # Refresh the source index.
    if source_units.exists():
        writer = BufferedWriter(get_source_index())
        try:
            for unit in source_units.iterator():
                update_source_unit_index(writer, unit)
        finally:
            writer.close()

    # Refresh the target index of every language with translations.
    for language in languages:
        translated = units.filter(
            translation__language=language
        ).exclude(target='')

        if not translated.exists():
            continue

        writer = BufferedWriter(get_target_index(language.code))
        try:
            for unit in translated.iterator():
                update_target_unit_index(writer, unit)
        finally:
            writer.close()
Exemple #3
0
    def handle(self, *args, **options):
        """Rebuild the fulltext indices for all iterated units.

        With ``options['clean']`` the source and per-language target
        indices are recreated from scratch first.
        """
        # Optionally rebuild indices from scratch
        if options['clean']:
            create_source_index()
            for lang in Language.objects.have_translation():
                create_target_index(lang=lang.code)

        # Open source writer eagerly; target writers are opened lazily,
        # one per language, as units for that language appear.
        source_writer = BufferedWriter(get_source_index())
        target_writers = {}

        try:
            # Process all units
            for unit in self.iterate_units(*args, **options):
                # Update target index. The truthiness guard must come
                # first: the original dereferenced
                # unit.translation.language.code before checking
                # unit.translation, making the check dead and opening a
                # writer even for units it was meant to skip.
                if unit.translation:
                    lang = unit.translation.language.code
                    # Lazy open writer
                    if lang not in target_writers:
                        target_writers[lang] = BufferedWriter(
                            get_target_index(lang))
                    update_target_unit_index(target_writers[lang], unit)
                # Update source index
                update_source_unit_index(source_writer, unit)

        finally:
            # Close all writers
            source_writer.close()
            for writer in target_writers.values():
                writer.close()
Exemple #4
0
    def daemon(self):
        """Watch ``self.directory`` for changes and index them.

        Runs until interrupted (Ctrl-C). On shutdown the filesystem
        observer is stopped and the buffered writer and index are
        closed.
        """
        ix = self.get_ix()
        # Flush buffered documents every 100 additions.
        writer = BufferedWriter(ix, limit=100)
        event_handler = IndexWriterEventHandler(
            writer,
            self,
            self.all,
            self.exclude,
            self.include,
        )
        observer = Observer()
        observer.schedule(event_handler, path=self.directory, recursive=True)
        observer.start()

        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            observer.stop()
        finally:
            # close() commits pending documents AND cancels the writer's
            # background flush timer; the original's bare commit() left
            # that timer thread running, and only closed ix on Ctrl-C.
            writer.close()
            ix.close()
        observer.join()
Exemple #5
0
 def source_writer(self, buffered=True):
     """Return a writer for the source index.

     With ``buffered`` (the default) a shared ``BufferedWriter`` is
     lazily created once and reused; otherwise a fresh plain writer
     is returned on every call.
     """
     if buffered:
         if self._source_writer is None:
             self._source_writer = BufferedWriter(self.source())
         return self._source_writer
     return self.source().writer()
Exemple #6
0
 def target_writer(self, lang, buffered=True):
     """Return a writer for the target index of *lang*.

     With ``buffered`` (the default) one shared ``BufferedWriter``
     per language is lazily created and cached; otherwise a fresh
     plain writer is returned.
     """
     if not buffered:
         return self.target(lang).writer()
     # idiomatic membership test (was the non-idiomatic ``not lang in``)
     if lang not in self._target_writer:
         self._target_writer[lang] = BufferedWriter(self.target(lang))
     return self._target_writer[lang]
Exemple #7
0
    def update_index(self, units):
        """Update fulltext index for given set of units.

        Each unit is a mapping carrying at least a ``'language'`` key.
        """
        # Update source index
        index = self.get_source_index()
        with BufferedWriter(index) as writer:
            for unit in units:
                self.update_source_unit_index(writer, unit)

        # Group units by language in a single pass instead of rescanning
        # the whole list once per language (was O(languages * units)).
        by_language = {}
        for unit in units:
            by_language.setdefault(unit['language'], []).append(unit)

        # Update per language indices
        for language, language_units in by_language.items():
            index = self.get_target_index(language)
            with BufferedWriter(index) as writer:
                for unit in language_units:
                    self.update_target_unit_index(writer, unit)
Exemple #8
0
    def update_index(self, units):
        """Update the fulltext index for the given unit queryset."""
        # Refresh the source index first.
        if units.exists():
            with BufferedWriter(self.get_source_index()) as writer:
                for unit in units.iterator():
                    self.update_source_unit_index(writer, unit)

        # Refresh the target index of every language with translations.
        for language in Language.objects.have_translation():
            translated = units.filter(
                translation__language=language
            ).exclude(target='')

            if not translated.exists():
                continue

            with BufferedWriter(self.get_target_index(language.code)) as writer:
                for unit in translated.iterator():
                    self.update_target_unit_index(writer, unit)
Exemple #9
0
 def __call__(self, pipeline):
     """Index every event from *pipeline*, flushing buffered docs on exit."""
     self._writer = BufferedWriter(self._indexer, period=10, limit=1000)
     add_document = self._writer.add_document
     try:
         self.flush_thread.start()
         for event in pipeline:
             self.count += 1
             add_document(
                 source=unicode(event["source"]),
                 name=unicode(event["index"]),
                 raw=unicode(event["_raw"]),
                 time=int(event.time),
                 hash=unicode(event.hash),
             )
     finally:
         self.flush()
Exemple #10
0
def delete_search_units(source_units, languages):
    """Delete fulltext index entries for the given sets of units."""

    def purge(index, pks):
        # Drop every listed primary key from the given index.
        writer = BufferedWriter(index)
        try:
            for pk in pks:
                writer.delete_by_term('pk', pk)
        finally:
            writer.close()

    # Source index
    if source_units:
        purge(get_source_index(), source_units)

    # Per-language target indices (purged even when the pk list is empty,
    # matching the original behavior of always opening the writer).
    for lang, units in languages.items():
        purge(get_target_index(lang), units)
def bootstrap_index(dirname=None, indexname=None):
    """Create the spam index and seed it with one placeholder post.

    When both *dirname* and *indexname* are given, a fresh index is
    created there; otherwise the default spam index is initialised.
    """
    if dirname and indexname:
        index = search.init_index(dirname=dirname,
                                  indexname=indexname,
                                  schema=spam_schema())
    else:
        index = init_spam_index()

    writer = BufferedWriter(index)
    # Seed the index with a single placeholder document.
    index_writer(writer=writer,
                 title="Placeholder",
                 content_length=0,
                 is_spam=True,
                 content='CONTENT',
                 uid=STARTER_UID)
    writer.commit()
    writer.close()
    return index
def test_20000_buffered():
    """Benchmark writing 20k documents through a BufferedWriter, then optimizing."""
    from whoosh.writing import BufferedWriter

    schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(schema, "20000buffered") as ix:
        words = ["alfa", "bravo", "charlie", "delta", "echo", "foxtrot",
                 "golf", "hotel", "india", "juliet", "kilo", "lima"]

        start = now()
        writer = BufferedWriter(ix, limit=100, period=None)
        for i in xrange(20000):
            writer.add_document(id=text_type(i),
                                text=u(" ").join(random.sample(words, 5)))
        writer.close()
        print("Write buffered:", now() - start)

        start = now()
        ix.optimize()
        print("Optimize buffered:", now() - start)
Exemple #13
0
def test_classify(threshold=None,
                  niter=100,
                  limitmb=1024,
                  size=100,
                  verbosity=0):
    """Evaluate the spam classifier with leave-one-out iterations.

    Runs *niter* iterations; each builds a fresh training index from
    samples of spam and ham posts with one post held out, scores the
    held-out post, and tallies a confusion matrix against *threshold*
    (defaulting to ``settings.SPAM_THRESHOLD``). Prints a summary and
    calls ``report`` at the end.

    :param threshold: score at or above which a post is predicted spam.
    :param niter: number of leave-one-out iterations.
    :param limitmb: memory limit (MB) passed through to the whoosh writer.
    :param size: training sample size (per class, via ``sizer``).
    :param verbosity: verbosity level forwarded to ``detail``.
    """
    if threshold is None:
        threshold = settings.SPAM_THRESHOLD

    # Posts flagged as spam OR deleted are treated as the spam class.
    spam = Post.objects.filter(Q(spam=Post.SPAM) | Q(status=Post.DELETED))

    # Ham class: valid low-score answers/comments.
    ham = Post.objects.valid_posts(author__profile__score__lte=0,
                                   type__in=[Post.ANSWER, Post.COMMENT])

    # Reduce both querysets to plain lists of post ids.
    spam = list(spam.values_list("id", flat=True))
    ham = list(ham.values_list("id", flat=True))

    # tp = Identify spam correctly.
    # tn = Identify valid post correctly.
    # fn = Missed to identify a spam.
    # fp = Mis-identified valid post as spam.
    tp, tn, fn, fp = 0, 0, 0, 0
    seen_ham, seen_spam = 0, 0
    elapsed, progress = util.timer_func()

    for i in range(niter):
        # Remove previous index so each iteration trains from scratch.
        if os.path.exists(TRAIN_DIR):
            shutil.rmtree(TRAIN_DIR)

        # Unique per-iteration index name (random uuid fragment).
        ix = search.init_index(
            dirname=TRAIN_DIR,
            indexname=f"train_{util.get_uuid(8)}_{settings.SPAM_INDEX_NAME}",
            schema=spam_schema())
        # NOTE(review): buffer flush limit scaled to niter/2 + 1 —
        # presumably tuned to batch the training writes; confirm intent.
        writer = BufferedWriter(ix,
                                limit=int((niter / 2) + 1),
                                writerargs=dict(limitmb=limitmb,
                                                multisegment=True))

        # Seed the index with a placeholder document.
        index_writer(writer=writer,
                     title="Placeholder",
                     content_length=0,
                     is_spam=True,
                     content='CONTENT',
                     uid=STARTER_UID)

        # Take one spam post out of training set; the rest is indexed.
        one_out = one_out_train(spam=spam, writer=writer, size=size, ham=ham)
        writer.commit()
        writer.close()
        # Score the held-out post against the freshly built index.
        post_score = compute_score(post=one_out, ix=ix)

        predicted_spam = post_score >= threshold
        is_spam = one_out.is_spam or one_out.is_deleted
        is_ham = not is_spam

        seen_spam += 1 if is_spam else 0
        seen_ham += 1 if is_ham else 0

        # Per-iteration diagnostic output (gated by verbosity).
        detail(is_spam=is_spam,
               predict=predicted_spam,
               post=one_out,
               verb=verbosity,
               post_score=post_score)

        # Tally the confusion matrix for this iteration.
        if predicted_spam:
            tp += 1 if is_spam else 0
            fp += 1 if is_ham else 0

        else:
            fn += 1 if is_spam else 0
            tn += 1 if is_ham else 0

        progress(i, step=5, msg=f"iterations. tp={tp} fp={fp} tn={tn} fn={fn}")

    # Summarize training-set sizes and hand the tallies to report().
    train_spam = sizer(spam, size)
    train_ham = sizer(ham, size)
    print(f"... {train_ham + train_spam}\tSize of index ( per iteration )")
    print(f"... \t{train_spam}\tSPAM")
    print(f"... \t{train_ham}\tHAM")
    print(f"\n... {niter}\tNumber of iterations")
    report(nham=seen_ham, nspam=seen_spam, tn=tn, tp=tp, fp=fp, fn=fn)

    return
 def delete(self, note_id):
     """Remove the note with *note_id* from the index."""
     writer = BufferedWriter(self.index)
     try:
         writer.delete_by_term('note_id', note_id)
     finally:
         # Was unconditional close AFTER the delete: a raising
         # delete_by_term leaked the writer (and its flush timer).
         writer.close()
 def update(self, note):
     """(Re)index *note*, replacing any existing document with its id."""
     writer = BufferedWriter(self.index, period=10, limit=10)
     try:
         writer.update_document(note_id=note.id,
                                notebook_id=note.notebook_id,
                                title=note.title,
                                snippet=note.snippet)
     finally:
         # Ensure the writer is closed even if update_document raises;
         # the original leaked it (and its background flush timer).
         writer.close()
Exemple #16
0
            key = f'{eid}:{locale}:tags'
            for tag in tags['values']:
                storage.lpush(key, tag)


if __name__ == '__main__':
    print('-' * 30)
    print('Muzeeglot data ingestion')
    print('-' * 30)
    if exists(configuration.INGESTION_LOCK):
        print('WARN: ingestion lock detected, pass')
    else:
        print('INFO: evaluate tags corpus')
        tags_corpus = get_tags_corpus()
        print('INFO: create search index')
        if not exists(configuration.INDEX):
            makedirs(configuration.INDEX)
        # N-gram search schema: name/eid are stored verbatim.
        index_schema = Schema(ngram=NGRAMWORDS(), name=STORED(), eid=STORED())
        search_index = create_in(configuration.INDEX, index_schema)
        # Buffered writes: flush every 60s or every 200 documents.
        buffered = BufferedWriter(search_index, period=60, limit=200)
        ingest_languages(buffered)
        ingest_tags(tags_corpus)
        ingest_entities(tags_corpus, buffered)
        print('INFO: optimize and close index')
        buffered.close()
        search_index.optimize()
        search_index.close()
        print('INFO: write ingestion lock')
        # Drop a lock file so a rerun skips re-ingestion.
        with open(configuration.INGESTION_LOCK, 'w') as lock_file:
            lock_file.write('ingested')