Beispiel #1
0
 def get_all_dirs_under_index_path(self):
     '''Return the absolute path to all the index folders found under
     settings.SEARCH_INDEX_PATH.

     The result of self.get_filtered_indexes() is passed as the filters
     (i.e. the optional --if command line option).'''
     from digipal.utils import get_all_files_under
     index_root = settings.SEARCH_INDEX_PATH
     index_filters = self.get_filtered_indexes()
     return get_all_files_under(index_root, filters=index_filters)
Beispiel #2
0
 def get_all_dirs_under_index_path(self):
     '''List the index folders below settings.SEARCH_INDEX_PATH.

     Returns whatever digipal.utils.get_all_files_under yields for that
     root (absolute paths to the index folders), restricted by the
     filters from self.get_filtered_indexes() (--if command line option).
     '''
     from digipal.utils import get_all_files_under
     search_kwargs = {'filters': self.get_filtered_indexes()}
     return get_all_files_under(settings.SEARCH_INDEX_PATH, **search_kwargs)
Beispiel #3
0
    def get_index_info(self, path):
        '''Collect filesystem and whoosh statistics for the index at path.

        Returns a dict with:
          'size'     - sum of the sizes of the files under path
          'date'     - latest modification time among those files (0 if none)
          'entries'  - searcher.doc_count(), or '?' if the index is not opened
          'segments' - one {'id', 'entries'} dict per index segment
          'fields'   - one dict per schema field (name, type, range,
                       unique_values)
          'values'   - sorted distinct terms of the field named by
                       self.options['field'] (only if that field exists)
          'results'  - output of self.whoosh_search() (only if
                       self.options['qs'] is set)

        If open_dir() raises EmptyIndexError only the filesystem info is
        filled in.
        '''
        from digipal.utils import get_all_files_under
        import whoosh
        from whoosh.index import open_dir

        ret = {'date': 0, 'size': 0, 'fields': [],
               'entries': '?', 'segments': []}

        # basic filesystem info
        for file_path in get_all_files_under(path, file_types='f'):
            ret['size'] += os.path.getsize(file_path)
            ret['date'] = max(ret['date'], os.path.getmtime(file_path))

        # whoosh info
        try:
            index = open_dir(path)
        except whoosh.index.EmptyIndexError:
            index = None

        query = self.options['qs']
        afield = self.options['field']

        if not index:
            return ret

        with index.searcher() as searcher:
            ret['entries'] = searcher.doc_count()
            ret['segments'].extend(
                {'id': segment.segid, 'entries': segment.doc_count()}
                for segment in index._segments()
            )

            for name, field in index.schema.items():
                type_name = field.__class__.__name__
                values = list(searcher.field_terms(name))

                # NOTE(review): for numeric date fields only values within
                # (-5000, 5000) count towards the range - presumably to
                # ignore sentinel values; confirm.
                values_filtered = values
                if type_name == 'NUMERIC' and 'date' in name:
                    values_filtered = [v for v in values if -5000 < v < 5000]
                # min()/max() need at least one value
                values_filtered = values_filtered or [0]

                bounds = (min(values_filtered), max(values_filtered))
                ret['fields'].append({
                    'name': name,
                    'type': type_name,
                    'range': [repr(bound)[0:12] for bound in bounds],
                    'unique_values': len(values),
                })

                if name == afield:
                    ret['values'] = sorted(set(values))

            if query:
                ret['results'] = self.whoosh_search(
                    query, searcher, index, {})

        return ret
Beispiel #4
0
 def md2cms(self):
     '''Push the digipal markdown documentation to CMS pages.

     First calls self.update_cms_page('doc', draft=True), then, for every
     .md file under doc.get_doc_root_path('digipal') (restricted by
     self.options['filter']), converts the file with doc.get_doc_from_md
     and passes its title and wrapped content to self.update_cms_page
     together with the 'doc' slug. Progress is printed to stdout.
     '''
     from digipal.views import doc

     doc_slug = 'doc'
     self.update_cms_page(doc_slug, draft=True)

     md_paths = utils.get_all_files_under(
         doc.get_doc_root_path('digipal'),
         file_types='f',
         filters=self.options['filter'],
         extensions='md',
         can_return_root=True
     )
     for path in md_paths:
         print(path)
         info = doc.get_doc_from_md(utils.read_file(path))
         if not info:
             # nothing could be extracted from this file
             continue
         content = u'<div class="mddoc">%s</div>' % info['content']
         page = self.update_cms_page(info['title'], content, doc_slug)
         if page:
             print('  => # %s (%s)' % (page.id, page.slug))
Beispiel #5
0
 def md2cms(self):
     '''Create/update one CMS page per markdown file of the digipal doc.

     The 'doc' page itself is updated first (as a draft). Each .md file
     found under doc.get_doc_root_path('digipal') and accepted by
     self.options['filter'] is read, converted by doc.get_doc_from_md and
     handed to self.update_cms_page; the id and slug of each returned page
     are printed.
     '''
     from digipal.views import doc

     doc_slug = 'doc'
     self.update_cms_page(doc_slug, draft=True)

     search_options = dict(
         file_types='f',
         filters=self.options['filter'],
         extensions='md',
         can_return_root=True
     )
     for path in utils.get_all_files_under(doc.get_doc_root_path('digipal'), **search_options):
         print(path)
         info = doc.get_doc_from_md(utils.read_file(path))
         if info:
             content = u'<div class="mddoc">%s</div>' % info['content']
             page = self.update_cms_page(info['title'], content, doc_slug)
         else:
             page = None
         if page:
             print('  => # %s (%s)' % (page.id, page.slug))
Beispiel #6
0
    def html2md(self):
        '''Convert HTML files into markdown files in the digipal doc folder.

        The source path is read from self.args[1]. Every .html/.htm file
        returned by utils.get_all_files_under for that path (restricted by
        self.options['filter']) is converted with doc.get_md_from_html and
        written to <doc root>/<slugified title>.md. Targets whose path
        contains 'confluence-workbox' are skipped. Each conversion and the
        entries of info['files'] are printed to stdout.

        Exits with status 1 if the path argument is missing.
        '''
        import sys

        if len(self.args) < 2:
            print('ERROR: missing path. Check help.')
            # sys.exit() rather than the site-provided exit(): the latter is
            # meant for the interactive shell and is missing under
            # `python -S`. A non-zero status reports the failure to the
            # caller.
            sys.exit(1)

        from digipal.views import doc
        from django.utils.text import slugify

        # Distinct name for the loop variable so that the root path argument
        # is not shadowed by the files found under it.
        root_path = self.args[1]
        html_paths = utils.get_all_files_under(
            root_path, file_types='f', filters=self.options['filter'],
            extensions=['html', 'htm'], can_return_root=True)

        for html_path in html_paths:
            info = doc.get_md_from_html(html_path)
            target = os.path.join(
                doc.get_doc_root_path('digipal'),
                slugify(info['title'])) + '.md'
            if 'confluence-workbox' in target:
                continue
            utils.write_file(target, info['md'])
            print('%s\n  => %s' % (html_path, target))
            for f in info['files']:
                print('   + %s' % f)
Beispiel #7
0
    def html2md(self):
        '''Convert HTML files into markdown files in the digipal doc folder.

        The source path is read from self.args[1]. Every .html/.htm file
        returned by utils.get_all_files_under for that path (restricted by
        self.options['filter']) is converted with doc.get_md_from_html and
        written to <doc root>/<slugified title>.md. Targets whose path
        contains 'confluence-workbox' are skipped. Each conversion and the
        entries of info['files'] are printed to stdout.

        Exits with status 1 if the path argument is missing.
        '''
        import sys

        if len(self.args) < 2:
            print('ERROR: missing path. Check help.')
            # sys.exit() rather than the site-provided exit(): the latter is
            # meant for the interactive shell and is missing under
            # `python -S`. A non-zero status reports the failure to the
            # caller.
            sys.exit(1)

        from digipal.views import doc
        from django.utils.text import slugify

        # Distinct name for the loop variable so that the root path argument
        # is not shadowed by the files found under it.
        root_path = self.args[1]
        html_paths = utils.get_all_files_under(
            root_path, file_types='f', filters=self.options['filter'],
            extensions=['html', 'htm'], can_return_root=True)

        for html_path in html_paths:
            info = doc.get_md_from_html(html_path)
            target = os.path.join(
                doc.get_doc_root_path('digipal'),
                slugify(info['title'])) + '.md'
            if 'confluence-workbox' in target:
                continue
            utils.write_file(target, info['md'])
            print('%s\n  => %s' % (html_path, target))
            for f in info['files']:
                print('   + %s' % f)
Beispiel #8
0
    context['running'] = context['indexing'] and context['indexing']['progress'] < 1.0
    now = datetime.now()
    if context['indexing'] and\
            (not context['running'] and ((now - context['indexing']['updated']).total_seconds() > (60 * 10))):
        context['indexing'] = None

    # read the index stats
    for ct in content_types:
        info = {'date': 0, 'size': 0}
        context['indexes'][ct.key] = {
            'object': ct,
            'info': info,
            'indexing': context['indexing']['indexes'].get(ct.key, None) if context['indexing'] else None,
        }

        for afile in get_all_files_under(ct.get_whoosh_index_path(), file_types='f'):
            info['size'] += os.path.getsize(afile)
            info['date'] = max(info['date'], os.path.getmtime(afile))

        info['date'] = datetime.fromtimestamp(info['date'])
        info['size'] = int(info['size'])

    context['title'] = 'Search Indexer'

    template = 'search/search_index.html'
    if request.is_ajax():
        template = 'search/search_index_fragment.html'

    ret = render_to_response(
        template, context, context_instance=RequestContext(request))
    return ret
Beispiel #9
0
def search_index_view(request):
    """Render the 'Search Indexer' page.

    On a POST with action == "reindex", the content types whose
    "select-<key>" field is set are passed to the dpsearch index_facets
    management command and the initial indexing state is placed in the
    context. Otherwise the state is read from the indexer. The context
    also receives, for each content type, the total size and the latest
    modification time of the files in its whoosh index folder.

    AJAX requests are rendered with the fragment template.
    """
    from datetime import datetime
    from digipal.utils import get_all_files_under
    from digipal.views.faceted_search import faceted_search
    from digipal.views.faceted_search.search_indexer import SearchIndexer

    # todo
    # DONE reindex selected indexes in background
    # . show when indexer is working
    # . lock form (if working)
    # . show last time indexer started (if working)
    # . ajaxify
    # . vue.js?

    context = {"indexes": SortedDict()}
    indexer = SearchIndexer()
    content_types = faceted_search.get_types(True)

    # process request
    if request.POST.get("action", "") == "reindex":
        reindexes = [
            ctype.key for ctype in content_types
            if request.POST.get("select-%s" % ctype.key)
        ]
        if reindexes:
            dputils.call_management_command(
                "dpsearch", "index_facets", **{"if": ",".join(reindexes)})
            context["indexing"] = indexer.get_state_initial(reindexes)
    if "indexing" not in context:
        context["indexing"] = indexer.read_state()

    indexing = context["indexing"]
    context["running"] = indexing and indexing["progress"] < 1.0

    # discard the state of a finished indexing that was last updated
    # more than ten minutes ago
    if indexing and not context["running"]:
        elapsed = (datetime.now() - indexing["updated"]).total_seconds()
        if elapsed > (60 * 10):
            indexing = None
            context["indexing"] = None

    # read the index stats
    for ct in content_types:
        stats = {"date": 0, "size": 0}
        ct_indexing = indexing["indexes"].get(ct.key, None) if indexing else None
        context["indexes"][ct.key] = {
            "object": ct,
            "info": stats,
            "indexing": ct_indexing,
        }

        for index_file in get_all_files_under(ct.get_whoosh_index_path(), file_types="f"):
            stats["size"] += os.path.getsize(index_file)
            stats["date"] = max(stats["date"], os.path.getmtime(index_file))

        stats["date"] = datetime.fromtimestamp(stats["date"])
        stats["size"] = int(stats["size"])

    context["title"] = "Search Indexer"

    if request.is_ajax():
        template = "search/search_index_fragment.html"
    else:
        template = "search/search_index.html"

    return render_to_response(template, context, context_instance=RequestContext(request))
Beispiel #10
0
    def get_index_info(self, path):
        '''Return a dict describing the index stored under path.

        Filesystem part: 'size' is the sum of the sizes of the files under
        path and 'date' the most recent modification time (0 if no file).

        Whoosh part (skipped if open_dir() raises EmptyIndexError):
        'entries' is the document count, 'segments' lists the id and
        document count of each segment and 'fields' describes each schema
        field (name, type, range, unique_values). 'values' is added for the
        field named by self.options['field'] and 'results' when
        self.options['qs'] holds a query.
        '''
        from digipal.utils import get_all_files_under
        import whoosh
        from whoosh.index import open_dir

        ret = {
            'date': 0,
            'size': 0,
            'fields': [],
            'entries': '?',
            'segments': []
        }

        # basic filesystem info
        for index_file in get_all_files_under(path, file_types='f'):
            ret['size'] += os.path.getsize(index_file)
            ret['date'] = max(ret['date'], os.path.getmtime(index_file))

        # whoosh info
        try:
            index = open_dir(path)
        except whoosh.index.EmptyIndexError:
            index = None

        query = self.options['qs']
        afield = self.options['field']

        if not index:
            return ret

        with index.searcher() as searcher:
            ret['entries'] = searcher.doc_count()

            for segment in index._segments():
                segment_info = {
                    'id': segment.segid,
                    'entries': segment.doc_count()
                }
                ret['segments'].append(segment_info)

            for field_name, field in index.schema.items():
                field_type = field.__class__.__name__
                is_numeric_date = (
                    field_type == 'NUMERIC' and 'date' in field_name
                )

                values = list(searcher.field_terms(field_name))
                if is_numeric_date:
                    # NOTE(review): values outside (-5000, 5000) are left
                    # out of the range - presumably sentinel values;
                    # confirm.
                    in_range = [v for v in values if -5000 < v < 5000]
                else:
                    in_range = values
                if not in_range:
                    # min()/max() need at least one value
                    in_range = [0]

                lowest, highest = min(in_range), max(in_range)
                ret['fields'].append({
                    'name': field_name,
                    'type': field_type,
                    'range': [repr(lowest)[0:12], repr(highest)[0:12]],
                    'unique_values': len(values),
                })

                if field_name == afield:
                    ret['values'] = sorted(set(values))

            if query:
                ret['results'] = self.whoosh_search(
                    query, searcher, index, {})

        return ret
Beispiel #11
0
        context['indexing'] = None

    # read the index stats
    for ct in content_types:
        info = {'date': 0, 'size': 0}
        context['indexes'][ct.key] = {
            'object':
            ct,
            'info':
            info,
            'indexing':
            context['indexing']['indexes'].get(ct.key, None)
            if context['indexing'] else None,
        }

        for afile in get_all_files_under(ct.get_whoosh_index_path(),
                                         file_types='f'):
            info['size'] += os.path.getsize(afile)
            info['date'] = max(info['date'], os.path.getmtime(afile))

        info['date'] = datetime.fromtimestamp(info['date'])
        info['size'] = int(info['size'])

    context['title'] = 'Search Indexer'

    template = 'search/search_index.html'
    if request.is_ajax():
        template = 'search/search_index_fragment.html'

    ret = render_to_response(template,
                             context,
                             context_instance=RequestContext(request))