                       action='store_true',
                       help='lookup ranking of vulnerable configuration')
args = argParser.parse_args()

if not args.q and not args.l and not args.g and not args.m:
    argParser.print_help()
    exit(1)

if args.f or args.t:
    from lib import CVEs
    cves = CVEs.last(rankinglookup=args.r, namelookup=args.n)

if args.q:
    with ix.searcher() as searcher:
        if not args.o:
            query = QueryParser("content", ix.schema).parse(" ".join(args.q))
        else:
            query = QueryParser("content", schema=ix.schema,
                                group=qparser.OrGroup).parse(" ".join(args.q))
        results = searcher.search(query, limit=None)
        for x in results:
            if not args.f:
                print(x['path'])
            else:
                print(json.dumps(cves.getcve(x['path']), sort_keys=True,
                                 default=json_util.default))

if args.t and not args.f:
def get_more_search_result():
    query = request.form['query']
    q = []
    q.append(query)
    page_offset = int(request.form['page_offset'])
    index_name = request.form['index_name']
    num_elem_to_get = 50

    # select correct index
    if index_name is None or index_name == "0":
        selected_index = get_current_index()
    else:
        selected_index = os.path.join(baseindexpath, index_name)

    path_array = []
    preview_array = []
    date_array = []
    size_array = []
    list_tags = []

    schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)

    ix = index.open_dir(selected_index)
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(" ".join(q))
        results = searcher.search_page(query, page_offset, num_elem_to_get)
        for x in results:
            path = x.items()[0][1]
            path = path.replace(PASTES_FOLDER, '', 1)
            path_array.append(path)
            paste = Paste.Paste(path)
            content = paste.get_p_content()
            content_range = max_preview_char if len(content) > max_preview_char else len(content) - 1
            preview_array.append(content[0:content_range])
            curr_date = str(paste._get_p_date())
            curr_date = curr_date[0:4] + '/' + curr_date[4:6] + '/' + curr_date[6:]
            date_array.append(curr_date)
            size_array.append(paste._get_p_size())
            p_tags = r_serv_metadata.smembers('tag:' + path)
            l_tags = []
            for tag in p_tags:
                complete_tag = tag
                tag = tag.split('=')
                if len(tag) > 1:
                    if tag[1] != '':
                        tag = tag[1][1:-1]
                    # no value
                    else:
                        tag = tag[0][1:-1]
                # use for custom tags
                else:
                    tag = tag[0]
                l_tags.append((tag, complete_tag))
            list_tags.append(l_tags)

    to_return = {}
    to_return["path_array"] = path_array
    to_return["preview_array"] = preview_array
    to_return["date_array"] = date_array
    to_return["size_array"] = size_array
    to_return["list_tags"] = list_tags
    to_return["bootstrap_label"] = bootstrap_label
    if len(path_array) < num_elem_to_get:  # pagelength
        to_return["moreData"] = False
    else:
        to_return["moreData"] = True

    return jsonify(to_return)
    def search(self,
               given_query='',  # search function
               in_query=[''],
               ex_query=[''],
               diets=[],
               allergies=[],
               page=1,
               ranking="BM25"):
        # These are only for parsing not for filling the results
        keys = ['name', 'ingredients', 'cautions', 'dietLabels', 'healthLabels']
        try:  # open the index
            index = open_dir('WhooshIndex')
        except Exception:
            self.index()  # make the index if it doesn't exist
            index = open_dir('WhooshIndex')

        if ranking == "TF-IDF":  # set the ranking algorithm
            ranking = scoring.TF_IDF()
        else:
            ranking = scoring.BM25F()

        with index.searcher(weighting=ranking) as searcher:
            # Universal all docs in case of None
            # because in the intersection the smaller
            # result will be returned
            parser = QueryParser('url', schema=index.schema)
            q = parser.parse('http OR https')
            all_docs = searcher.search(q, limit=None)

            # Creates an empty result for a filter and mask
            p = QueryParser('id', schema=index.schema)
            q = p.parse('')
            myMask = searcher.search(q, limit=None)
            myFilter = searcher.search(q, limit=None)

            # include query parsing
            if in_query != ['']:
                in_parser = QueryParser('ingredients', schema=index.schema)
                inFilter = searcher.search(q, limit=None)
                in_q = in_parser.parse(in_query[0])  # get the first ingredient...
                in_r = searcher.search(in_q, limit=None)
                inFilter.extend(in_r)
                for q in in_query:
                    # take the intersection of remaining docs with docs containing the next ingredient
                    in_q = in_parser.parse(q)
                    in_r = searcher.search(in_q, limit=None)
                    inFilter.filter(in_r)
                myFilter.extend(inFilter)

            # exclude query parsing
            if ex_query != ['']:
                ex_parser = QueryParser('ingredients', schema=index.schema)
                for q in ex_query:
                    ex_q = ex_parser.parse(q)
                    ex_r = searcher.search(ex_q, limit=None)
                    myMask.extend(ex_r)  # list of docs to mask

            # allergies query parsing
            if allergies != []:
                allergy_parser = QueryParser('cautions', schema=index.schema)
                for q in allergies:
                    allergy_q = allergy_parser.parse(q)
                    allergy_r = searcher.search(allergy_q, limit=None)
                    myMask.extend(allergy_r)  # list of docs to mask

            # diets query parsing
            if diets != []:
                p = QueryParser('id', schema=index.schema)
                q = p.parse('')
                dietFilter = searcher.search(q, limit=None)
                diet_parser = QueryParser('dietInfo', schema=index.schema)
                diet_q = diet_parser.parse(diets[0])
                diet_r = searcher.search(diet_q, limit=None)  # get the first diet
                dietFilter.extend(diet_r)
                for d in diets:
                    # take the intersection of what's already in the filter and the new docs to filter by
                    diet_q = diet_parser.parse(d)
                    diet_r = searcher.search(diet_q, limit=None)
                    dietFilter.filter(diet_r)
                if in_query == ['']:
                    # if we had no ingredients filter, let the filter be the diet filter
                    myFilter.extend(dietFilter)
                else:
                    # otherwise the filter is the intersection of our two filters
                    myFilter.filter(dietFilter)

            # filtering results to get intersection
            # print(type(results))
            # Check if the filter is empty so we don't intersect nothing
            if diets == [] and in_query == ['']:
                myFilter = all_docs
            elif myFilter.scored_length() == 0:
                # if we filtered and got nothing, we should return nothing
                payload = {}
                payload_entries = list()
                payload['entries'] = payload_entries
                payload['total'] = 0
                return payload

            if given_query != '' and given_query != None:  # the actual search
                if given_query[0] == '"' and given_query[-1] == '"':
                    given_query = given_query[1:-1]
                    parser = MultifieldParser(keys, schema=index.schema)
                else:
                    parser = MultifieldParser(keys, schema=index.schema, group=OrGroup)
                query = parser.parse(given_query)
                results = searcher.search_page(query, page, filter=myFilter, mask=myMask)
            else:
                # if we aren't given a query for the search, filter and mask all docs
                parser = QueryParser('url', schema=index.schema)
                q = parser.parse('http OR https')
                results = searcher.search_page(q, page, filter=myFilter, mask=myMask)

            # Format results for returning
            payload = {}
            payload_entries = list()
            for x in results:
                payload_entries.append({
                    'name': x['name'],
                    'image': x['image'],
                    'id': x['id']
                })
            payload['entries'] = payload_entries
            payload['total'] = len(results)
            return payload
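# The include/exclude logic above leans on Whoosh's Results.extend and
# Results.filter, which behave roughly like set union and intersection over
# hit lists. A minimal, self-contained sketch of that behaviour, assuming a
# throwaway in-memory index with made-up data (the field names below are
# illustrative, not this project's real schema):
from whoosh.fields import Schema, TEXT, ID
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

toy_schema = Schema(id=ID(stored=True), ingredients=TEXT(stored=True))
toy_ix = RamStorage().create_index(toy_schema)
toy_writer = toy_ix.writer()
toy_writer.add_document(id=u"1", ingredients=u"egg flour sugar")
toy_writer.add_document(id=u"2", ingredients=u"egg milk")
toy_writer.add_document(id=u"3", ingredients=u"flour water")
toy_writer.commit()

with toy_ix.searcher() as toy_searcher:
    toy_parser = QueryParser("ingredients", schema=toy_ix.schema)
    egg_hits = toy_searcher.search(toy_parser.parse(u"egg"), limit=None)      # docs 1, 2
    flour_hits = toy_searcher.search(toy_parser.parse(u"flour"), limit=None)  # docs 1, 3

    # extend() acts like a union: egg_hits now also contains doc 3
    egg_hits.extend(flour_hits)

    # filter() acts like an intersection: only hits also present in flour_hits survive
    both = toy_searcher.search(toy_parser.parse(u"egg"), limit=None)
    both.filter(flour_hits)
    print([hit["id"] for hit in both])  # expected: ['1'] (egg AND flour)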
def filter_queryset(self, request, queryset, view):
    if ('parent' in request.query_params and
            request.query_params['parent'] == ''):
        # Empty string means query for null parent
        queryset = queryset.filter(parent=None)

    try:
        q = request.query_params['q']
    except KeyError:
        return queryset

    # Short-circuit some commonly used queries
    COMMON_QUERY_TO_ORM_FILTER = {
        'asset_type:block': {'asset_type': 'block'},
        'asset_type:question': {'asset_type': 'question'},
        'asset_type:survey': {'asset_type': 'survey'},
        'asset_type:question OR asset_type:block': {
            'asset_type__in': ('question', 'block')
        }
    }
    try:
        return queryset.filter(**COMMON_QUERY_TO_ORM_FILTER[q])
    except KeyError:
        # We don't know how to short-circuit this query; pass it along to
        # the search engine
        pass
    except FieldError:
        # The user passed a query we recognized as commonly-used, but the
        # field was invalid for the requested model
        return queryset.none()

    queryset_pks = list(queryset.values_list('pk', flat=True))
    if not len(queryset_pks):
        return queryset

    # 'q' means do a full-text search of the document fields, where the
    # criteria are given in the Whoosh query language:
    # https://pythonhosted.org/Whoosh/querylang.html
    search_queryset = SearchQuerySet().models(queryset.model)
    search_backend = search_queryset.query.backend
    if not isinstance(search_backend, WhooshSearchBackend):
        raise NotImplementedError(
            'Only the Whoosh search engine is supported at this time')
    if not search_backend.setup_complete:
        search_backend.setup()

    # Parse the user's query
    user_query = QueryParser('text', search_backend.index.schema).parse(q)

    # Construct a query to restrict the search to the appropriate model
    filter_query = Term(DJANGO_CT, get_model_ct(queryset.model))

    # Does the search index for this model have a field that allows
    # filtering by permissions?
    haystack_index = haystack.connections[
        'default'].get_unified_index().get_index(queryset.model)
    if hasattr(haystack_index, 'users_granted_permission'):
        # Also restrict the search to records that the user can access
        filter_query &= Term(
            'users_granted_permission', request.user.username)

    with search_backend.index.searcher() as searcher:
        results = searcher.search(
            user_query,
            filter=filter_query,
            scored=False,
            sortedby=None,
            limit=None
        )
        if not results:
            # We got nothing; is the search index even valid?
            if not searcher.search(filter_query, limit=1):
                # There's not a single entry in the search index for this
                # model; assume the index is invalid and return the
                # queryset untouched
                return queryset
        pk_type = type(queryset_pks[0])
        results_pks = {
            # Coerce each `django_id` from unicode to the appropriate type,
            # usually `int`
            pk_type((x['django_id'])) for x in results
        }
    filter_pks = results_pks.intersection(queryset_pks)
    return queryset.filter(pk__in=filter_pks)
results = []
resSetLocal = set()
resSetTotal = set()

if len(fields) > 0 and len(fields) == len(myquery):
    i = 0
    while i < len(fields):
        # e.g. a field of the form: publication.title
        if "." in fields[i]:
            if fields[i].split(".")[0] not in type and not fields[i].split(".")[0] in schemaFields:
                type.append(fields[i].split(".")[0])
            # if myquery[i][0] == "\"":
            #     qparser = QueryParser(fields[i].split(".")[1], schema=ix.schema)
            # else:
            #     qparser = QueryParser(fields[i].split(".")[1], schema=ix.schema, group=whoosh.qparser.OrGroup)
            qparser = QueryParser(fields[i].split(".")[1], schema=ix.schema)
            query = qparser.parse(myquery[i])
            resTmp = searcher.search(query, limit=resultLimiter)
            runtime = resTmp.runtime
            if len(resSetTotal) == 0:
                for res in resTmp:
                    el = Hit(res)
                    resSetTotal.add(el)
            else:
                resSetLocal = set()
                for res in resTmp:
                    el = Hit(res)
                    resSetLocal.add(el)
            if len(resSetTotal) > 0 and len(resSetLocal) > 0:
                set1 = set(x.dic["key"] for x in resSetTotal)
                set2 = set(x.dic["key"] for x in resSetLocal)
def resolveQuery(query):
    # Add definition here
    # text = query + "IT WORKS LOLOLOLOL!"
    # Replace with suitable query -> Value
    post = query
    similar = []
    gg = [i for i in Complete]
    # print(Text)
    for i in range(0, len(gg)):
        try:
            name = gg[i]
            X1 = Complete[gg[i]]
            # print(Text)
            vectorizer = Vectorizers[gg[i]]
            new_post_vec1 = vectorizer.transform([post])
            for j in range(0, 10):
                try:
                    dist = sp.linalg.norm((new_post_vec1 - X1[j]).toarray())
                    # print(dist)
                    # print(j)
                    Text[i][j].replace('\\n', '<br>')
                    Text[i][j].replace('\n', '<br>')
                    similar.append((dist, name, Text[i][j]))
                    print(dist)
                except Exception as e:
                    print(e)
                    break
        except Exception as e:
            print(e)

    similar = sorted(similar)
    gg = list()
    with index.searcher() as searcher:
        query = QueryParser("content", index.schema).parse(u'Holding Pattern')
        results = searcher.search(query)
        for result in results:
            print(result)
            f = dict(result)
            gg.append(
                dict({
                    'result-text':
                    ''.join(re.escape(f['content']).split('\\')).replace(
                        '\n', '<br>').replace('\t', ' '),
                    'result-image': "asd",
                    'result-doc-link': 'google.com',
                    'result-doc-name': f['filename'],
                    'result-modified-date': '01-2-2019',
                    'result-id': "11"
                }))
    pgk = gg
    # print(similar)
    gg = list()
    for i in range(0, len(similar)):
        gg.append(
            dict({
                'result-text':
                ''.join(re.escape(similar[i][-1]).split('\\')).replace(
                    '\n', '<br>').replace('\t', ' '),
                'result-image': "asd",
                'result-doc-link': 'google.com',
                'result-doc-name': similar[i][1],
                'result-modified-date': '01-2-2019',
                'result-id': "123"
            }))
    # print(gg)
    gg = pgk + gg
    return gg
                content=TEXT(stored=True, analyzer=ChineseAnalyzer()))

print("start loading content...")
if not os.path.exists("index2"):
    os.mkdir("index2")
    ix = create_in("index2", schema)
    read_corpus_and_process()
else:
    ix = open_dir("index2")

print('start loading query')
query_idx, query_list = read_query_and_process()

print("start 1 searching ...")
parser = QueryParser("content", ix.schema, group=qparser.OrGroup)
nparser = QueryParser("content", ix.schema, group=qparser.NotGroup)
rank_list = [[] for i in range(len(query_list))]
content_list = [[] for i in range(len(query_list))]
with ix.searcher(weighting=scoring.BM25F(B=0.9, K1=1.2)) as searcher:
    for i in range(len(query_list)):
        query = parser.parse(query_list[i])
        neg_query = parser.parse(' '.join(neg_words))
        if any(word in query_list[i] for word in neg_words):
            query = query | neg_query
        else:
            for word in neg_words:
                query = query | nparser.parse(word)
        print(query)
        results = searcher.search(query, limit=300)
        for hit in results:
def search(domain):
    """ Search your indexed website. """
    # Look for index for requested domain
    try:
        ix = index.open_dir(
            os.path.expanduser('~/.sitesearcher/index'), indexname=domain)
    except (EmptyIndexError, OSError):
        click.echo("""
            No index was found for domain {0}.
            Use "sitesearcher indexer {0}" to create one and try again.
            """.format(domain), err=True)
        return
    searchterm = click.prompt('Please enter your search')
    parser = QueryParser("content", schema=ix.schema)
    with ix.searcher() as searcher:
        pagenum = 1
        # Paging for search results
        while pagenum > 0:
            results = searcher.search_page(parser.parse(searchterm), pagenum)
            results.results.formatter = ConsoleFormatter()
            if results.results.is_empty():
                click.echo("No results found!")
            else:
                click.echo("Search results:")
                click.echo()
            # Output all results for current page in nice readable format
            for result in results:
                click.echo("Result #{}".format(result.rank + 1))
                click.echo("URL: {}".format(result['url']))
                # As the site content is not stored locally
                # send a request to get the content of the search result url
                request = Request(
                    result['url'],
                    headers={'User-Agent': get_user_agent()}
                )
                response = urlopen(request).read()
                click.echo("Extract:")
                content = clean_response_body(response)
                # Provide console color highlighting for result
                snippet = result.highlights("content", text=content)
                snippet_parts = re.split(
                    '(<searchphrase>.*?</searchphrase>)',
                    snippet,
                    re.DOTALL
                )
                for snippet_part in snippet_parts:
                    if snippet_part.startswith('<searchphrase>'):
                        searchphrase = re.search(
                            '<searchphrase>(.*?)</searchphrase>',
                            snippet_part,
                            re.DOTALL
                        ).group(1)
                        click.echo(
                            click.style(searchphrase, fg='blue', bold=True),
                            nl=False
                        )
                    else:
                        click.echo(snippet_part, nl=False)
                click.echo('\n')
            # Handle pagination
            if results.pagenum < results.pagecount:
                click.echo(
                    'Press any key to see next result page or <ESC> to abort',
                    nl=False
                )
                char = click.getchar()
                click.echo()
                if char != u'\x1b':
                    pagenum += 1
                    continue
            pagenum = -1
    def search(self, keyword, limit=50):
        with self.index.searcher(closereader=False) as searcher:
            query = QueryParser("content", self.index.schema).parse(keyword)
            results = searcher.search(query, limit=limit)
            return results
if args.t:
    xr = ix.searcher().reader()
    for x in xr.most_frequent_terms("content", number=500, prefix=''):
        print(x)
    exit(0)

if args.s:
    # By default, the index is not storing the vector of the document (Whoosh
    # document schema). It won't work if you don't change the schema of the
    # index for the content. It depends on your storage strategy.
    docnum = ix.searcher().document_number(path=args.s)
    r = ix.searcher().more_like(docnum, "content")
    for hit in r:
        print(hit["path"])
    exit(0)

if args.q is None:
    argParser.print_help()
    exit(1)

with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse(" ".join(args.q))
    results = searcher.search(query, limit=None)
    for x in results:
        if args.f:
            print(readdoc(path=x.items()[0][1]))
        else:
            print(x.items()[0][1])
    print
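# As the comment above notes, more_like() needs term vectors or the stored
# text of the "content" field. A sketch of a schema that would support it;
# this is an assumption for illustration, not the schema this tool actually
# uses:
from whoosh import formats
from whoosh.fields import Schema, ID, TEXT

mlt_schema = Schema(
    path=ID(stored=True, unique=True),
    # Term vectors (or stored=True text) are what let Searcher.more_like()
    # extract representative terms from the reference document.
    content=TEXT(stored=True, vector=formats.Positions()),
)

# If the text is neither stored nor vectored, the raw text can instead be
# passed explicitly at query time:
#   ix.searcher().more_like(docnum, "content", text=raw_document_text)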
def find_unused_templates():
    start = time.perf_counter()
    print('Finding all unused templates...')
    print(' Getting global templates...')
    global_templates_files, global_templates = find_global_templates()
    print(' Done.\n Getting app templates...')
    app_templates_files, app_templates = find_app_templates()
    print(' Done.')
    templates = global_templates + app_templates
    template_files = global_templates_files + app_templates_files
    # templates.sort()
    template_files.sort()
    print(' Getting python files...')
    py_files, pys = find_py_files()
    print(' Done.')
    all_files = py_files + template_files
    tl_count = [0 for t in templates]
    unused_templates = []

    print(' Creating Index', end='')
    tmp_dir = TemporaryDirectory()
    schema = Schema(
        title=TEXT(stored=True),
        path=ID(stored=True),
        content=TEXT(analyzer=RegexTokenizer(expression=rcompile(r"[\w/.]+"))))
    ix = create_in(tmp_dir.name, schema)
    writer = ix.writer()
    for filename in all_files:
        print('.', end='', flush=True)
        with open(filename, 'r') as f:
            writer.add_document(title=filename,
                                path=filename,
                                content='\n'.join(f.readlines()))
    print('', flush=True)
    writer.commit()
    print(' Done.')

    print(' Searching through templates for references', end='', flush=True)
    with ix.searcher() as searcher:
        for count, template in enumerate(templates):
            print('.', end="", flush=True)
            query = QueryParser("content", ix.schema).parse(template)
            results = searcher.search(query)
            if len(results) < 1:
                unused_templates.append(template)
    print('', flush=True)
    print(' Done.')

    if not unused_templates:
        print('No unused templates found.')
    else:
        print('\nUnused templates:')
        for template in unused_templates:
            print(template)

    end = time.perf_counter()
    print('Finished in ' + str(end - start) + ' seconds.')
    return unused_templates
writer.commit()

#############################################################################

""" Perform queries on Whoosh indexed data """

# Write a python program that takes queries (you need to design the supported queries)
# and search through the indexed archive using whoosh. A sample query to the program
# can be: RT:yes, keywords returns all the retweets that are related to the keywords.
# Your program should handle at least 4 queries (of your choice) similar to the sample query.

from whoosh.query import Term, And, Or
from whoosh.qparser import QueryParser

searcher = index.searcher()
parser = QueryParser("strong_hashtags", index.schema)
parser.parse("FIFAWWC USA JPN")

# Query 1: Player search
query = And([Term("tweet_text", "tobin"), Term("tweet_text", "heath")])
results = searcher.search(query)
print('# of hits:', len(results))
print('Best Match:', results[0])

# Query 2: Player search
query = And([Term("tweet_text", "alex"), Term("tweet_text", "morgan")])
results = searcher.search(query)
print('# of hits:', len(results))
print('Best Match:', results[0])
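# A possible sketch for the "RT:yes, keywords" style query described above,
# reusing the searcher and the Term/And/Or imports from this snippet. The
# "is_retweet" field name is hypothetical and would have to match whatever
# the indexing schema actually calls that flag:
keywords = ["fifawwc", "usa"]
rt_query = And([
    Term("is_retweet", "yes"),
    Or([Term("tweet_text", kw) for kw in keywords]),
])
rt_results = searcher.search(rt_query)
print('# of hits:', len(rt_results))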
    def __writeDoc2(self, ix, path, occurrences):
        """
        Writes single documents into the Index.

        Receives the path to a single question directory and grabs the
        necessary pdf, LaTeX and python files to add to the Index. Also checks
        to see if the files are already in the Index before adding them.

        Parameters
        ----------
        arg1 : Index
            Index to add the documents to.
        arg2 : string
            Path to the question directory.
        arg3 : int
            Initial number of occurrences for the file.

        Returns
        -------
        bool
            Value of the operation.
        """
        writer = ix.writer()
        dateNow = dt.now()

        pathPdf = path + "true_or_false_question.pdf"
        textPdf = self.__decodePdf(pathPdf)

        pathPy = path + "program.py"
        textPy = self.__decodePy(pathPy)

        pathTex = path + "true_or_false_question.tex"
        textTex = self.__decodeTex(pathTex)

        finalPdfText = analyzeText(textPdf)
        # print(finalPdfText)

        flag = False
        with ix.searcher() as searcher:
            query = QueryParser("content", schema=ix.schema)
            parse = query.parse(finalPdfText)
            result = searcher.search(parse)
            flag = True
            if not result.is_empty():
                prevOc = result[0]["nOccurrences"]
                prevPath = result[0]["path"]
                prevDate = result[0]["date"]
                prevTags = result[0]["tags"]
                writer.update_document(
                    path=prevPath,
                    content=finalPdfText,
                    date=prevDate,
                    tags=prevTags,
                    nOccurrences=prevOc + 1,
                )
            else:
                writer.add_document(
                    path=pathPdf,
                    content=finalPdfText,
                    date=dateNow,
                    tags="pdf",
                    nOccurrences=1,
                )
                if textPy is not None:
                    writer.add_document(path=pathPy,
                                        content=analyzeText(textPy),
                                        date=dateNow,
                                        tags="py")
                if textPy is not None:
                    textTex += textPy
                writer.add_document(path=pathTex,
                                    content=analyzeText(textTex),
                                    date=dateNow,
                                    tags="tex")
        writer.commit()
        return flag
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 30 18:04:51 2014

@author: dlmu__000
"""
from whoosh.index import create_in
from whoosh.fields import *

schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
indexdir = "."
ix = create_in(indexdir, schema)
writer = ix.writer()
writer.add_document(title=u"First document", path=u"/a",
                    content=u"This is the first document we've added!")
writer.add_document(title=u"Second document", path=u"/b",
                    content=u"The second one is even more interesting!")
writer.commit()

from whoosh.qparser import QueryParser
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("first")
    results = searcher.search(query)
    print results[0]
from whoosh.qparser import QueryParser
from whoosh import index, sorting, scoring
from whoosh import qparser, query
from config import SEARCH_INDEX_DIR
import math
import unittest
from datausa.attrs.search import do_search

ix = index.open_dir(SEARCH_INDEX_DIR)
qp = QueryParser("name", schema=ix.schema, group=qparser.OrGroup)
facet = sorting.FieldFacet("zvalue", reverse=True)
scores = sorting.ScoreFacet()


class TestStringMethods(unittest.TestCase):
    NY_IDS = ['31000US35620', '05000US36061', '04000US36', '16000US3651000']

    def test_extra_word(self):
        data, suggs, tries, my_vars = do_search("new york economy")
        self.assertTrue(data[0][0] in self.NY_IDS)

    def test_manhattan(self):
        data, suggs, tries, my_vars = do_search("manhattan")
        self.assertEqual(data[0][0], "05000US36061")

    def test_exact_match_begin(self):
        data, suggs, tries, my_vars = do_search("nome")
        self.assertEqual(data[0][0], '16000US0254920')

    def test_ny(self):
        data, suggs, tries, my_vars = do_search("new york")
def get_des_vector():
    # nltk.download('stopwords')
    # nltk.download('punkt')
    # nltk.download('averaged_perceptron_tagger')
    db = pymysql.connect(host="localhost", user="******", passwd="123456", db="project")
    cursor = db.cursor()
    cursor.execute("SELECT name,description,catalog from detail d")
    data = cursor.fetchall()
    # The fetched data must be converted to a list here, otherwise DataFrame
    # raises an error during initialization.
    data = list(data)
    data = [list(i) for i in data]
    df = DataFrame(data, columns=["A", "B", "C"])
    # print(df)
    df1 = df[['B']]
    # print(str(df))
    # print(df)
    doclist = df1.values
    # print(doclist)
    tempdoclist = []
    mydoclist = []
    # r = '[http]{4}\\:\\/\\/([a-zA-Z]|[0-9])*(\\.([a-zA-Z]|[0-9])*)*(\\/([a-zA-Z]|[0-9])*)*\\s?'
    # tags = set(['NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'RP', 'RB', 'RBR', 'RBS', 'JJ', 'JJR', 'JJS'])
    # for index in range(len(doclist)):
    #     text = str(doclist[index])
    #     text = ' '.join([word for word in text.split()])
    #     text = re.sub(r, ' ', text)
    #     words = nltk.word_tokenize(text)
    #     pos_tags = nltk.pos_tag(words)
    #     ret = ' '.join([word for word, pos in pos_tags if pos in tags])
    #     # print(ret)
    #     tempdoclist.append(ret)
    #
    # r1 = '[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、::;;~@#¥%……&*()0123456789]+'
    # cachedStopWords = stopwords.words("english")
    # for index in range(len(tempdoclist)):
    #     text = str(tempdoclist[index])
    #     text = ' '.join([word for word in text.split()])
    #     text = re.sub(r1, ' ', text)
    #     text = ' '.join([word for word in text.split() if word not in cachedStopWords])
    #     # print(doclist[index])
    #     # print(text)
    #     # mydoclist.extend(doclist[index])
    #     mydoclist.append(text)
    for i in range(len(doclist)):
        mydoclist.extend(doclist[i])
    print(len(mydoclist))
    print(mydoclist)

    df2 = df[['A']]
    namelist = df2.values
    mynamelist = []
    for i in range(len(namelist)):
        mynamelist.extend(namelist[i])
    print(len(mynamelist))
    print(mynamelist)

    df3 = df[['C']]
    cataloglist = df3.values
    mycataloglist = []
    for i in range(len(cataloglist)):
        mycataloglist.extend(cataloglist[i])
    print(len(mycataloglist))
    print(mycataloglist)

    schema = Schema(name=TEXT(stored=True),
                    description=TEXT(stored=True),
                    catalog=TEXT(stored=True))
    # Create the index structure; path is where the index is created and
    # indexname is the index name
    ix = create_in("IndexSearching/index", schema=schema, indexname='indexname')
    writer = ix.writer()
    for i in range(len(mydoclist)):
        # Content being added
        writer.add_document(name=str(mynamelist[i]),
                            description=str(mydoclist[i]),
                            catalog=str(mycataloglist[i]))
        print("Finished building one index entry")
    writer.commit()
    # The index has now been built

    new_list = []
    index = open_dir("IndexSearching/index", indexname='indexname')  # open the index built above
    with index.searcher() as searcher:
        parser = QueryParser("description", index.schema)  # "description" is the field to search
        myquery = parser.parse("map OR internet OR GPS")
        results = searcher.search(myquery, limit=20)  # limit caps the result count, default is 10
        for result1 in results:
            print(dict(result1))
            new_list.append(dict(result1))
    os.mkdir("indexdir")

ix = index.create_in("indexdir", schema)
ix = index.open_dir("indexdir")

# Index the documents
writer = ix.writer()
writer.add_document(Name=u"Super Mario World (USA)",
                    Title=u"Super Mario World (USA)")
writer.add_document(Name=u"Frogger 2 - Swampy's Revenge (USA)",
                    Title=u"Frogger 2 - Swampy's Revenge (USA)")
writer.add_document(Name=u"akumajou",
                    Title=u"Akuma-Jou Dracula (Japan ver. N)")
writer.commit()

with ix.searcher() as searcher:
    query = QueryParser("Name", ix.schema).parse(u'World Super')
    results = searcher.search(query)

    # Results
    found = results.scored_length()
    if results.has_exact_length():
        print("Scored", found, "of exactly", len(results), "documents")
    else:
        low = results.estimated_min_length()
        high = results.estimated_length()
        print("Scored", found, "of between", low, "and", high, "documents")

input("Press Enter to continue...")
    ix = create_in('./indexdir/', schema)

    # add documents
    news = utility.io.load_news()
    writer = ix.writer()
    print('Add documents...')
    for i, x in enumerate(news):
        if i % 1000 == 0:
            print('\t%d documents have been added.' % i)
        writer.add_document(title='news_%06d' % (i + 1), content=x)
    writer.commit()
else:
    print('Directly open previous indexed directory...')
    ix = open_dir('./indexdir')

print('Searching...')
parser = QueryParser('content', schema=ix.schema)
with ix.searcher() as searcher:
    queries_1 = utility.io.load_queries()
    queries_2, news_index, relevance = utility.io.load_training_data()
    td_sz = len(relevance)
    L = list()
    for idx, keyword in enumerate(QUERIES):
        ques = queries_1[idx]
        popout = []
        popout_0 = []
        popout_1 = []
        popout_2 = []
        popout_3 = []
        for j in range(td_sz):
            if queries_2[j] == ques and relevance[j] == 3 and (news_index[j] not in popout):
ix = index.create_in("indexdir", schema)
writer = ix.writer()
for root, dirs, filenames in os.walk('line_item_pkls'):
    for f in filenames:
        if (f != '.DS_Store'):
            body = ""
            line_items = pickle.load(
                io.open('line_item_pkls/' + f, 'r', encoding='utf-8'))
            for item in line_items:
                for i in range(int(item["number"])):
                    body += item["main item"].decode('unicode-escape') + " "
            writer.add_document(title=f.decode('unicode-escape'), body=body)
writer.commit()

with ix.searcher() as searcher:
    qp = QueryParser("body", schema=ix.schema)
    print "Hello friends, what would you like to eat today?",
    terms = raw_input()
    q = qp.parse(terms.decode('unicode-escape'))
    results = searcher.search(q)
    print results[0:9]
import sys
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
import urllib.parse

ix = open_dir("index")
results = None
searcher = ix.searcher()
parser = QueryParser("title", ix.schema)
query = parser.parse(" ".join(sys.argv[1:]))
print(query)
results = searcher.search(query, limit=None)
print(results)
for result in results:
    print(result['title'], '\n\t', result['content'])
def search(indices):
    inpval = 1
    # ----------> The above determines which category to search by
    qparsers = []
    if inpval == 1:
        categories = []
        # Note: there is a problem here. When documents are indexed incrementally,
        # duplicates easily appear, so the data is duplicated too; the original
        # documents live in the app function
        for ix in indices:
            # ix: FileIndex(FileStorage('indexdir'), 'superfamicom.db')
            cat = list(ix.schema._fields.keys())
            # ix.schema: <Schema: ['Developer', 'Publisher', 'ReleaseDate', 'Title']>
            # ix.schema._fields {'ReleaseDate': TEXT(format=Positions(boost=1.0), scorable=True, stored=True, unique=None),....
            # cat: ['Developer', 'ReleaseDate', 'Publisher', 'Title']
            print("cat:", cat)
            for val in cat:
                if val in categories:
                    val = val + " - " + ix.indexname  # if names repeat, make them distinguishable
                categories.append(val)
        # print("categories", categories)
        # -----------------> The above builds the categories list; it is really there to
        # distinguish duplicate field names across tables and put them all in one list

        s = "If 1, selection is by category; choose which category to search by: \n"
        # print(categories)
        inpval = 0
        # indices:
        # [FileIndex(FileStorage('indexdir'), 'dinosaur.db'), FileIndex(FileStorage('indexdir'), 'mmorpg.db'), FileIndex(FileStorage('indexdir'), 'superfamicom.db')]
        for i in range(0, len(categories)):
            # go through all categories giving options
            s += "\t" + str(i) + " to search in [" + categories[i] + "] \n"
        s += "--->: "
        inpval = int(input(s))  # choose which field to query on

        for i in range(0, len(indices)):
            qparsers.append(QueryParser(categories[inpval], indices[i].schema))

    # results holds the hits
    print("Content you want to search for:")
    inp = input("--->")
    data = inp.split('--->')
    print(data[0])

    queries = [qparsers[i].parse(data[0].strip()) for i in range(0, len(qparsers))]
    print("len(queries): ", len(queries))
    print("len(indices):", len(indices))

    # if len(data) > 1:
    #     limits = int(data[1].strip())
    # limits = 10  # limits is the number of results to show

    results = []
    stats = {}
    limit = 10
    for i in range(0, len(indices)):
        # indices: [FileIndex(FileStorage('indexdir'), 'dinosaur.db'), ...
        searcher = indices[i].searcher()
        res = searcher.search(queries[i], limit=limit)
        if len(res) != 0:
            # ix.indexname--> superfamicom.db
            stats[indices[i].indexname] = len(res)
        else:
            continue
        results.extend(res)

    # print(results)
    my_result = []
    for i in results:
        my_result.append(dict(i))
    print(my_result)
#cltk_index.index_corpus()

#_results = cltk_index.corpus_query('amicitia')
#_results = cltk_index.corpus_query('ἀνὴρ')
#print(_results)

user_dir = os.path.expanduser('~/cltk_data/user_data/search')
output_file = 'amicitia.html'
output_path = os.path.join(user_dir, output_file)

_index = open_dir('/Users/kyle/cltk_data/latin/index/phi5/work/')

query = 'amicitia'
output_str = ''

with _index.searcher() as searcher:
    _query = QueryParser("content", _index.schema).parse(query)
    results = searcher.search(_query, limit=None)
    results.fragmenter.charlimit = None

    # Allow larger fragments
    results.fragmenter.maxchars = 300
    # Show more context before and after
    results.fragmenter.surround = 50

    docs_number = searcher.doc_count_all()
    output_str += 'Docs containing hits: {}.'.format(docs_number) + '</br></br>'

    for hit in results:
        author = hit['author']
def run_query(text, index, bm25_params={}, **kwargs):
    # , qf="title_text_en^2 abstract_text_en^2 body_text_en^1.1", fields=['id','score'], size=1000, max_year=2016
    if type(index) is pysolr.Solr:
        kwargs['verb'] = 1
        qf = "text^1" if 'qf' not in kwargs else kwargs['qf']
        return_fields = ['id', 'score'] if 'return_fields' not in kwargs else kwargs['return_fields']  # return fields
        size = 1000 if 'size' not in kwargs else kwargs['size']
        max_year = 2016 if 'max_year' not in kwargs else kwargs['max_year']
        parser = 'edismax' if 'parser' not in kwargs else kwargs['parser']

        if 'verb' in kwargs:
            print(text)
            print(qf)
            print(bm25_params)

        if len(bm25_params) > 0:
            bm25.set_params(**bm25_params)

        q_params = {
            "fl": ','.join(return_fields),
            # "fq": "body_text_en:[* TO *] AND date_i:[* TO " + str(max_year) + "]",
            "fq": "date_i:[* TO " + str(max_year) + "]",
            # "pf": "abstract_text_en^1.2 title_text_en^2",
            # "start": "1",
            "rows": str(size),  # return maximum 1000 results,
            "defType": parser
        }
        if max_year == 0 or max_year >= 2016:
            q_params.pop('fq')
        if len(qf) > 0:
            q_params["qf"] = qf

        result = index.search(text, **q_params)
        return result, return_fields
    else:
        kwargs['verb'] = 1
        qf = "text^1" if 'qf' not in kwargs else kwargs['qf']
        return_fields = ['id', 'score'] if 'return_fields' not in kwargs else kwargs['return_fields']  # return fields
        size = 1000 if 'size' not in kwargs else kwargs['size']
        max_year = 0 if 'max_year' not in kwargs else kwargs['max_year']
        # parser = 'edismax' if 'parser' not in kwargs else kwargs['parser']

        qf_fields = [s.split("^")[0] for s in qf.split()]
        qf_boosts = [1 if len(s.split("^")) == 1 else float(s.split("^")[1]) for s in qf.split()]
        qff = [f for f, b in zip(qf_fields, qf_boosts) if b != 0]
        qfb = [b for f, b in zip(qf_fields, qf_boosts) if b != 0]
        boost_dict = {}
        for f, b in zip(qff, qfb):
            boost_dict[f] = b

        if 'verb' in kwargs:
            print(text)
            print(qf)
            print()

        output = []
        if len(bm25_params) > 0:
            w = scoring.BM25F(**bm25_params)
        else:
            w = scoring.BM25F()
            print('Default scoring')

        with index.searcher(weighting=w) as searcher:
            query = MultifieldParser(qff, index.schema, fieldboosts=boost_dict, group=OrGroup).parse(text)
            if max_year > 0:
                mask_q = QueryParser("year", index.schema).parse("date_i:[" + str(max_year) + " to]")
                results = searcher.search(query, limit=size, mask=mask_q)
            else:
                results = searcher.search(query, limit=size)

            for r in results:
                results_row = {}
                results_row['score'] = r.score
                for f in return_fields:
                    if f not in results_row:
                        # print(r)
                        if f in r:
                            results_row[f] = r[f]
                        else:
                            results_row[f] = ''
                output.append(results_row)

        return output, return_fields


# solr = pysolr.Solr("http://130.155.204.198:8983/solr/trec-cds-2016", timeout=1200)
# res1 = run_query('adult^1 elderly^1 man^1 calf^1 pain^1 walking^1 uphill^1 history^1 ischemic^1 heart^1 disease^1 worsening^1 hypertension^1 despite^1 medication^1 compliance^1 physical^1 exam^1 right^1 carotid^1 bruit^1 lower^1 extremities^1 cool^1 diminished^1 dorsalis^1 pedis^1 pulses^1', solr, qf='title_text_en^2 abstract_text_en^2 body_text_en^1.1', max_year=2013, size=5)
# res2 = run_query('adult^1 elderly^1 man^1 calf^1 pain^1 walking^1 uphill^1 history^1 ischemic^1 heart^1 disease^1 worsening^1 hypertension^1 despite^1 medication^1 compliance^1 physical^1 exam^1 right^1 carotid^1 bruit^1 lower^1 extremities^1 cool^1 diminished^1 dorsalis^1 pedis^1 pulses^1', solr, qf='text^1', max_year=2013, size=5)
#
# rs = [res1, res2]
#
# for r in rs:
#     for line in r:
#         print (line)
#     print ()
    def corpus_query(self, query, save_file=None, window_size=300, surround_size=50):
        """Send query to a corpus's index. `save_file` is a filename.

        >>> cltk_index = CLTKIndex('latin', 'phi5')
        >>> results = cltk_index.corpus_query('amicitia')

        :type save_file: str
        """
        _index = open_dir(self.index_path)

        output_str = ''

        with _index.searcher() as searcher:
            _query = QueryParser("content", _index.schema).parse(query)
            results = searcher.search(_query, limit=None)
            results.fragmenter.charlimit = None

            # Allow larger fragments
            results.fragmenter.maxchars = window_size
            # Show more context before and after
            results.fragmenter.surround = surround_size

            docs_number = searcher.doc_count_all()
            output_str += 'Docs containing hits: {}.'.format(docs_number) + '</br></br>'

            for hit in results:
                author = hit['author']
                filepath = hit['path']
                output_str += author + '</br>'
                output_str += filepath + '</br>'

                with open(filepath) as file_open:
                    file_contents = file_open.read()

                highlights = hit.highlights("content", text=file_contents, top=10000000)
                lines = highlights.split('\n')
                #lines_numbers = [l for l in lines]
                lines_br = '</br>'.join(lines)
                lines_number_approx = len(lines)
                output_str += 'Approximate hits: {}.'.format(lines_number_approx) + '</br>'
                output_str += lines_br + '</br></br>'

        if save_file:
            user_dir = os.path.expanduser('~/cltk_data/user_data/search')
            output_path = os.path.join(user_dir, save_file + '.html')
            try:
                with open(output_path, 'w') as file_open:
                    file_open.write(output_str)
            except FileNotFoundError:
                os.mkdir(user_dir)
                with open(output_path, 'w') as file_open:
                    file_open.write(output_str)
        else:
            return output_str
def search():
    query = request.form['query']
    q = []
    q.append(query)
    r = []  # complete path
    c = []  # preview of the paste content
    paste_date = []
    paste_size = []
    paste_tags = []
    index_name = request.form['index_name']
    num_elem_to_get = 50

    # select correct index
    if index_name is None or index_name == "0":
        selected_index = get_current_index()
    else:
        selected_index = os.path.join(baseindexpath, index_name)

    '''
    temporary disabled
    # # TODO: search by filename/item id
    '''

    # Search full line
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)

    ix = index.open_dir(selected_index)
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse("".join(q))
        results = searcher.search_page(query, 1, pagelen=num_elem_to_get)
        for x in results:
            r.append(x.items()[0][1].replace(PASTES_FOLDER, '', 1))
            path = x.items()[0][1].replace(PASTES_FOLDER, '', 1)
            paste = Paste.Paste(path)
            content = paste.get_p_content()
            content_range = max_preview_char if len(content) > max_preview_char else len(content) - 1
            c.append(content[0:content_range])
            curr_date = str(paste._get_p_date())
            curr_date = curr_date[0:4] + '/' + curr_date[4:6] + '/' + curr_date[6:]
            paste_date.append(curr_date)
            paste_size.append(paste._get_p_size())
            p_tags = r_serv_metadata.smembers('tag:' + path)
            l_tags = []
            for tag in p_tags:
                complete_tag = tag
                tag = tag.split('=')
                if len(tag) > 1:
                    if tag[1] != '':
                        tag = tag[1][1:-1]
                    # no value
                    else:
                        tag = tag[0][1:-1]
                # use for custom tags
                else:
                    tag = tag[0]
                l_tags.append((tag, complete_tag))
            paste_tags.append(l_tags)
        results = searcher.search(query)
        num_res = len(results)

    index_list = get_index_list()
    index_min = 1
    index_max = len(index_list)

    return render_template("search.html", r=r, c=c, query=request.form['query'],
                           paste_date=paste_date, paste_size=paste_size,
                           char_to_display=max_preview_modal, num_res=num_res,
                           index_min=index_min, index_max=index_max,
                           bootstrap_label=bootstrap_label, paste_tags=paste_tags,
                           index_list=index_list)
    def find_path(self, path):
        parser = QueryParser('path', self._index.schema)
        query = parser.parse(path)
        return self._search(query, limit=1)
writer.add_document(
    title=u"document2",
    path=u"/b",
    content=u"The second one 你 中文测试中文 is even more interesting! 吃水果")
writer.add_document(title=u"document3", path=u"/c", content=u"买水果然后来世博园。")
writer.add_document(title=u"document4",
                    path=u"/c",
                    content=u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
writer.add_document(title=u"document4", path=u"/c", content=u"咱俩交换一下吧。")
writer.commit()

searcher = ix.searcher()
parser = QueryParser("content", schema=ix.schema)

for keyword in (u"水果世博园", u"你", u"first", u"中文", u"交换机", u"交换"):
    print "result of ", keyword
    q = parser.parse(keyword)
    results = searcher.search(q)
    for hit in results:
        print hit.highlights("content")
    print "=" * 10

words_train('movie.txt', 'movie_key.txt', 'movie.graph')
cor = YahaCorrector('movie_key.txt', 'movie.graph')
sugs = cor.suggest(u"刘牛德")
print " ".join(sugs)
    def find_text(self, text, pagenum=1, limit=10):
        parser = QueryParser('content', self._index.schema)
        query = parser.parse(text)
        return self._search(query, pagenum, limit)
sentences = sent_detector.tokenize(review)
writer = ix.writer()
for sentence in sentences:
    writer.add_document(content=unicode(sentence))

feature = agreement[1] if agreement[1] is not None else agreement[2]
if(feature == "size limit"):
    print "stop"
writer.commit()

sentences_feature = ""
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema,
                        group=qparser.OrGroup).parse(unicode(feature))
    results = searcher.search(query)
    if(len(results) == 0):
        sentences_feature = agreement[6]
    else:
        sentences_feature = results[0]["content"]

# Prepare SQL query to UPDATE a record in the database.
sql = "UPDATE agreements SET sentence = (%s) WHERE id = (%s) "
try:
    # Execute the SQL command
    cursor.execute(sql, (str(sentences_feature).encode('ascii', 'ignore'), agreement[5]))
    # Commit your changes in the database
    db.commit()
except Exception, e:
    # Rollback in case there is any error
def main():
    arg_parser = argparse.ArgumentParser(
        description='Perform a full-text search over stored telegram messages.',
        prog='tgminer-search')

    arg_parser.add_argument('--version', action='version',
                            version='%(prog)s ' + tgminer.__version__)

    arg_parser.add_argument('query', help='Query text.')

    arg_parser.add_argument(
        '--config',
        help='Path to TGMiner config file, defaults to "CWD/config.json". '
             'This will override the environmental variable '
             'TGMINER_CONFIG if it was defined.')

    arg_parser.add_argument(
        '--limit',
        help='Results limit, 0 for infinite. Default is 10.',
        type=query_limit(arg_parser),
        default=10)

    arg_parser.add_argument(
        '--markov',
        help='Generate a markov chain file from the messages in your query results.',
        metavar='OUT_FILE')

    arg_parser.add_argument(
        '--markov-state-size',
        default=None,
        help='The number of words to use in the markov model\'s state, default is 2. '
             'Must be used in conjunction with --markov.',
        type=markov_state_size(arg_parser))

    arg_parser.add_argument(
        '--markov-optimize',
        default=None,
        choices=('accuracy', 'size'),
        help='The default option "accuracy" produces a larger chain file where '
             'all trailing word/sequence probabilities are considered for every word in '
             'a message. This can result in a very large and slow to load chain if the '
             'state size is set to a high value. Setting this to "size" will cause '
             'trailing probabilities for the words inside the sequence that makes up a state '
             'to be discarded, except for the last word. This will make the chain smaller '
             'but results in more of an approximate model of the input messages.')

    args = arg_parser.parse_args()

    if args.markov_state_size is not None and args.markov is None:
        arg_parser.error(
            'Must be using the --markov option to use --markov-state-size.')

    if args.markov_optimize is not None and args.markov is None:
        arg_parser.error(
            'Must be using the --markov option to use --markov-optimize.')

    if args.markov_state_size is None:
        args.markov_state_size = 2

    if args.markov_optimize is None:
        args.markov_optimize = 'accuracy'

    config = None  # hush intellij highlighted undeclared variable use warning

    config_path = tgminer.config.get_config_path(args.config)

    if os.path.isfile(config_path):
        try:
            config = tgminer.config.TGMinerConfig(config_path)
        except tgminer.config.TGMinerConfigException as e:
            enc_print(str(e), file=sys.stderr)
            exit(exits.EX_CONFIG)
    else:
        enc_print(f'Cannot find tgminer config file: "{config_path}"')
        exit(exits.EX_NOINPUT)

    index = whoosh.index.open_dir(os.path.join(config.data_dir, 'indexdir'))

    index_lock_path = os.path.join(config.data_dir, 'tgminer_mutex')

    schema = tgminer.fulltext.LogSchema()

    query_parser = QueryParser('message', schema=schema)

    query = query_parser.parse(args.query)

    def result_iter():
        with fasteners.InterProcessLock(index_lock_path):
            with index.searcher() as searcher:
                yield from searcher.search(
                    query,
                    limit=None if args.limit < 1 else args.limit,
                    sortedby='timestamp')

    if args.markov:
        split_by_spaces = re.compile(r'\s+')

        chain = kovit.Chain()

        if args.markov_optimize == 'accuracy':
            word_iter = kovit.iters.iter_window
        else:
            word_iter = kovit.iters.iter_runs

        anything = False
        for hit in result_iter():
            message = hit.get('message', None)
            if message:
                anything = True
                for start, next_items in word_iter(
                        split_by_spaces.split(message), args.markov_state_size):
                    chain.add_to_bag(start, next_items)

        if not anything:
            enc_print('Query returned no messages!', file=sys.stderr)
            exit(exits.EX_SOFTWARE)

        try:
            with open(args.markov, 'w', encoding='utf-8') as m_out:
                chain.dump_json(m_out)
        except OSError as e:
            enc_print(
                f'Could not write markov chain to file "{args.markov}", error: {e}',
                file=sys.stderr)
            exit(exits.EX_CANTCREAT)
    else:
        for hit in result_iter():
            message = hit.get('message', None)
            username = hit.get('username', None)
            alias = hit.get('alias', 'NO_ALIAS')
            to_username = hit.get('to_username', None)
            to_alias = hit.get('to_alias', None)
            to_id = hit.get('to_id')

            username_part = f' [@{username}]' if username else ''

            timestamp = config.timestamp_format.format(hit['timestamp'])

            chat_slug = hit['chat']

            media = hit.get('media', None)

            to_username_part = f' [@{to_username}]' if to_username else ''

            to_user_part = f' to {to_alias}{to_username_part}' if to_alias or to_username_part else ''

            if media:
                caption_part = f' Caption: {message}' if message else ''
                enc_print(
                    f'{timestamp} chat="{chat_slug}" to_id="{to_id}"{to_user_part} | '
                    f'{alias}{username_part}: {media}{caption_part}')
            else:
                enc_print(
                    f'{timestamp} chat="{chat_slug}" to_id="{to_id}"{to_user_part} | '
                    f'{alias}{username_part}: {hit["message"]}')