Code Example #1
File: indexletter.py  Project: Softcatala/conjugador
def search(self):
    # Open a searcher on the letter index and build a query restricted
    # to the index_letter field.
    self.searcher = ix_letter.searcher()
    fields = ["index_letter"]
    qs = u'index_letter:({0})'.format(self.word)
    self.query = MultifieldParser(fields, ix_letter.schema).parse(qs)
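The method above only opens a searcher and builds self.query without running it; a minimal sketch of executing the query follows (the method name get_results and the limit value are assumptions, not from the project):

def get_results(self, limit=10):
    # Hypothetical helper: run the query built in search() and return
    # the stored fields of each hit. Assumes search() was called first.
    return [hit.fields() for hit in self.searcher.search(self.query, limit=limit)]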
Code Example #2
# Python application which provides live (search-as-you-type) MSID results.
from whoosh import index
from whoosh.qparser import MultifieldParser

## Constants
Searchable = ('MSID', 'TECHNICAL_NAME', 'DESCRIPTION')  # fieldnames to search on
## Input - what am I searching for?
MyQuery = "PCA"
MSID_index_dir = 'MSID_idx_7'        # Relative to current path.

## Open index
ix = index.open_dir(MSID_index_dir)  # TBD: add cmdline flag to set/use a particular index
qp = MultifieldParser(Searchable, schema=ix.schema)
q = qp.parse(MyQuery)

with ix.searcher() as s:
    results = s.search(q)
    print(len(results))
    if results:
        print(list(results[0].fields().keys()))  # stored field names of the first hit
    for res in results:
        print(res['MSID'] + ' - ' + res['TECHNICAL_NAME'])
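The header comment promises search-as-you-type, yet the script runs a single hard-coded query; a minimal sketch of the interactive loop that comment suggests (the prompt text and the 10-hit limit are assumptions):

# Hypothetical type-ahead loop: re-run the query on every line typed.
while True:
    typed = input('search> ').strip()
    if not typed:
        break  # empty input ends the session
    with ix.searcher() as s:
        for hit in s.search(qp.parse(typed), limit=10):
            print(hit['MSID'] + ' - ' + hit['TECHNICAL_NAME'])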
    
Code Example #3
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser
import wikipedia

index_path = r"C:\Users\Abhi\Downloads\Index"
ix = open_dir(index_path)
mparser = MultifieldParser(["title", "content"], schema=ix.schema)


def search():
    query = input("Hi! How can I help you? ")
    q = mparser.parse(query)
    with ix.searcher() as searcher:
        results = searcher.search(q, limit=20)
        for result in results:
            print(result['content'])
            # Two-sentence Wikipedia summary for the matching title.
            print(wikipedia.summary(result['title'], sentences=2))


if __name__ == '__main__':
    search()
    # Flask variant left commented out in the original project:
    # if len(result) != 0:
    #     return render_template("search.html", results=result)
    # else:
    #     return render_template("NotFound.html")
Code Example #4
    def parser(self,
               keyword,
               docType="all",
               sortType="None",
               fromDate="all time"):
        """
        Searches an Index for documents.

        Searches the Index for documents containing the selected keywords, with the selected parameters and the
        selected sorting method.

        Parameters
        ----------
        keyword : string
            Keywords to look for in the documents
        docType : string{'all', 'pdf', 'tex', 'py'}
            Type of document to look for.
        sortType : string{'None', 'By Number of occurrences', 'By Date'}
            Sorting method to use.
        fromDate : string{'all time','this year', 'this month', 'this week'}
            Time interval to look for documents in.

        Returns
        -------
        bool
            Result of the operation.
        array
            Documents found in the search.

        """
        try:
            ix = open_dir(self.indexFolder)
        except EmptyIndexError:
            return (
                False,
                ("The index provided does not exist; make sure you add it before using it and do not "
                 + "delete it manually"),
            )

        resultArray = []
        keyword = analyzeText(unidecode.unidecode(keyword))

        today = dt.now()
        date = ""

        if keyword == "":
            parseQuery = ""
        else:
            parseQuery = "content:" + keyword

        if fromDate != "all time":
            if fromDate == "this year":
                date = today - relativedelta(years=1)
            elif fromDate == "this month":
                date = today - relativedelta(months=1)
            elif fromDate == "this week":
                date = today - relativedelta(weeks=1)

            parseQuery = (parseQuery + " " + u"date:[" +
                          date.strftime("%Y%m%d") + " to " +
                          today.strftime("%Y%m%d") + "]")

        with ix.searcher() as searcher:
            if docType == "all":
                query = MultifieldParser(["content", "date"],
                                         schema=ix.schema).parse(parseQuery)
            else:
                query = MultifieldParser(
                    ["content", "date", "tags"],
                    schema=ix.schema).parse(parseQuery + " tags:" + docType)

            results = ""

            if sortType == "By Date":
                results = searcher.search(query, sortedby="date", reverse=True)
            elif sortType == "By Number of occurrences":
                results = searcher.search(query,
                                          sortedby="nOccurrences",
                                          reverse=True)
            else:
                results = searcher.search(query)

            if results.is_empty():
                return (
                    False,
                    "No results were found with these search parameters",
                )
            else:
                for result in results:
                    path = result["path"]
                    tag = result["tags"]

                    resultArray.append([path, tag])

        return True, resultArray
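A minimal usage sketch for the method above (the object name, keyword, and parameter values are made-up; the snippet does not show the enclosing class):

# Hypothetical caller: 'indexer' is whatever object defines parser()
# and carries indexFolder.
ok, payload = indexer.parser("report",
                             docType="pdf",
                             sortType="By Date",
                             fromDate="this month")
if ok:
    for path, tag in payload:  # payload is the array of [path, tags] pairs
        print(path, tag)
else:
    print(payload)             # payload is the error message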
Code Example #5
File: SearchApi.py  Project: zhoulinfei/ScrapyDemo
results = []
# if __name__ == '__main__':
#     index_path = "./sindex"
# else:
#     index_path = "./webspider/sindex"
index_path = search_path
try:
    ix = open_dir(index_path)  # directory holding the index to search
except Exception:
    pass  # index missing or unreadable; fall through with empty results
else:
    if not inputstring:
        pass
    else:
        # Sort newest first on publish_time.
        publish_time = sorting.FieldFacet("publish_time", reverse=True)
        qp = MultifieldParser(["title", "body"], schema=ix.schema)
        with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
            querystring = qp.parse(inputstring)
            results = searcher.search(querystring,
                                      terms=True,
                                      limit=None,
                                      sortedby=[publish_time])
            # print(len(results))
            # results = searcher.search_page(querystring, page)
            html_parser = HTMLParser.HTMLParser()  # Python 2 stdlib
            if len(results) > 0:
                # Manual paging over the full result list.
                for i in xrange((page - 1) * size, page * size):
                    if i < len(results):
                        tmpret = results[i].fields()
                        hit_keywords = set()
                        for key, val in results[i].matched_terms():
                            hit_keywords.add(val)  # snippet is truncated here in the original
Code Example #6
def _parser(fieldnames, schema, group, **kwargs):
    # Thin wrapper that forwards directly to whoosh's MultifieldParser.
    return MultifieldParser(fieldnames, schema, group=group, **kwargs)
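A minimal sketch of calling this wrapper (the field names and the OR-grouping choice are assumptions; ix is an already-open index):

from whoosh.qparser import OrGroup

parser = _parser(["title", "body"], ix.schema, group=OrGroup)
query = parser.parse(u"apple pie")  # matches documents containing either term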
Code Example #7
def main():
    args = parse_args()
    query = args.query
    number = args.number
    rank_func = args.rank_func
    index_loc = args.index_loc
    B = args.B
    weight_B = args.weight_B
    K1 = args.K1

    if query is None:
        query_list = read_query()
    else:
        query_list = [' '.join(query)]

    if index_loc is None:
        index_loc = 'index'

    if weight_B is not None:
        rank_func = 1

    if rank_func == 1:
        B1, B2, B3, B4, B5 = get_B(weight_B)
        weighting = scoring.BM25F(B=B,
                                  K1=K1,
                                  title_B=B1,
                                  body_B=B2,
                                  category_B=B3,
                                  date_B=B4,
                                  rating_B=B5)
        rank_name = 'bm25f'
    elif rank_func == 2:
        weighting = scoring.TF_IDF()
        rank_name = 'tf-idf'
    elif rank_func == 3:
        weighting = scoring.Frequency()
        rank_name = 'frequency'
    else:
        weighting = scoring.BM25F(B=B, K1=K1)
        rank_name = 'bm25'

    ix = open_dir(index_loc)

    with ix.searcher(weighting=weighting) as searcher:
        # parser = QueryParser(schema=ix.schema)
        parser = MultifieldParser(
            ['title', 'body', 'category', 'date', 'rating'], schema=ix.schema)
        for this_query in query_list:
            que = parser.parse(this_query)
            print('\n')
            print('--', this_query)
            results = searcher.search(que, limit=number)
            if len(results) == 0:
                print(' ')
                print('no matched result. please try again.')
            else:
                for hit in results:
                    print(' ')
                    print('#', hit.rank, rank_name, 'score:',
                          round(hit.score, 10))
                    print('title:', hit['title'])
                    print('imdb:', hit['imdbid'], 'date:', hit['date'],
                          'rating:', hit['rating'], 'category:',
                          hit['category'])
                    print('body:', hit['body'])
Code Example #8
File: searchText.py  Project: LonSilent/udnSearch
from whoosh import qparser
from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import FileStorage
from whoosh.qparser import MultifieldParser

import chinese  # project-local module providing the ChineseAnalyzer

analyzer = chinese.ChineseAnalyzer()
# Mirrors the schema the index was built with.
schema = Schema(title=TEXT(stored=True),
                sub_title=TEXT(stored=True),
                author=TEXT(stored=True),
                content=TEXT(stored=True, analyzer=analyzer))

storage = FileStorage("indexdir")
ix = storage.open_index()

string = "桐花 樂團"
normal = False

with ix.searcher() as searcher:
    # og = qparser.OrGroup.factory(0.9)
    parser = MultifieldParser(["title", "sub_title", "author", "content"],
                              schema=ix.schema)
    # parser = qparser.QueryParser("content", ix.schema)
    parser.remove_plugin_class(qparser.PhrasePlugin)
    parser.add_plugin(qparser.SequencePlugin())

    if normal:
        query = parser.parse(string)
    else:
        # Proximity search: quote the string and add a slop factor so the
        # terms may appear up to roughly `distance` characters apart.
        distance = 50
        proximity_query = '"' + string + '"' + '~' + str((1 + distance) * 3)
        query = parser.parse(proximity_query)

    print(query)
    results = searcher.search(query)
    # Allow larger fragments
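The trailing comment hints at fragment tuning; a plausible continuation using Whoosh's highlighting API (the maxchars and surround values are assumptions):

from whoosh import highlight

# Let each highlighted excerpt span more text around the matched terms.
results.fragmenter = highlight.ContextFragmenter(maxchars=300, surround=100)
for hit in results:
    print(hit.highlights("content"))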
Code Example #9
import whoosh.index
from whoosh.qparser import MultifieldParser, OrGroup, WildcardPlugin

whoosh_idx = whoosh.index.open_dir('whoosh_idx', indexname='nasdaq')
query_parser = MultifieldParser(['title', 'article'],
                                schema=whoosh_idx.schema,
                                group=OrGroup)
query_parser.remove_plugin_class(WildcardPlugin)

parsed_query = query_parser.parse('What market does FitBit compete in?')

with whoosh_idx.searcher() as searcher:
    search_results = searcher.search(parsed_query, limit=1)
    for sr in search_results:
        print(sr['title'])
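Dropping WildcardPlugin matters for this query: without it, the trailing '?' in 'in?' would parse as a single-character wildcard rather than a plain term. Printing the parsed query is a quick way to check:

print(parsed_query)  # shows how the question parses with wildcards disabled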