Example #1
    def search_metaindex_by_keyword(self, text, limit=None, timelimit=1):
        """
            Performs a query in the metadata search index by the 'key' field.
            Arguments:
                text: String used to perform the search in the index.
                limit: Maximum number of results to be returned. By default there is no limit.
                timelimit: Maximum number of seconds to execute the search. Searches that
                           take longer than timelimit will return only partial results.
            Returns:
                A list of dictionaries, each containing the fields in the metadata
                index, whose values match the query text in the 'key' field.
        """
        results_list = []
        if self.metaindex:
            with self.metaindex.searcher() as searcher:
                query = QueryParser('key', self.metaindex.schema).parse(text)
                coll = searcher.collector(limit)
                tlc = TimeLimitCollector(coll, timelimit, use_alarm=False)

                # Try searching
                try:
                    searcher.search_with_collector(query, tlc)
                except TimeLimit:
                    print(
                        "search_metaindex_by_keyword: index search took too long, aborting!"
                    )

                # get partial results, if available
                results = tlc.results()
                for res in results:
                    results_list.append(dict(res))

        return results_list
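The pattern above is the standard Whoosh timeout recipe: wrap a plain collector in a TimeLimitCollector, run search_with_collector(), catch TimeLimit, then read whatever was gathered so far off the collector. A minimal self-contained sketch of the same flow, where the index directory "my_index", the field name, and the query string are assumptions for illustration:

# Minimal sketch; "my_index" and the 'key' field are hypothetical.
from whoosh import index
from whoosh.qparser import QueryParser
from whoosh.collectors import TimeLimitCollector, TimeLimit

ix = index.open_dir("my_index")
with ix.searcher() as searcher:
    query = QueryParser("key", ix.schema).parse("some text")
    tlc = TimeLimitCollector(searcher.collector(limit=20), timelimit=1.0)
    try:
        searcher.search_with_collector(query, tlc)
    except TimeLimit:
        pass  # the collector still holds everything matched so far
    for hit in tlc.results():  # partial results if the search timed out
        print(dict(hit))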
Example #2
 def doSearch(self, text):
     q = self.qp.parse(text)  # build query
     with self.ix.searcher(
             weighting=scoring.Frequency) as s:  # simple scorer may help
         c = s.collector(limit=self.MaxResults)
         c = TimeLimitCollector(c, 0.5)
         try:
             s.search_with_collector(q, c)
      except TimeLimit:                  # catch only the search timeout
          print("TIMEOUT!")
         results = c.results()  # partial results if hung
         self.searchResults.clear()
         #my_cf = highlight.PinpointFragmenter(maxchars=100, surround=60)
         my_cf = highlight.ContextFragmenter(maxchars=160, surround=30)
         #my_cf = highlight.SentenceFragmenter(maxchars=200, sentencechars='\n')
         results.fragmenter = my_cf
         if len(results) > 0:
             for res in results:
                 res.fragmenter = my_cf
                 # self.searchResults.append(res.highlights('Text',top=1) + '*--*\n' + res['MeetingLink']+ '\n')
                 self.searchResults.append(res.highlights('Text', top=1))
                 self.searchResults.append('-Link to Meeting -')
                 self.searchResults.append(res['MeetingLink'] + '\n')
                 self.searchResults.append('----------')
                 self.searchResults.append('----------')
         cursor = self.searchResults.moveCursor(QtGui.QTextCursor.Start)
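The highlighting step above relies on only two Whoosh hooks: the fragmenter attribute on the Results object and Hit.highlights(). A stripped-down sketch of just that step, assuming a stored field named 'Text' as in the example and an existing results object:

# Sketch of the highlighting hooks; 'Text' must be a stored field.
from whoosh import highlight

results = tlc.results()  # any Results object works here
results.fragmenter = highlight.ContextFragmenter(maxchars=160, surround=30)
for hit in results:
    print(hit.highlights('Text', top=1))  # best snippet around the matched terms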
Example #3
File: ir.py Project: BinbinBian/qb
    def full_search(self, query, time_limit=-1, search_limit=50,
                    edit_dist=0):
        val = {}

        try:
            searcher = self._index.searcher(weighting=scoring.TF_IDF())
            if time_limit > 0:
                c = searcher.collector(limit=search_limit)
                tlc = TimeLimitCollector(c, timelimit=time_limit)
                try:
                    searcher.search_with_collector(query, tlc)
                except TimeLimit:
                    pass  # fall through and collect partial results
                try:
                    res = tlc.results()
                except TimeLimit:
                    res = []
            else:
                res = searcher.search(query, limit=search_limit)

            for ii in res:
                val[ii['title']] = (ii.docnum, self.scale(ii.score))
        finally:
            searcher.close()
        return val
Example #4
 def doSearch(self, text):
     q = self.qp.parse(text)          # build query with event-provided search key
     with self.ix.searcher(weighting = scoring.BM25F) as s:    # there are several NLP style scorers for Whoosh
         c = s.collector(limit=self.MaxResults)                # The "collector" allows setting the timeout for a search. In this case it's 0.5 seconds which is a little long...
         c = TimeLimitCollector(c,0.5)               
         try:
             s.search_with_collector(q,c)
          except TimeLimit:                           # catch only the search timeout
              print("TIMEOUT!")                       # DEBUG output to console if we're timing out a lot
         results = c.results()                       # If we do get a timeout, still return whatever we've got, i.e. partial results 
                                                     #-----------------------------------------------------
         self.searchResults.clear()                  # ** Now format the results for display ** 
         results.fragmenter = WholeFragmenter()      # we want the full technical name not just the local context.
         self.MaudeResults.clear()                  # Clear
         if len(results)> 0:
             self.results = [] 
             for res in results:
                 self.results.append(res['msid'])
                 HighLightedMsid = res.highlights('msid')  # construct MSID string with highlights, if that's where the match is... 
                 if len(HighLightedMsid) >0:
                     msid_str = HighLightedMsid
                 else:
                     msid_str = res['msid']
                 HighLightedTechName = res.highlights('technical_name')  # construct technical_name string with highlights, if relevant
                 if len(HighLightedTechName) >0:
                     tech_str = HighLightedTechName
                 else:
                     tech_str = res['technical_name']
                 self.searchResults.append(msid_str + ' - ' + tech_str)
         cursor = self.searchResults.moveCursor(QtGui.QTextCursor.Start)     # return cursor to beginning of search results     
Example #5
    def search(self, text: str, limit: int, timelimit=3.0):
        with self.index.searcher() as searcher:
            or_group = OrGroup.factory(.9)
            parser = MultifieldParser(['content', 'quiz_bowl'],
                                      schema=self.schema,
                                      group=or_group)
            text_query = parser.parse(text)
            collector = searcher.collector(limit=limit)
            tlc = TimeLimitCollector(collector, timelimit=timelimit)
            partial = True
            try:
                searcher.search_with_collector(text_query, tlc)
                partial = False
            except searching.TimeLimit:
                pass

            # There is a bug in whoosh that makes calling len() directly or indirectly fail,
            # which is why we don't use list()
            results = [(r['page'], r.score) for r in tlc.results()]

            # Logging via the partial flag, rather than calling log.info directly in the
            # except block, is required due to a mysterious race condition between whoosh
            # time limits and log.info. It's important that all of whoosh's functions,
            # including search_with_collector() and tlc.results(), are called before
            # logging anything
            if partial:
                log.info(
                    'Search took longer than {}s, getting partial results'.
                    format(timelimit))

            if len(results) == 0:
                return [('<UNK_ANSWER>', 0)]

            return results
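The timed-out flag pattern in this example can be distilled into a small reusable helper. A sketch, assuming an already-open searcher and an already-parsed query:

from whoosh.collectors import TimeLimitCollector, TimeLimit

def timed_search(searcher, query, limit=10, timelimit=3.0):
    """Run a query under a time budget; returns (results, timed_out)."""
    tlc = TimeLimitCollector(searcher.collector(limit=limit),
                             timelimit=timelimit)
    timed_out = False
    try:
        searcher.search_with_collector(query, tlc)
    except TimeLimit:
        timed_out = True
    return tlc.results(), timed_out  # results are partial when timed_out is True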
Example #6
def test_video():
    index_path = os.path.join(config.index_root_dir, 'video')
    storage = FileStorage(index_path)
    ix = storage.open_index()
    with ix.searcher() as searcher:
        #print(list(searcher.lexicon('title')))
        myquery = Term('title', u'全面')
        #myquery = Term('movieid', u'mi1022160')
        tc = searcher.collector(limit=20)
        tlc = TimeLimitCollector(tc, timelimit=1)  # limit search time
        searcher.search_with_collector(myquery, tlc)
        #for hit in tlc.results():
        #    print(hit.fields())
        #    print(hit.fields().get('title'))
        print('===========================')
        results = searcher.search_page(myquery, 1, 10)
        #for hit in results:
        #    print(hit.fields().get('title'))
        print('===============================')
        parser = MultifieldParser(['title', 'pinyin_title'], ix.schema)
        #parser = QueryParser('title', schema=ix.schema)
        q = parser.parse(u'quan mian')
        results = searcher.search_page(q, 1, 10)
        for hit in results:
            print(hit.fields())
Example #7
def test_media():
    index_path = os.path.join(config.index_root_dir, 'media')
    storage = FileStorage(index_path)
    ix = storage.open_index()
    with ix.searcher() as searcher:
        #print(list(searcher.lexicon('title')))
        myquery = Term('title', u'尾巴')
        #myquery = Term('movieid', u'mi1022160')

        tc = searcher.collector(limit=200)
        tlc = TimeLimitCollector(tc, timelimit=1)  # limit search time
        searcher.search_with_collector(myquery, tlc)
        for hit in tlc.results():
            print(hit.fields())
Example #8
 def __init__(self,MSID_index_dir, Searchable,MaxResults=10,Timeout = 0.5):
      ''' Initializes the wrapper object with index reference and preferences
         parameter MSID_index_dir        = (string) Existing Whoosh Index directory
         parameter Searchable            = (string) List of fieldnames of the index to search
         parameter MaxResults       = (numeric) Maximum # of results to return
         parameter Timeout       = (numeric) Maximum # of seconds to wait before ending search
     
     '''
      self.ix = index.open_dir(MSID_index_dir)                         # Open the existing Whoosh index
     self.qp = MultifieldParser(Searchable, schema=self.ix.schema)    # Search all the specified fields
     #self.qp =  QueryParser(Searchable[0], schema=self.ix.schema)    # Search ONLY the first field
     #self.s = self.ix.searcher(weighting = scoring.Frequency)        # Simple Scorer
     self.s = self.ix.searcher(weighting = scoring.BM25F)         # Fancy Scorer
     c = self.s.collector(limit=MaxResults)                # The "collector" allows setting the timeout for a search. In this case it's 0.5 seconds which is a little long...
     self.c = TimeLimitCollector(c,Timeout)               
     self.Searchable = Searchable
     self.LastResults = None
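A hypothetical instantiation of this wrapper; the class name MsidSearch, the index path, and the field list are assumptions for illustration, since only __init__ is shown:

# Hypothetical usage; names and paths are illustrative only.
finder = MsidSearch('msid_index_dir', ['msid', 'technical_name'],
                    MaxResults=25, Timeout=0.25)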
Example #9
File: qdb.py Project: BinbinBian/qb
    def find_closest(self, raw_query, threshold=50):
        """
        Returns the best score of similarity
        """
        from whoosh import qparser
        from whoosh.qparser import QueryParser
        from fuzzywuzzy import fuzz
        from extractors.ir import IrIndex

        if self.parser is None:
            og = qparser.OrGroup.factory(0.9)
            self.parser = QueryParser("text", schema=self.schema, group=og)

        query_text, query_len = IrIndex.prepare_query(raw_query.lower())
        print("Query: %s" % query_text)
        query = self.parser.parse(query_text)
        print("-------------")
        closest_question = -1
        with self.index.searcher() as s:
            c = s.collector(limit=10)
            tlc = TimeLimitCollector(c, timelimit=5)
            try:
                s.search_with_collector(query, tlc)
            except TimeLimit:
                pass  # partial results are still retrievable below
            try:
                results = tlc.results()
            except TimeLimit:
                print("Time limit reached!")
                return -1

            print(results[0]['id'], self.raw[results[0]['id']][:50])
            similarity = fuzz.ratio(self.raw[results[0]['id']],
                                    raw_query.lower())
            if similarity > threshold:
                closest_question = results[0]['id']
                print("Old!", closest_question, similarity)
            else:
                print("NEW!  %f" % similarity)
        print("-------------")
        return closest_question
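The duplicate-detection gate at the end is plain fuzzywuzzy, independent of Whoosh. A minimal sketch of that decision, using the same default threshold as find_closest():

from fuzzywuzzy import fuzz

similarity = fuzz.ratio("stored question text", "incoming question text")
is_duplicate = similarity > 50  # above the threshold means "old" question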
Example #10
	def searchIndex(self, sq):
		parsed_query = MultifieldParser(["query", "target"], schema=self.schema).parse(str(sq))
		with self.ix.searcher() as s:
			collector = s.collector(limit=None)
			timed_collector = TimeLimitCollector(collector, timelimit=30.0)
			
			try:
				s.search_with_collector(parsed_query, timed_collector)  # returns None; results live on the collector
			except TimeLimit:
				print('Search time limit of 30 seconds exceeded.')

			hits = timed_collector.results()
			
			# Convert result structure into a jsonable list
			# TODO: improve this structure
			matches = []
			for i in hits:
				matches.append({"sourcelang": i["query"],
								"targetlang": i["target"],
								"distance": (1.0/i.score)})
			return matches
Example #11
#!/usr/local/bin/python
#-*- encoding:utf-8 -*- 
 
from whoosh.index import open_dir  
from whoosh.fields import *  
from whoosh import qparser
from chinesetokenizer import ChineseAnalyzer
#from whoosh.analysis import RegexAnalyzer  
#analyzer = RegexAnalyzer(ur"([\u4e00-\u9fa5])|(\w+(\.?\w+)*)")
from whoosh.collectors import TimeLimitCollector, TimeLimit
analyzer = ChineseAnalyzer()

ix = open_dir('IndexDir/titleIndex')


with ix.searcher() as searcher:
    qp = qparser.QueryParser("content", ix.schema, group=qparser.syntax.OrGroup)
    # terms=True records which terms matched, so matched_terms() works below
    c = searcher.collector(limit=10, terms=True)
    tlc = TimeLimitCollector(c, timelimit=15)
    q = qp.parse(u'五子棋GOMOKU')
    for pair in q.all_terms():
        print(pair)
    # search_with_collector() returns None; read the hits off the collector
    searcher.search_with_collector(q, tlc)
    results = tlc.results()
    print('Here')
    if results.has_matched_terms():
        print('YYY', results.matched_terms())
    if 0 != len(results):
        for hit in results:
            print('xxx')
            print(hit['content'])
Example #12
def cal_sim(train_data_path,
            test_data_path,
            dst_result_path=None,
            save_n_best_search=1):
    schema = Schema(context=TEXT(stored=True),
                    response=STORED,
                    post=TEXT(stored=True))
    index_i = re.findall(r'\d', train_data_path)[0]

    index_path = "../tmp/ix_index/" + index_i
    if not os.path.exists(index_path):
        os.makedirs(index_path)

    ix = create_in(index_path, schema)
    writer = ix.writer()

    def get_cpr(line):
        lines = line.lower().strip().split('\t')
        context = ''
        post = lines[0]
        response = lines[1]
        return context.strip(), response, post  # already str in Python 3; no decode needed

    def load_train_data(file_name, writer):
        f = open(file_name)
        for line in f:
            context, response, post = get_cpr(line)
            if context != '':
                writer.add_document(context=context,
                                    response=response,
                                    post=post)
            else:
                writer.add_document(response=response, post=post)
        writer.commit()

    def get_query(line, ix):
        lines = line.strip().split('\t')
        post = lines[0]  # already str in Python 3
        q2 = QueryParser("post", ix.schema).parse(post)
        terms = list(q2.all_terms())
        query = Or([Term(*x) for x in terms])
        return query

    load_train_data(train_data_path, writer)

    f = open(test_data_path, 'r')
    fw_search = open(dst_result_path, 'w')
    with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
        c = searcher.collector(limit=10)
        tlc = TimeLimitCollector(c, timelimit=10.0)
        for line in f:
            try:
                query = get_query(line, ix)
                searcher.search_with_collector(query, tlc)
                results = tlc.results()
                for i in range(min(len(results), save_n_best_search)):
                    fw_search.write(line.strip() + '\t' +
                                    str(results[i]["post"]) + '\t' +
                                    str(results[i]["response"]) + '\n')
            except TimeLimit:
                print('TimeLimit, ignore it!')
                print(line)
    fw_search.close()
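A hypothetical invocation of cal_sim; all of the paths here are assumptions for illustration (the training path must contain a digit, since it is used to name the index directory):

cal_sim('../data/train_data_1.txt',
        '../data/test_data_1.txt',
        dst_result_path='../tmp/search_result_1.txt',
        save_n_best_search=3)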
Example #13
        query[i] = QueryParser("content",
                               ix.schema).parse(sentenceToBeParsed[i])

    # Top 'n' documents as result
    #topN = 2
    overlaps = [set() for _ in range(indexStore)]
    overlapCount = 0
    with ix.searcher() as searcher:
        # Get a collector object

        print("Finished loading searcher")
        for i, k in zip(range(0, len(fullSentence)),
                        tqdm(range(len(overlaps)))):
            c = searcher.collector(limit=50, terms=True)
            # Wrap it in a TimeLimitCollector and set the time limit to 120 seconds
            tlc = TimeLimitCollector(c, timelimit=120.0)

            # Try searching

            try:
                searcher.search_with_collector(query[i], tlc)
            except TimeLimit:
                print("Search took too long, aborting!")
            results = tlc.results()

            #results = searcher.search(query, terms=True,limit=10)

            #results= searcher.search(query,limit=10)
            if results.scored_length() > 0:
                overlapCount += 1
                for j in range(0, results.scored_length()):
Example #14
def home(request):
    title = "Search text"
    form = SearchForm(request.POST or None)

    context = {"title": title, "form": form}

    if form.is_valid():
        instance = form.save(commit=False)
        instance.save()

        message = "You will get search results for: %s via %s soon" % (
            instance.searching_text, instance.email)
        context = {
            "title": "Thank you",
            "message": message,
        }

        with ix.searcher() as searcher:
            query = QueryParser("text",
                                ix.schema).parse(instance.searching_text)
            # Get a collector object
            c = searcher.collector(limit=None)
            # Wrap it in a TimeLimitCollector and set the time limit from the form
            tlc = TimeLimitCollector(c, timelimit=instance.t_limit)
            # Try searching
            try:
                searcher.search_with_collector(query, tlc)
            except TimeLimit:
                pass
            # You can still get partial results from the collector
            results = tlc.results()
            lst = []
            for i in range(0, len(results)):
                st = ''
                st += 'Book: '
                st += results[i]["book"]
                st += ', chapter: '
                st += results[i]["chapter"]
                st += ', page: '
                st += str(results[i]["page"])
                lst.append(st)

        # with ix.searcher() as searcher:
        #     query = QueryParser("text", ix.schema).parse(instance.searching_text)
        #     results = searcher.search(query)
        #     lst = []
        #     for i in range(0, len(results)):
        #         st = ''
        #         st += 'Book: '
        #         st += results[i]["book"]
        #         st += ', chapter: '
        #         st += results[i]["chapter"]
        #         st += ', page: '
        #         st += str(results[i]["page"])
        #         lst.append(st)

        logging.basicConfig(
            format=u'%(levelname)-8s [%(asctime)s] %(message)s',
            level=logging.DEBUG,
            filename=u'mylog.log')
        time_diff = datetime.datetime.now(timezone.utc) - instance.timestamp
        logging.info(time_diff.total_seconds())

        subject = 'Search results for: ' + form.cleaned_data.get(
            'searching_text')
        message = 'Search results for: ' + form.cleaned_data.get(
            'searching_text') + '\n'
        for i in range(0, len(lst)):
            message += str(i + 1) + ') '
            message += lst[i]
            message += '\n'
        from_email = settings.EMAIL_HOST_USER
        to_email = form.cleaned_data.get('email')
        send_mail(subject, message, from_email, [to_email], fail_silently=True)
    return render(request, "home.html", context)