Beispiel #1
0
    def query(self, q):
        """Run *q* against both the full-text and the anchor-text fields,
        merge the hits, re-rank them with the stored PageRank scores and
        return a list of ``[title, url, highlight_snippets]`` entries.
        """
        # One parser per field; OperatorsPlugin enables AND/OR/NOT style
        # operators in the raw query string.
        qpcontent = QueryParser("fullText", schema=self.indexer.schema)
        qpanchor = QueryParser("anchorText", schema=self.indexer.schema)
        qpcontent.add_plugin(qparser.OperatorsPlugin())
        qpanchor.add_plugin(qparser.OperatorsPlugin())

        qcontent = qpcontent.parse(q)
        qanchor = qpanchor.parse(q)

        resWeb = []
        with self.indexer.searcher() as s:
            resContent = s.search(qcontent, limit=40)
            resAnchor = s.search(qanchor, limit=40)
            # Anchor-text hits take precedence; upgrade_and_extend keeps
            # documents found in both result sets at their better score.
            resFinal = resAnchor
            resFinal.upgrade_and_extend(resContent)

            # Single pass over the merged hits (the original iterated
            # resFinal twice and built two maps it never used).
            respgMap = {}
            resURLMap = {}
            for r in resFinal:
                url = r['pageURL']
                resURLMap[url] = r
                # PageRank score, looked up via the URL -> file-id map.
                respgMap[url] = self.pgrank[self.allFilesMap[url]]

            supportedRes = list(respgMap.items())

            # "VSMSP" consolidation: sort each window of 10 results by
            # PageRank while keeping the windows in relevance order.
            ii = 0
            while ii + 10 < len(supportedRes):
                supportedRes[ii:ii + 10] = sorted(supportedRes[ii:ii + 10],
                                                  key=operator.itemgetter(1))
                ii += 10
            supportedRes[ii:] = sorted(supportedRes[ii:],
                                       key=operator.itemgetter(1))

            for url, _rank in supportedRes:
                hit = resURLMap[url]
                # Up to 3 anchor-text snippets followed by 2 body snippets,
                # stripped of highlight markup.
                hts = self.__cleanhtml(hit.highlights("anchorText", top=3))
                hts += self.__cleanhtml(hit.highlights("fullText", top=2))
                resWeb.append([hit["title"], url, hts])

        return resWeb
Beispiel #2
0
    def search(self, q, stemmed, syn):
        """Search the on-disk index for *q*.

        q       -- raw query string
        stemmed -- if truthy, also merge German/French stemmed results
        syn     -- if truthy, expand each query word with German synonyms

        Returns a Whoosh Results object.  NOTE(review): the searcher is
        deliberately left open (as in the original) because the returned
        Results object is lazy; callers must consume it promptly.
        """
        path = os.path.dirname(__file__)
        ix = open_dir(path + "/index")
        parser = MultifieldParser(["title", "content"], ix.schema)
        searcher = ix.searcher()
        # Custom operator tokens: & = AND, \| = OR, ! = ANDNOT.
        cp = qparser.OperatorsPlugin(And="&", Or="\|", AndNot="!")
        parser.replace_plugin(cp)

        if syn:
            # BUG FIX: the original computed `qNew = q.replace(w, synQ)`
            # from the UNMODIFIED q on every iteration, so only the last
            # word's synonym expansion survived (and qNew was unbound for
            # an empty query).  Expand every token instead.
            expanded = []
            for w in q.split(" "):
                synQ = "(" + w
                for s in self.getGermanSynonyms(w):
                    synQ += "\|" + s
                synQ += ")"
                expanded.append(synQ)
            q = " ".join(expanded)

        searchQuery = parser.parse(q)
        allresults = searcher.search(searchQuery, limit=None)

        if stemmed:
            # Merge stemmed variants; upgrade_and_extend keeps the better
            # score for documents present in both result sets.
            allresults.upgrade_and_extend(self.searchGermanStemmed(q, ix))
            allresults.upgrade_and_extend(self.searchFrenchStemmed(q, ix))

        print(searchQuery)

        return allresults
Beispiel #3
0
def full_textsearch(key, top, surround):
    """Full-text search over the "engine2" index.

    key      -- query string (all whitespace is stripped before parsing)
    top      -- number of highlight fragments per hit
    surround -- characters of context kept around each match

    Returns a list of ``{'context', 'title', 'writer'}`` dicts.
    """
    # NOTE(review): hard-coded absolute path -- only works on the author's
    # machine; the later variant of this function reads it from app config.
    ix = open_dir("/Users/silky/Documents/GitHub/full_text_search",
                  "engine2")  # directory of the generated index files
    with ix.searcher() as searcher:
        parser = qparser.QueryParser("content", ix.schema)

        # Custom operator tokens: & = AND, \| = OR, ~ = NOT.
        op = qparser.OperatorsPlugin(And="&", Or="\\|", Not="~")
        parser.replace_plugin(op)
        start = time.time()
        # Remove all whitespace from the query before parsing.
        words = "".join(key.split())
        query = parser.parse(words)
        results = searcher.search(query, limit=50)
        results.fragmenter.maxchars = 150
        results.fragmenter.surround = surround
        results.order = highlight.SCORE
        pprint(results)
        search_result = []
        for fragment in results:
            # `entry` instead of shadowing the builtin `dict`.
            entry = {}

            # Wrap highlighted matches in <proto> tags, then post-process
            # with the module-level compiled regexes.
            tem = final.sub(
                '',
                after.sub(
                    '</proto>',
                    before.sub('<proto>',
                               fragment.highlights('content', top=top)),
                ))
            # Collapse doubled match markup.  pattern2.search() is now
            # evaluated once per iteration (the original called it three
            # times); the slicing arithmetic is kept identical.
            while True:
                m = pattern2.search(tem)
                if m is None:
                    break
                position = m.start()
                lenth = len(m.group())
                tem = tem[0:position] + \
                    '<b class="match term0">' + tem[position + lenth:position + lenth * 2 - len('<proto> <b class="match term0"></b> </proto>')] + \
                    '</b>' + tem[position + lenth * 2 - len('<proto> <b class="match term0"></b> </proto>'):]
            tem = pattern1.sub('', tem)
            tem = pattern3.sub('', tem)
            tem = tem.replace(' ', '')
            # Restore the space that the blanket replace above destroyed.
            tem = tem.replace('<bclass="matchterm0">',
                              '<b class="matchterm0">')
            tem = zhushi.sub('', tem)

            entry['context'] = tem

            print(entry['context'])
            entry['title'] = fragment['title']
            entry['writer'] = fragment['writer']
            search_result.append(entry)

            print('#' * 30, '\n')
        print("計%d記事" % len(results))
        print(str((time.time() - start) * 10000 // 10) + "ms")  # timing
        return search_result
        def searchFrenchStemmed(self, q, ix):
            """Search *ix* with a French-stemmed version of query *q*.

            Every whitespace-separated token is stemmed with the Snowball
            French stemmer, then parsed against both the plain and the
            stemmed title/content fields.  Returns a lazy Whoosh Results
            object (the searcher is intentionally left open).
            """
            stemmer = snowballstemmer.stemmer('french')

            parser = MultifieldParser(
                ["title", "titleStemmed", "contentStemmed", "content"],
                ix.schema)
            # join() instead of the original quadratic `q += w + " "` loop.
            # NOTE(review): `unicode` is Python-2 only; kept for parity
            # with the rest of this module.
            q = " ".join(stemmer.stemWords(unicode(q).split()))

            searcher = ix.searcher()
            # Custom operator tokens: & = AND, \| = OR, ! = ANDNOT.
            cp = qparser.OperatorsPlugin(And="&", Or="\|", AndNot="!")
            parser.replace_plugin(cp)
            searchQuery = parser.parse(q)
            print(searchQuery)

            return searcher.search(searchQuery, limit=None)
def full_textsearch(key, top, surround):
    """Full-text search over the "engine2" index (config-driven variant).

    NOTE(review): this redefines the earlier ``full_textsearch`` in this
    module; only this later definition is reachable at runtime.

    key      -- query string (all whitespace is stripped before parsing)
    top      -- number of highlight fragments per hit
    surround -- characters of context kept around each match

    Returns a list of ``{'context', 'title', 'writer'}`` dicts.
    """
    ix = open_dir(current_app.config['engine'],
                  "engine2")  # index directory taken from the app config
    with ix.searcher() as searcher:
        parser = qparser.QueryParser("content", ix.schema)

        # Custom operator tokens: & = AND, \| = OR, ~ = NOT.
        op = qparser.OperatorsPlugin(And="&", Or="\\|", Not="~")
        parser.replace_plugin(op)
        start = time.time()
        # Remove all whitespace from the query before parsing.
        words = "".join(key.split())
        query = parser.parse(words)
        results = searcher.search(query, limit=50)
        results.fragmenter.maxchars = 50
        results.fragmenter.surround = surround
        results.order = highlight.SCORE
        pprint(results)
        search_result = []
        for fragment in results:
            # `entry` instead of shadowing the builtin `dict`.
            entry = {}

            # Wrap highlighted matches in <a> tags and strip leftovers
            # with the module-level compiled regexes.
            entry['context'] = final.sub(
                '',
                after.sub(
                    '</a>',
                    before.sub('<a>', fragment.highlights('content', top=top)),
                ))
            entry['title'] = fragment['title']
            entry['writer'] = fragment['writer']
            search_result.append(entry)

            print('#' * 30, '\n')
        print("計%d記事" % len(results))
        print(str((time.time() - start) * 10000 // 10) + "ms")  # timing
        return search_result
Beispiel #6
0
        # Run every test criterion of every record as a multi-field query
        # against the Whoosh index and log the hit counts.
        # NOTE(review): Python-2 fragment; `file` shadows the builtin --
        # presumably an open log handle created by the (unseen) enclosing
        # code, as are `totalRecordsObj`, `ix` and `topN`.  The `if` body
        # continues past this chunk.
        for criteriaObj in totalRecordsObj:
            print criteriaObj['sub_category']
            file.write(str(criteriaObj['sub_category']) + "\n")
            # test_criteria is a comma-separated list, e.g. "a, b, c".
            testCriteriaArray = criteriaObj['test_criteria'].strip().split(
                ', ')
            for testCriteria in testCriteriaArray:
                print "\t", testCriteria
                file.write("\t" + str(testCriteria) + "\n")
                #, "email_addr", "title", "email_from", "email_to", "id", "email_addr_full"
                # Query across the content and all e-mail related fields.
                query = qparser.MultifieldParser([
                    "content", "email_addr", "title", "email_from", "email_to",
                    "id", "email_addr_full"
                ], ix.schema)
                # Custom operator tokens: & = AND, \| = OR, \~ = NOT,
                # &! = ANDNOT, &~ = ANDMAYBE.
                op = qparser.OperatorsPlugin(And="&",
                                             Or="\|",
                                             Not="\~",
                                             AndNot="&!",
                                             AndMaybe="&~")
                query.replace_plugin(op)

                # Raw term-frequency scoring instead of the default BM25.
                with ix.searcher(weighting=scoring.Frequency) as searcher:
                    qr = query.parse(unicode(testCriteria))
                    results = searcher.search(qr, limit=topN, terms=True)
                    if (len(results)):
                        # Clamp topN to the number of hits actually found.
                        # NOTE(review): this mutates topN for all later
                        # iterations too -- confirm that is intended.
                        if (topN > len(results)):
                            topN = len(results)
                        file.write("\tResult found:" + str(len(results)) +
                                   "\n")
                        #print 'Result found:', len(results)
                        #print 'Result found:', results.scored_length()
                        #print 'RRRR:', results.scored_length()