Example #1
    def indexReader(self, indexWriter, reader, uItem, uAttr, uValue, version):

        STORED = Field.Store.YES
        UN_INDEXED = Field.Index.NO
        UN_TOKENIZED = Field.Index.UN_TOKENIZED

        doc = Document()
        doc.add(Field("item", uItem.str64(), STORED, UN_TOKENIZED))
        doc.add(Field("attribute", uAttr.str64(), STORED, UN_TOKENIZED))
        doc.add(Field("value", uValue.str64(), STORED, UN_INDEXED))
        doc.add(Field("version", str(version), STORED, UN_INDEXED))
        reader = StringReader(reader.read())
        doc.add(Field("contents", reader, Field.TermVector.YES))

        indexWriter.addDocument(doc)
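The helper above expects an already-open IndexWriter. A minimal sketch of a compatible setup, assuming PyLucene 3.x with its flat lucene module (the in-memory directory and analyzer choice are illustrative, not from the original):

    import lucene
    lucene.initVM()
    from lucene import RAMDirectory, IndexWriter, StandardAnalyzer, Version

    # Fresh in-memory index; the boolean asks IndexWriter to create it.
    directory = RAMDirectory()
    indexWriter = IndexWriter(directory,
                              StandardAnalyzer(Version.LUCENE_CURRENT),
                              True, IndexWriter.MaxFieldLength.UNLIMITED)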
Example #3
    def main(cls, argv):

        query = TermQuery(Term("f", "ipsum"))
        scorer = QueryScorer(query)
        formatter = SimpleHTMLFormatter("<span class=\"highlight\">",
                                        "</span>")
        highlighter = Highlighter(formatter, scorer)
        fragmenter = SimpleFragmenter(50)
        highlighter.setTextFragmenter(fragmenter)

        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        tokenStream = analyzer.tokenStream("f", StringReader(cls.text))
        result = highlighter.getBestFragments(tokenStream, cls.text, 5, "...")

        stdout.write("<html>")
        stdout.write("<style>\n")
        stdout.write(".highlight {\n")
        stdout.write(" background: yellow\n")
        stdout.write("}\n")
        stdout.write("</style>")

        stdout.write("<body>")
        stdout.write(result)
        stdout.write("</body></html>\n")
        stdout.flush()
Example #4
    def displayTokens(cls, analyzer, text):

        tokenStream = analyzer.tokenStream("contents", StringReader(text))
        term = tokenStream.addAttribute(TermAttribute.class_)

        while tokenStream.incrementToken():
            print "[%s]" % (term.term()),
Example #5
    def testHighlighting(self):

        text = "The quick brown fox jumps over the lazy dog"

        query = TermQuery(Term("field", "fox"))
        scorer = QueryScorer(query)
        highlighter = Highlighter(scorer)

        tokenStream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream(
            "field", StringReader(text))

        self.assertEqual("The quick brown <B>fox</B> jumps over the lazy dog",
                         highlighter.getBestFragment(tokenStream, text))
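The one-argument Highlighter above falls back to SimpleHTMLFormatter, whose default tags are <B>...</B>; that is exactly what the assertion checks. Spelled out explicitly:

    highlighter = Highlighter(SimpleHTMLFormatter(), scorer)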
Example #6
def search_image(command):
    # Treat an empty or whitespace-only query as no results.
    if not command.strip():
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"

    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = command_dict.get(opt,
                                                 '') + ' ' + " ".join(seg_list)

    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            # strip the leading space added while accumulating values
            t = Term(k, '*' + v.strip())
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        # tokenStream takes the field name, not the field's value
        ts = analyzer.tokenStream("contents", StringReader(text))
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)
    searcher.close()
    return Docs
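A hypothetical call, assuming lucene.initVM() was run at startup and an image index exists under index_img:

    for hit in search_image(u'fox site:example.com'):
        print hit['imgurl'], hit['urltitle']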
Example #7
    def assertAnalyzesTo(cls, analyzer, input, outputs):

        stream = analyzer.tokenStream("field", StringReader(input))
        termAttr = stream.addAttribute(TermAttribute.class_)
        for output in outputs:
            if not stream.incrementToken():
                raise AssertionError, 'stream.incrementToken()'
            if output != termAttr.term():
                raise AssertionError, 'output == termAttr.term()'

        if stream.incrementToken():
            raise AssertionError, 'not stream.incrementToken()'

        stream.close()
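A sketch of how this assertion helper might be invoked (the class name AnalyzerTestCase and the tokens are illustrative):

    AnalyzerTestCase.assertAnalyzesTo(
        WhitespaceAnalyzer(Version.LUCENE_CURRENT),
        "The quick brown fox", ["The", "quick", "brown", "fox"])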
Example #8
    def dumpSpans(self, query):

        spans = query.getSpans(self.reader)
        print "%s:" % query
        numSpans = 0

        scoreDocs = self.searcher.search(query, 50).scoreDocs
        scores = [0, 0]  # demo assumes a two-document index (ids 0 and 1)
        for scoreDoc in scoreDocs:
            scores[scoreDoc.doc] = scoreDoc.score

        while spans.next():
            numSpans += 1

            id = spans.doc()
            doc = self.reader.document(id)

            # for simplicity, assume tokens are in sequential
            # positions, starting from 0
            stream = self.analyzer.tokenStream("contents",
                                               StringReader(doc.get("f")))
            term = stream.addAttribute(TermAttribute.class_)

            buffer = StringIO()
            buffer.write("   ")

            i = 0
            while stream.incrementToken():
                if i == spans.start():
                    buffer.write("<")

                buffer.write(term.term())
                if i + 1 == spans.end():
                    buffer.write(">")

                buffer.write(" ")
                i += 1

            buffer.write("(")
            buffer.write(str(scores[id]))
            buffer.write(") ")

            print buffer.getvalue()
            # print self.searcher.explain(query, id)

        if numSpans == 0:
            print "   No spans"

        print ''
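dumpSpans expects a span query; a hypothetical call using the standard SpanTermQuery, with "f" matching the field read above:

    self.dumpSpans(SpanTermQuery(Term("f", "fox")))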
Example #9
    def displayTokensWithPositions(cls, analyzer, text):

        stream = analyzer.tokenStream("contents", StringReader(text))
        term = stream.addAttribute(TermAttribute.class_)
        posIncr = stream.addAttribute(PositionIncrementAttribute.class_)

        position = 0
        while stream.incrementToken():
            increment = posIncr.getPositionIncrement()
            if increment > 0:
                position = position + increment
                print "\n%d:" % (position),

            print "[%s]" % (term.term()),
        print
Example #10
    def indexFile(self, writer, path):

        try:
            file = open(path)
            string = HTMLReader(InputStreamReader(file, 'utf-8')).read()
            file.close()
        except:
            raise
        else:
            doc = Document()
            doc.add(Field("contents", StringReader(string)))
            doc.add(
                Field("filename", os.path.abspath(path), Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)

            return doc
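A hypothetical driver that feeds every .html file under a root directory to the helper above (os.walk is standard library; root_dir is illustrative):

    for root, dirs, files in os.walk(root_dir):
        for name in files:
            if name.endswith('.html'):
                self.indexFile(writer, os.path.join(root, name))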
Example #11
    def testHits(self):

        searcher = IndexSearcher(self.directory, True)
        query = TermQuery(Term("title", "action"))
        scoreDocs = searcher.search(query, 50).scoreDocs

        scorer = QueryScorer(query)
        highlighter = Highlighter(scorer)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            title = doc["title"]
            stream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream(
                "title", StringReader(title))
            fragment = highlighter.getBestFragment(stream, title)

            print fragment
Example #12
    def displayTokensWithFullDetails(cls, analyzer, text):

        stream = analyzer.tokenStream("contents", StringReader(text))

        term = stream.addAttribute(TermAttribute.class_)
        posIncr = stream.addAttribute(PositionIncrementAttribute.class_)
        offset = stream.addAttribute(OffsetAttribute.class_)
        type = stream.addAttribute(TypeAttribute.class_)

        position = 0
        while stream.incrementToken():
            increment = posIncr.getPositionIncrement()
            if increment > 0:
                position = position + increment
                print "\n%d:" % (position),

            print "[%s:%d->%d:%s]" % (term.term(), offset.startOffset(),
                                      offset.endOffset(), type.type()),
        print
Example #13
    def indexFile(self, writer, path):

        doc = Document()

        try:
            process = popen2.Popen4(["antiword", "-m", "UTF-8", path])
            string = InputStreamReader(process.fromchild, 'utf-8').read()
        except:
            raise
        else:
            doc.add(Field("contents", StringReader(string)))
            doc.add(Field("filename", os.path.abspath(path),
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)

            exitCode = process.wait()
            if exitCode != 0:
                raise RuntimeError, "pdftotext exit code %d" %(exitCode)

            return doc
Example #14
    def testJumps(self):

        stream = self.synonymAnalyzer.tokenStream("contents",
                                                  StringReader("jumps"))
        term = stream.addAttribute(TermAttribute.class_)
        posIncr = stream.addAttribute(PositionIncrementAttribute.class_)

        i = 0
        expected = ["jumps", "hops", "leaps"]
        while stream.incrementToken():
            self.assertEqual(expected[i], term.term())
            if i == 0:
                expectedPos = 1
            else:
                expectedPos = 0

            self.assertEqual(expectedPos, posIncr.getPositionIncrement())
            i += 1

        self.assertEqual(3, i)
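The increments asserted above stack "hops" and "leaps" at the same position as "jumps". One consequence, sketched with standard Lucene API (field name illustrative): a phrase query written with a synonym still matches the original text.

    query = PhraseQuery()
    query.add(Term("contents", "fox"))
    query.add(Term("contents", "hops"))  # still matches "... fox jumps ..."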
Example #15
    def indexFile(self, writer, path):

        doc = Document()

        try:
            process = popen2.Popen4(["pdfinfo", "-enc", "UTF-8", path])
        except:
            raise
        else:
            while True:
                line = process.fromchild.readline().strip()
                if not line:
                    break
                name, value = line.split(':', 1)
                doc.add(
                    Field(name.strip(), value.strip(), Field.Store.YES,
                          Field.Index.NOT_ANALYZED))

            exitCode = process.wait()
            if exitCode != 0:
                raise RuntimeError, "pdfinfo exit code %d" % (exitCode)

        try:
            process = popen2.Popen4(["pdftotext", "-enc", "UTF-8", path, "-"])
            string = InputStreamReader(process.fromchild, 'utf-8').read()
        except:
            raise
        else:
            doc.add(Field("contents", StringReader(string)))
            doc.add(
                Field("filename", os.path.abspath(path), Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)

            exitCode = process.wait()
            if exitCode != 0:
                raise RuntimeError, "pdftotext exit code %d" % (exitCode)

            return doc
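popen2 is Python 2-only and long deprecated; an equivalent extraction step with the standard-library subprocess module (applies equally to the antiword example above) might read:

    import subprocess

    process = subprocess.Popen(["pdftotext", "-enc", "UTF-8", path, "-"],
                               stdout=subprocess.PIPE)
    string = process.stdout.read().decode('utf-8')
    if process.wait() != 0:
        raise RuntimeError("pdftotext failed")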
Example #16
def search(request):

    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()

    ret = {}
    maxLength = 38

    search_content = request.GET.get('content')
    # Cap overly long queries at maxLength characters.
    if len(search_content) > maxLength:
        search_content = search_content[:maxLength]

    query = QueryParser(Version.LUCENE_CURRENT, "contentKeyword",
                        analyzer).parse(search_content)
    scoreDocs = searcher.search(query, 50).scoreDocs

    scorer = QueryScorer(query)
    formatter = SimpleHTMLFormatter("<span class=\"highlight\">", "</span>")
    highlighter = Highlighter(formatter, scorer)
    fragmenter = SimpleFragmenter(50)
    highlighter.setTextFragmenter(fragmenter)

    ret['NumOfDocs'] = str(len(scoreDocs)) + " total matching documents."

    print ret['NumOfDocs']

    conn = pymysql.connect(host='localhost',
                           user=user,
                           password=password,
                           db=db_name,
                           charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)

    rst = ''
    ret['search_list'] = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        _id = str(doc.get("id"))
        print _id
        sql = 'select * from webpage where id=%s'

        with conn.cursor() as cursor:
            cursor.execute(sql, (_id,))  # one-element tuple for the parameter
            rst = cursor.fetchone()

        titleStream = ChineseAnalyzer(Version.LUCENE_CURRENT).tokenStream(
            "title", StringReader(rst['title']))
        titleFragment = highlighter.getBestFragment(titleStream, rst['title'])
        if titleFragment is None:
            titleFragment = rst['title']

        contentStream = ChineseAnalyzer(Version.LUCENE_CURRENT).tokenStream(
            "content", StringReader(rst['content']))
        contentFragment = highlighter.getBestFragments(contentStream,
                                                       rst['content'], 5,
                                                       '...')

        ret['search_list'].append({
            'title': titleFragment,
            'url': rst['url'],
            'content': contentFragment
        })
    #searcher.close()
    conn.close()

    return render(request, 'tjut/result.html', {
        'search_list': ret['search_list'],
        'search_content': search_content
    })
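Hypothetical URL wiring for the view above, in the Django 1.x style its render() call suggests (module path illustrative):

    from django.conf.urls import url
    from . import views

    urlpatterns = [url(r'^search/$', views.search)]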
Example #17
    # Get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Construct a query parser.
    queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

    # Create a query
    query = queryParser.parse(QUERY_STRING)

    topDocs = searcher.search(query, 50)

    # Get top hits
    scoreDocs = topDocs.scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    HighlightFormatter = SimpleHTMLFormatter()
    query_score = QueryScorer(query)

    highlighter = Highlighter(HighlightFormatter, query_score)

    # Set the fragment size: break the text into fragments of 64 characters.
    fragmenter = SimpleSpanFragmenter(query_score, 64)
    highlighter.setTextFragmenter(fragmenter)

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        text = doc.get(FIELD_CONTENTS)
        ts = analyzer.tokenStream(FIELD_CONTENTS, StringReader(text))
        print doc.get(FIELD_PATH)
        print highlighter.getBestFragments(ts, text, 3, "...")
    print ""