Example #1
def test():
    searcher = Searcher()
    print 'hi'
    hits = searcher.search('java', 5026)
    print "%s total matching documents." % hits.length()
    for hit in hits:
        doc = Hit.cast_(hit).getDocument()
        print 'title:', doc.get("author"), 'name:', doc.get("link")
        print Hit.cast_(hit).getScore()
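Example #1 uses the legacy PyLucene 2.x Hits API: searcher.search() returns a Hits object and each element has to be narrowed with Hit.cast_() before getDocument() is available (the snippet also assumes a project-specific Searcher wrapper and Hit from the lucene module are already imported). For comparison, a minimal sketch of the same loop against the newer TopDocs-style API (Lucene/PyLucene 3.x, where Hits and Hit.cast_ were removed), assuming searcher is a plain IndexSearcher and query an already parsed Query:

# Sketch only, not part of the original example: TopDocs-style retrieval.
top_docs = searcher.search(query, 50)  # at most 50 results
print "%s total matching documents." % top_docs.totalHits
for score_doc in top_docs.scoreDocs:
    doc = searcher.doc(score_doc.doc)  # fetch the stored Document by internal id
    print 'title:', doc.get("author"), 'name:', doc.get("link")
    print score_doc.score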
Example #2
def b():
    from dot.searcher import Searcher, STORE_DIR
    from apps.wantown import dao
    from apps.wantown.models import Entry
    queries = ['sms','algorithm','java','google','mac','apple','淘宝','阿里巴巴','云计算','python','java google']
    searcher = Searcher()
    import datetime
    #fsock = open(str(datetime.datetime.now()),'w')
    for query in queries[:5]:
        hits = searcher.search(query)
        docs = []
        for hit in hits:
            doc = Hit.cast_(hit).getDocument()
            docs.append(doc)
        from dot.matrixmapper import MatrixMapper
        STOP_WORDS = [u'a', u'an', u'and', u'are', u'as', u'at', u'be', u'but', u'by', u'for', u'if', u'in', u'into', 
                  u'is', u'it', u'no', u'not', u'of', u'on', u'or', u'such', u'that', u'the', u'their', u'then',
                  u'there', u'these', u'they', u'this', u'to', u'was', u'will', u'with',
                  # added by myself
                  # should '的' be treated as a stop word?
                  u'i',u'been',u'about',u'的',u'么',u'是',u'个',u'不',u'们',u'这',u'那',u'我',u'你',u'很',u'了',u'以',u'与',u'为',u'一']
        mapper = MatrixMapper(STOP_WORDS)
        print 'docs:',len(docs)
        label = mapper.build(docs[0:20])
        """
Example #3
def search(r, keyword=""):
    import logging

    logger = logging.getLogger("search")
    bench = Benchmark(logger)
    from lucene import IndexSearcher, StandardAnalyzer, FSDirectory, QueryParser, File, Hit
    import lucene, os

    os.environ["JAVA_HOME"] = "/usr/local/jdk1.6.0_17"
    lucene.initVM(lucene.CLASSPATH)

    directory = FSDirectory.open(File(CONFIG.INDEX_PATH))
    ROBOT_INDEX = IndexSearcher(directory, True)
    ROBOT_ANALYZER = StandardAnalyzer()

    keyword = keyword or r.GET["keyword"]
    query = QueryParser("context", ROBOT_ANALYZER)
    query = query.parse('"%s"' % keyword)

    bench.start_mark("search")
    hits = ROBOT_INDEX.search(query)
    count = len(hits)
    result = []
    i = 0
    for hit in hits:
        i += 1
        if i > 100:
            break
        doc = Hit.cast_(hit).getDocument()
        result.append(SearchResult(doc, i, keyword))
    ROBOT_INDEX.close()

    et = bench.stop_mark()

    return render_to_response("robot_search_result.html", {"result": result, "count": count, "elaspe": et})
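Example #3 calls lucene.initVM() inside the request handler itself; the JVM can only be started once per process, so in a long-running web application the usual pattern is to initialize it once at import time and merely attach each worker thread before using Lucene classes. A minimal sketch under that assumption (same PyLucene 2.x API and CONFIG.INDEX_PATH as in the example):

import lucene
from lucene import IndexSearcher, FSDirectory, File

lucene.initVM(lucene.CLASSPATH)  # once, when the module is imported

def get_searcher(index_path):
    # Attach the current worker thread to the already-running JVM.
    lucene.getVMEnv().attachCurrentThread()
    return IndexSearcher(FSDirectory.open(File(index_path)), True)  # read-only searcher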
    def purgeDocuments(self,
                       txn,
                       counter,
                       indexSearcher,
                       indexReader,
                       uItem,
                       toVersion=None):

        term = Term("item", uItem.str64())

        if toVersion is None:
            counter.documentCount += indexReader.deleteDocuments(term)

        else:
            x, keep = self.store._items.findValues(None, toVersion, uItem,
                                                   None, True)
            keep = set(keep)

            for hit in indexSearcher.search(TermQuery(term)):
                hit = Hit.cast_(hit)

                doc = hit.getDocument()
                ver = long(doc['version'])
                if ver <= toVersion and UUID(doc['value']) not in keep:
                    indexReader.deleteDocument(hit.getId())
                    counter.documentCount += 1
    def undoDocuments(self, indexSearcher, indexReader, uItem, version):

        term = Term("item", uItem.str64())

        for hit in indexSearcher.search(TermQuery(term)):
            hit = Hit.cast_(hit)
            if long(hit.getDocument()['version']) == version:
                indexReader.deleteDocument(hit.getId())
Example #7
def a():
    import os
    from lucene import CJKAnalyzer, Hit, IndexSearcher, QueryParser, Sort, SortField
    dire = os.path.dirname(__file__) + '/index'

    analyzer = CJKAnalyzer()
    searcher = IndexSearcher(dire)
    query = QueryParser('summary', analyzer).parse('java')  # TermQuery(Term("type", "restaurant"))
    # CategoryComparatorSource is a project-specific comparator, not part of lucene
    sort = Sort(SortField("locatisdon", CategoryComparatorSource('java')))

    hits = searcher.search(query, sort)
    print len(hits)
    i = 0
    for hit in hits:
        i += 1
        if i == 10:
            break
        doc = Hit.cast_(hit).getDocument()
        print 'title:', doc.get("author"), 'name:', doc.get("link")
        print Hit.cast_(hit).getScore()
    searcher.close()
Example #8
def transform(hit):
    hit = Hit.cast_(hit).getDocument()
    contents = hit.get("contents").strip()
    contents = contents.replace("\n", "<br />")
    r1 = r"(\b(http|https)://([-A-Za-z0-9+&@#/%?=~_()|!:,.;]*[-A-Za-z0-9+&@#/%=~_()|]))"
    r2 = r"((^|\b)www\.([-A-Za-z0-9+&@#/%?=~_()|!:,.;]*[-A-Za-z0-9+&@#/%=~_()|]))"
    contents = re.sub(r2,r'<a rel="nofollow" target="_blank" href="http://\1">\1</a>',re.sub(r1,r'<a rel="nofollow" target="_blank" href="\1">\1</a>',contents))

    title = hit.get("contents")[:100] + "..."
    path = hit.get("path")
    unique = md5(hit.get("name")).hexdigest()
    return {"contents":contents, "title":title, "path":path, 'unique':unique }
def run(searcher, parser):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query: ")
        if command == '':
            return

        print "Searching for:", command
        query = parser.parse(command)
        hits = searcher.search(query, Sort("population", True))
        print "%s total matching documents." % hits.length()

        for hit in hits:
            doc = Hit.cast_(hit).getDocument()
            print 'name:', doc.get("name"), ' state:', doc.get("state")
Example #10
def search(q):
    initLucene()
    fsDir = FSDirectory.getDirectory(INDEX_PATH, False)
    searcher = IndexSearcher(fsDir)
    query = QueryParser("contents", StandardAnalyzer()).parse(q)
    start = time()
    hits = searcher.search(query)
    duration = timedelta(seconds=time() - start)
    matchpages = []
    for hit in hits:
        doc = Hit.cast_(hit).getDocument()
        matchpages.append({"title": doc["title"], "url": doc["url"]})
    result = {"no_of_hits": hits.length(), "duration": duration, "query": q, "result": matchpages}
    return result
Example #11
    def search(cls, indexDir, q):

        fsDir = FSDirectory.getDirectory(indexDir, False)
        searcher = IndexSearcher(fsDir)

        query = QueryParser("contents", StandardAnalyzer()).parse(q)
        start = time()
        hits = searcher.search(query)
        duration = timedelta(seconds=time() - start)

        # result = {"no_of_hits":hits.length(),"duration":duration, "query":q,}
        # return
        print "Found %d document(s) (in %s) that matched query '%s':" % (hits.length(), duration, q)

        for hit in hits:
            doc = Hit.cast_(hit).getDocument()
            print doc
Example #12
def find(terms):
    """Use the Lucene index to find monsters"""
    terms = ' '.join(terms)
    searcher = IndexSearcher(STORE)

    SHOULD = BooleanClause.Occur.SHOULD

    query = MultiFieldQueryParser.parse(terms, 
            ['name_', 'full_text'], [SHOULD, SHOULD], StandardAnalyzer())
    hits = searcher.search(query)

    ret = []
    for i, hit in enumerate(hits):
        doc = Hit.cast_(hit).getDocument()
        ret.append(MyHit(doc, hits, i))
        if i == 10:
            break

    return ret
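find() ORs the joined terms across the name_ and full_text fields through MultiFieldQueryParser and stops after the first eleven hits (indices 0 through 10). A hypothetical call, with made-up terms and assuming STORE and MyHit are defined as in the original project:

# Hypothetical usage sketch; prints the wrapped hits.
monsters = find(['fire', 'dragon'])
for m in monsters:
    print m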
Example #13
    def purgeDocuments(self, txn, counter, indexSearcher, indexReader,
                       uItem, toVersion=None):

        term = Term("item", uItem.str64())

        if toVersion is None:
            counter.documentCount += indexReader.deleteDocuments(term)

        else:
            x, keep = self.store._items.findValues(None, toVersion,
                                                   uItem, None, True)
            keep = set(keep)

            for hit in indexSearcher.search(TermQuery(term)):
                hit = Hit.cast_(hit)

                doc = hit.getDocument()
                ver = long(doc['version'])
                if ver <= toVersion and UUID(doc['value']) not in keep:
                    indexReader.deleteDocument(hit.getId())
                    counter.documentCount += 1
Example #14
def a():
    import os
    #loader = BSDDictLoader()
    #dic = loader.load()
    words_dict = {}
    from dot.searcher import Searcher, STORE_DIR
    from apps.wantown import dao
    from apps.wantown.models import Entry
    searcher = Searcher()
    hits = searcher.search("java")
    docs = []
    for hit in hits:
        doc = Hit.cast_(hit).getDocument()
        docs.append(doc)
    entries = []
    all = ''
    

    from dot.context import Context, Token
    context = Context()
    import re
    #all = re.sub('[0-9:;;/\(\)\t\[\]]()\**#&','',all)
    #all = re.sub('[ +=-]',' ',all)

    analyzer = StandardAnalyzer()
    # doc id
    id = 0
    allToken = []
    allText = []
    pureText = ''
    c = 0
    docRange = {}
    for doc in docs[0:100]:
        link = doc.get("link")
        entry = dao.get_by_link(link, Entry)
        entries.append(entry.summary)
        all = entry.summary[:200] + entry.title
        pureText += all
        tokenType = []
        last_type = ''
        #all = """提起电吉他演奏,就必须提到布鲁斯音乐;提起最伟大的吉他演奏大师,人们首先会想到的是 Jimi Hendrix,但是说起依然在世的最伟大的吉他演奏家,名字只有一个——Eric Clapton爵士。自从上个世纪60年代布鲁斯摇滚乐以及布鲁斯吉他演奏成为了主流摇滚风格之后,在这种来源于黑人音乐的吉他演奏中,在所有除黑色外其他肤色的布鲁斯吉他演奏家之中,传奇人物Eric Clapton毫无疑问是其中最杰出的一位。在与Eric Clapton同时代的所有艺术家纷纷的离开人世,或者失去了原有的歌迷号召力之后,Eric Clapton是所有当年这些艺术家中为数不多的既然保持着自己高超的演奏技术以及强大的市场号召力的艺术家。
#  
#Eric Clapton为人谦逊,在与其他出色的吉他演奏者比如Jimi Hendrix,B.B. King,Duane Allman,甚至后辈Stevie Ray Vaughan相比较的时候他总是非常谦恭,在与B.B. King以及Bob Dylan等人同台的时候他总是举止非常礼让,他是最有绅士风度的流行音乐家之一。同时,作为世界上最著名的吉他大师,Eric Clapton还经常热心的帮助包括英国著名流行音乐家Sting,Bon Jovi乐队主音吉他手Richie Sambora在内的其他一些音乐家去录制专辑或者拍摄音乐录影带,并且经常为一些音乐家担任吉他手作伴奏。Eric Clapton曾经协助过Bob Dylan,Aretha Franklin,Joe Cocker,Ringo Starr,Freddie King,Roger Waters等等近百位艺术家的专辑录制。   
#"""
        stream = analyzer.tokenStream("fieldname", StringReader(all))    
        for s in stream:
            
            #if (last_type == '<ALPHANUM>' or last_type == '<HOST>') and (s.type() == '<ALPHANUM>' or s.type() == '<HOST>'):
                #all.append(' ')
                #pass
            #last_type = s.type()
            token = Token()
            token.text = s.termText()
            token.offset = s.termLength()
            token.doc = id
            allToken.append(token)
            allText.append(s.term())
            print dir(s)
            c += 1
        docRange[len(allText)] = id
        #all = sorted(all,cmp=lambda x,y:cmp(x.termText(),y.termText()))
        id += 1
    context.tokens = allText
    
    #context.tokens.sort()
    #for i in context.tokens:
        #print i
    
    #print s
    
    context.text = ''
    context.token_types = tokenType
    context.docs = entries
    context.term_doc_range = docRange
    print len(all) 
    from dot.lingo import pextractor
    import time
    start = time.time()
    #pe = pextractor.PhraseExtractor()
    #results = pe.extract(context)
    count = 0
    r = docRange.keys()
    r.sort()
    if 0:
        for i in results:
            if len(i.text) > 1 and i.freq > 2 and len(i.text) < 20:
                id = i.id - 1
                lcp = context.lcp[id + 1]
                for f in range(i.freq):
                    begin = context.suffix[id]  
                    end = context.suffix[id] + lcp
    
                    for j in range(len(r)):
                        if begin < r[j]:
                            break
                    doc = docRange[r[j]]
                    #print context.tokens[begin:end],i.freq,begin,doc
                    if end > r[j]:
                        print 'not in the same doc'
                    id += 1
                #print  i.text.strip(), i.freq,i.doc_freq
    #print (time.time() - start)
    from dot.matrixmapper import MatrixMapper
    mapper = MatrixMapper()
    mapper.build(docs[:100])
   
    #print pureText
    import sys
    from dot.lingo import suffixsorter as ss
    #for i in range(len(context.suffix)):
     #   s = pe.list2str(context.tokens)
      #  sys.stdout.write('%d\t%d\t%s\n' % (context.suffix[i], context.lcp[i], context.tokens[context.suffix[i]:context.suffix[i] + 10]))
    #dm = getDictManager()
    #words_dict= featurex.tf_idf(entries, dm.seg_dict)
    #doc1 = featurex.Document(entries.encode('utf-8'),dm)
    #doc2 = featurex.Document(entries[0].encode('utf-8'), dm)
    #for i in words_dict.values():
        #print i.word,i.frequency,i.feature_value,i.tfidf
    #print similitude_doc_cos(doc1, doc2)
    """
ibm jdk 3 {3: 3}
不同 3 {4: 2, 7: 1}
使用 3 {8: 2, 7: 1}
可以 10 {8: 3, 3: 2, 4: 2, 7: 3}
处理 3 {8: 3}
好的 3 {8: 1, 7: 2}
字体 5 {8: 2, 4: 3}
已经 4 {9: 1, 3: 1, 4: 1, 7: 1}
平滑 4 {8: 1, 4: 3}
应用 3 {8: 1, 4: 2}
手机上 3 {7: 3}
文本 3 {8: 3}
游戏 4 {7: 4}
环境 3 {1: 1, 3: 2}
的java 6 {1: 1, 2: 1, 5: 1, 7: 3}
的文 3 {8: 3}
设置 5 {4: 5}
软件 3 {1: 1, 7: 2}
运行 3 {1: 1, 7: 2}

"""
    
    
Example #15
File: dao.py Project: fay/wt
def query(query, page,category_what,data_size=200,nobuildcategory=False):
    category_id = None
    if category_what:
        category_ = dao.Category.objects.filter(what=category_what)[0]
        category_id = category_.id
    
    
    hits = searcher.search(query,category_id)
    
    doc_ids = []
    for i in range(len(hits)):
        doc_ids.append(hits.id(i))
    # Replace spaces in the query with '+' here, otherwise an error is raised; correspondingly, catfilter must also replace spaces with '+' when reading the value back from the cache
    cache.add(query.replace(' ','+'),doc_ids,3600)
    # related categories, not used for now
    #cats = dao.get_keywords(query)
    results = []
    scores = []
    #last page number
    total = hits.length()
    pages_num = total / PAGE_SIZE + (total % PAGE_SIZE and 1) or 0
    if ((page - 1) * PAGE_SIZE) > total :
        page = pages_num
    docs = []
    for i in range(PAGE_SIZE):
        start = (page - 1) * PAGE_SIZE
        if start + i >= total:
            break

        doc = hits.doc(i + start)
        docs.append(doc)
        link = doc.get("link")
        entry = dao.get_by_link(link, Entry)
        if entry:
            entry.summary = entry.summary[0:data_size] + "..."
            results.append(entry)
            scores.append(hits.score(i + (page - 1) * PAGE_SIZE))
        
    if 0:
        for hit in hits:
            doc = Hit.cast_(hit).getDocument()
            link = doc.get("link")
            entry = dao.get_by_link(link, Entry)
            if entry:
                entry.summary = entry.summary[0:200] + "..."
                results.append(entry)
                scores.append(Hit.cast_(hit).getScore())
                
    dispCats = dao.QueryCategoryDisp.objects.filter(query__keyword=query) 
    label = []
    if dispCats:
        for cat in dispCats:
            qec=dao.QueryEntryCategory.objects.filter(query__keyword=query,category=cat.category)
            label.append([cat.weight,cat.category.what,len(qec)])
        label.sort(reverse=True)
    phrases,label_doc = (dispCats and ({},[])) or discover_freq_phrases(docs,query)
    
    #for i in range(len(docs)):
        #raw_cat = results[i].category.what
        #if raw_cat == u'其他' and phrases[i].label_weight:
         #   results[i].category.what = phrases[i].text
            
    return results, scores,total,phrases,dispCats and label[:10] or label_doc[:10]
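The page count in query() relies on Python 2 integer division: pages_num = total / PAGE_SIZE + (total % PAGE_SIZE and 1) or 0. A quick worked check, assuming PAGE_SIZE = 10 for illustration:

# total = 45 -> 45 / 10 = 4, 45 % 10 = 5 (truthy, so + 1) -> 5 pages
# total = 40 -> 40 / 10 = 4, 40 % 10 = 0 (falsy, so + 0)  -> 4 pages
# total = 0  -> 0 + 0 = 0, and the trailing "or 0" keeps pages_num at 0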