Example no. 1
    def main(cls, argv):

        query = TermQuery(Term("f", "ipsum"))
        scorer = QueryScorer(query)
        formatter = SimpleHTMLFormatter("<span class=\"highlight\">",
                                        "</span>")
        highlighter = Highlighter(formatter, scorer)
        fragmenter = SimpleFragmenter(50)
        highlighter.setTextFragmenter(fragmenter)

        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        tokenStream = analyzer.tokenStream("f", StringReader(cls.text))
        result = highlighter.getBestFragments(tokenStream, cls.text, 5, "...")

        stdout.write("<html>")
        stdout.write("<style>\n")
        stdout.write(".highlight {\n")
        stdout.write(" background: yellow\n")
        stdout.write("}\n")
        stdout.write("</style>")

        stdout.write("<body>")
        stdout.write(result)
        stdout.write("</body></html>\n")
        stdout.flush()
Example no. 2
    def post(self):
        q = self.get_argument("query")

        lucene.initVM()
        indexDir = "index"
        dir = SimpleFSDirectory(File(indexDir))
        analyzer = StandardAnalyzer(Version.LUCENE_30)
        searcher = IndexSearcher(dir)

        query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
        MAX = 10
        hits = searcher.search(query, MAX)

        print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        items = []

        for hit in hits.scoreDocs:
            print hit.score, hit.doc, hit.toString()
            items.append(doc_urls[str(hit.doc)])  # doc_urls: module-level doc id -> url mapping
            doc = searcher.doc(hit.doc)

        self.render("index.html", title="Results", items=items, query=q)
Example no. 3
def retrieveDocs(q):
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 1000
    hits = searcher.search(query, MAX)
    nonDiverse = []
    docsToScores = {}
    #create a list of html files with relevant websites
    rQ = []
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
        #print(new_urls[str(hit.doc)])
        result = str(hit.score) + " " + str(hit.doc) + " " + hit.toString()
        if (len(nonDiverse) < 10):
            nonDiverse.append(new_urls[str(hit.doc)])
        #find the document that corresponds to the html website and append to a list for min distance
        website = new_urls[str(hit.doc)]
        #html_files numbers of the hit websites added to rQ
        rQ.append(inv_map[website])
        docsToScores[int(inv_map[website])] = hit.score
        print(inv_map[website])
    return docsToScores, rQ, nonDiverse
Example no. 4
def configure_lucene():

    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    #indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs(
    )

    print >> sys.stderr, "Reading lines from sys.stdin..."
    for line in f:
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (
        writer.numDocs())
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs(
    )
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs(
    )
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
Example no. 5
 def fn():
     env.attachCurrentThread()
     start = datetime.now()
     IndexFiles(sys.argv[1], "index",
                StandardAnalyzer(Version.LUCENE_CURRENT))
     end = datetime.now()
     print end - start
Example no. 6
 def __init__(self):
     self.__dict__ = self.__shared_state
     if not self.__shared_state:
         self.jccvm = lucene.initVM()
         self.index = SimpleFSDirectory(
             lucene.File(settings.lucene_index_dir))
         self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
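
Example no. 6 relies on a class-level __shared_state dict (not shown in the snippet) so that lucene.initVM() runs only once per process, no matter how many instances are created. A minimal sketch of how the enclosing class might look; the class name and the literal index path are assumptions standing in for settings.lucene_index_dir in the original:

import lucene
from lucene import SimpleFSDirectory, StandardAnalyzer, Version

class LuceneBackend(object):  # class name assumed for illustration
    # every instance shares this dict, so the body of the "if" runs only once
    __shared_state = {}

    def __init__(self):
        self.__dict__ = self.__shared_state
        if not self.__shared_state:
            self.jccvm = lucene.initVM()
            self.index = SimpleFSDirectory(
                lucene.File("/path/to/index"))  # settings.lucene_index_dir in the original
            self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)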
Example no. 7
def luceneRetriver(query):

    lucene.initVM()

    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)

    my_query = QueryParser(Version.LUCENE_30, "text",
                           lucene_analyzer).parse(query)
    MAX = 1000
    total_hits = lucene_searcher.search(my_query, MAX)

    print "Hits: ", total_hits.totalHits

    for hit in total_hits.scoreDocs:
        print "Hit Score: ", hit.score, "Hit Doc:", hit.doc, "HitString:", hit.toString()
        doc = lucene_searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
Example no. 8
    def testPrefixQuery(self):

        parser = QueryParser(Version.LUCENE_CURRENT, "category",
                             StandardAnalyzer(Version.LUCENE_CURRENT))
        parser.setLowercaseExpandedTerms(False)

        print parser.parse("/Computers/technology*").toString("category")
Example no. 9
def luceneIndexer(contents):
    lucene.initVM()

    INDEXIDR = settings.INDEX_DIR
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))

    for tfile in contents:
        print "Indexing: ", tfile
        document = Document()
        content = tfile.getvalue()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
        index_writer.optimize()
        print index_writer.numDocs()
    index_writer.close()
Example no. 10
def createIndex():
    #initialize lucene and jvm
    print("started indexer")
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"

    
    #get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    
    #get index storage
    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
   
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    src_dir = 'html_files'
    i = 0
    for l in os.listdir(src_dir):
        l = os.path.join(src_dir, l)
        with open(l, 'r') as myfile:
            data=myfile.read()
        i += 1
        document, errors = parsehtml(data)
        doc = Document()
        doc.add(Field("text", document, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
Example no. 11
def names():
    lst = []

    search = "spax"  #request.form['product']
    lucene.initVM()

    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(lucene.Version.LUCENE_CURRENT, "text",
                        analyzer).parse(search)
    MAX = 1000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)

    for hit in hits.scoreDocs:
        if hit.score >= 1:
            print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            print doc.get("text").encode("utf-8")
            items = doc.get("text").encode("utf-8").split(',')
            for item in items:
                if item == search:
                    pass
                elif item not in lst:
                    lst.append(item)
    #print lst
    data = {"products": lst}
    # GET and POST both return the same JSON payload
    return jsonify(data)
Example no. 12
def index_files(files, index_directory):
    lucene.initVM()
    d = SimpleFSDirectory(File(index_directory))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(d, analyzer, True, IndexWriter.MaxFieldLength(512))
    for f in files:
        parse_file(f, writer)
    writer.optimize()
    writer.close()
Example no. 13
    def open(self, name, txn, **kwds):

        super(IndexContainer, self).open(name, txn, **kwds)

        if kwds.get('create', False):
            directory = self.getDirectory()
            indexWriter = IndexWriter(directory, StandardAnalyzer(), True)
            indexWriter.close()
            directory.close()
Example no. 14
def retrieve(string, tweetID):
    global eventNum
    global eventDict
    global eventList
    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    try:
        searcher = IndexSearcher(dir)
    except lucene.JavaError:
        # the index does not exist yet: start a new event for this tweet
        eventDict[tweetID] = eventNum
        eventNum = eventNum + 1
        analyzer.close()
        return
    try:
        query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(string)
        MAX = 2
        hits = searcher.search(query, MAX)
        if hits.totalHits > 0:
            # assign the tweet to the event of the best-matching document
            eventDict[tweetID] = eventDict[hits.scoreDocs[0].doc]
            analyzer.close()
            searcher.close()
            return
        else:
            # no match: start a new event
            eventDict[tweetID] = eventNum
            eventNum = eventNum + 1
            analyzer.close()
            searcher.close()
            return
    except lucene.JavaError:
        eventDict[tweetID] = eventNum
        eventNum = eventNum + 1
        analyzer.close()
        searcher.close()
        return
Example no. 15
def extractFeatureQueryWords(query):
    import string
    from lucene import Document, TermQuery, Term

    # create analyzer
    aux_analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    try:
        file = open('../features.txt', 'r')

        featurelist = []
        for line in file.readlines():
            words_in_line = line.split()
            featurelist += words_in_line

        querywordlist = query.split()

        featureQueryList = []
        productQueryList = []

        for word in querywordlist:
            if word in featurelist:
                featureQueryList.append(word)
            else:
                # create parser for word
                aux_parser = QueryParser(Version.LUCENE_CURRENT, "title",
                                         aux_analyzer)
                aux_query = aux_parser.parse(word)
                scoreDocs = searcher.search(aux_query, 50).scoreDocs
                if scoreDocs:
                    productQueryList.append(word)

        featureQuery = ""
        if featureQueryList:
            featureQuery = "("
            for i in range(len(featureQueryList)):
                if i == len(featureQueryList) - 1:
                    featureQuery += featureQueryList[i] + ")"
                else:
                    featureQuery += featureQueryList[i] + " AND "

            print featureQuery

        productQuery = ""
        if productQueryList:
            productQuery = "("
            for i in range(len(productQueryList)):
                if i == len(productQueryList) - 1:
                    productQuery += productQueryList[i] + ")"
                else:
                    productQuery += productQueryList[i] + " AND "

        return (featureQuery, productQuery, featureQueryList, productQueryList)
    except Exception, ex:
        print "Could not separate feature query words. Reason: ", ex
        return ("", "(" + query + ")", [], querywordlist)
Example no. 16
    def testPhraseQuery(self):

        analyzer = StandardAnalyzer(Version.LUCENE_24)
        q = QueryParser(Version.LUCENE_24, "field",
                        analyzer).parse('"This is Some Phrase*"')
        self.assertEqual("\"some phrase\"", q.toString("field"), "analyzed")

        q = QueryParser(Version.LUCENE_CURRENT, "field",
                        self.analyzer).parse('"term"')
        self.assert_(TermQuery.instance_(q), "reduced to TermQuery")
Example no. 17
def search(command):
    STORE_DIR = "index"
    vm_env = initVM()
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    result = run(searcher, analyzer, command)
    searcher.close()
    return result
Example no. 18
def SearchFiles(command):
    STORE_DIR = "lucene/index"
    getVMEnv().attachCurrentThread()
    # print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    rankedfiles = run(searcher, analyzer, command)
    searcher.close()
    return rankedfiles
Example no. 19
def getResultScoreDocs(query):
    # create analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # create parser for user submitted query
    parser = QueryParser(Version.LUCENE_CURRENT, "title", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    formatted_query = parser.parse(query)
    scoreDocs = searcher.search(formatted_query, 50).scoreDocs

    return scoreDocs
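
Example no. 19 (like Example no. 15 above) assumes a module-level searcher; a minimal sketch of how it could be set up, mirroring the other snippets on this page (the "index" directory name is an assumption):

import lucene
from lucene import SimpleFSDirectory, File, IndexSearcher

lucene.initVM()
# open the existing index read-only; the directory name is an assumption
searcher = IndexSearcher(SimpleFSDirectory(File("index")), True)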
Example no. 20
    def main(cls, argv):

        print "SimpleAnalyzer"
        cls.displayTokensWithFullDetails(SimpleAnalyzer(),
                                         "The quick brown fox....")

        print "\n----"
        print "StandardAnalyzer"
        cls.displayTokensWithFullDetails(
            StandardAnalyzer(Version.LUCENE_CURRENT),
            "I'll e-mail you at [email protected]")
Example no. 21
    def commitIndexWriter(self, writer):

        directory = writer.getDirectory()
        writer.close()
        dbDirectory = self.getDirectory()
        dbWriter = IndexWriter(dbDirectory, StandardAnalyzer(), False)
        dbWriter.setUseCompoundFile(False)
        dbWriter.addIndexes([directory])
        directory.close()
        dbWriter.close()
        dbDirectory.close()
Example no. 22
    def testWithQueryParser(self):

        query = QueryParser(Version.LUCENE_CURRENT, "content",
                            self.synonymAnalyzer).parse('"fox jumps"')
        topDocs = self.searcher.search(query, 50)
        # in Lucene 1.9, position increments are no longer ignored
        self.assertEqual(1, topDocs.totalHits, "!!!! what?!")

        query = QueryParser(Version.LUCENE_CURRENT, "content",
                            StandardAnalyzer(
                                Version.LUCENE_CURRENT)).parse('"fox jumps"')
        topDocs = self.searcher.search(query, 50)
        self.assertEqual(1, topDocs.totalHits, "*whew*")
Example no. 24
    def createIndex(cls, dataDir, indexDir, useCompound):

        indexDir = SimpleFSDirectory(File(indexDir))
        writer = IndexWriter(indexDir,
                             StandardAnalyzer(Version.LUCENE_CURRENT), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        writer.setUseCompoundFile(useCompound)

        for dir, dirnames, filenames in os.walk(dataDir):
            for filename in filenames:
                if filename.endswith('.properties'):
                    cls.indexFile(writer, os.path.join(dir, filename), dataDir)

        writer.optimize()
        writer.close()
Example no. 25
def query(query):
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(query)
    MAX = 1000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString(), doc_urls[str(hit.doc)]
        doc = searcher.doc(hit.doc)
Example no. 26
def build_perfield_analyzer(index_fields):
    """
    This function creates a PerFieldAnalyzerWrapper which allows us 
    to associate different Lucene Analyzers to specific fields in our 
    Lucene index. 
    """
    analyzer = PerFieldAnalyzerWrapper(StandardAnalyzer(
        Version.LUCENE_CURRENT))

    for (index_name, analyzer_type) in index_fields.iteritems():
        if analyzer_type == "standard":
            continue

        analyzer.addAnalyzer(index_name, ANALYZER_LOOKUP.get(analyzer_type))

    return analyzer
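
A short usage sketch for build_perfield_analyzer; the ANALYZER_LOOKUP table and the field names are assumptions made up for illustration (the function only requires that the table map analyzer-type strings to analyzer instances):

from lucene import KeywordAnalyzer

# hypothetical lookup table from analyzer-type names to analyzer instances
ANALYZER_LOOKUP = {"keyword": KeywordAnalyzer()}

# "title" keeps the default StandardAnalyzer, "id" is indexed verbatim
per_field = build_perfield_analyzer({"title": "standard", "id": "keyword"})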
Example no. 27
    def search(cls, indexDir, q):

        fsDir = SimpleFSDirectory(File(indexDir))
        searcher = IndexSearcher(fsDir, True)

        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            StandardAnalyzer(Version.LUCENE_CURRENT)).parse(q)
        start = time()
        hits = searcher.search(query, 50).scoreDocs
        duration = timedelta(seconds=time() - start)

        print "Found %d document(s) (in %s) that matched query '%s':" % (
            len(hits), duration, q)

        for hit in hits:
            doc = searcher.doc(hit.doc)
            print 'path:', doc.get("path")
Example no. 28
    def index(cls, indexDir, dataDir):

        if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
            raise IOError, "%s does not exist or is not a directory" % (
                dataDir)

        dir = SimpleFSDirectory(File(indexDir))
        writer = IndexWriter(dir, StandardAnalyzer(Version.LUCENE_CURRENT),
                             True, IndexWriter.MaxFieldLength.LIMITED)
        writer.setUseCompoundFile(False)

        cls.indexDirectory(writer, dataDir)

        numIndexed = writer.numDocs()
        writer.optimize()
        writer.close()
        dir.close()

        return numIndexed
Example no. 29
    def find(self, query, indir):
        lucene.initVM()
        INDEXDIR = indir

        indir = SimpleFSDirectory(File(INDEXDIR))
        lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
        lucene_searcher = IndexSearcher(indir)
        my_query = QueryParser(Version.LUCENE_30,"<default field>",\
        lucene_analyzer).parse("text:" + query + " OR title:" + query)
        MAX = 1000
        total_hits = lucene_searcher.search(my_query, MAX)
        print "\nHits: ", total_hits.totalHits, "\n"

        for hit in total_hits.scoreDocs:
            doc = lucene_searcher.doc(hit.doc)
            print "Hit Score:", "%.4f" % hit.score, "Department:", doc.get("department").encode("utf-8"), "Title:", doc.get("title").encode("utf-8")
            print doc.get("url").encode("utf-8"), '\n'
Example no. 30
def luceneIndexer(docdir, indir):
    """
    Index documents from a directory.
    """
    lucene.initVM()

    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))

    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing: ", tfile
        document = Document()
        content = open(tfile, 'r').read()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile

    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
Example no. 31
    def someMethod(self):

        directory = RAMDirectory()

        analyzer = StandardAnalyzer()
        writer = IndexWriter(directory, analyzer, True)

        doc = Document()
        doc.add(Field.Text("title", "This is the title"))
        doc.add(Field.UnStored("contents", "...document contents..."))
        writer.addDocument(doc)

        writer.addDocument(doc, analyzer)

        expression = "some query"

        query = QueryParser.parse(expression, "contents", analyzer)

        parser = QueryParser("contents", analyzer)
        query = parser.parseQuery(expression)
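
Example no. 31 uses the pre-2.x API (Field.Text, Field.UnStored, the static QueryParser.parse, parser.parseQuery). A rough sketch of the same flow against the 3.x-style API used by the other examples on this page; the store/index flags are assumptions:

from lucene import (RAMDirectory, StandardAnalyzer, IndexWriter, Document,
                    Field, QueryParser, Version)

directory = RAMDirectory()
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
writer = IndexWriter(directory, analyzer, True,
                     IndexWriter.MaxFieldLength.UNLIMITED)

doc = Document()
# stored, analyzed title; indexed-but-not-stored contents (flags are assumptions)
doc.add(Field("title", "This is the title", Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field("contents", "...document contents...", Field.Store.NO, Field.Index.ANALYZED))
writer.addDocument(doc)
writer.close()

query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse("some query")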
Example no. 32
    def index(self, doc, title, department, url):
        indexdir = SimpleFSDirectory(File(self.indir))
        analyzer = StandardAnalyzer(Version.LUCENE_30)
        index_writer = IndexWriter(indexdir, analyzer, self.init,
                                   IndexWriter.MaxFieldLength(512))
        self.init = False

        # Initialize document and index it
        document = Document()
        document.add(
            Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        document.add(Field("url", url, Field.Store.YES, Field.Index.ANALYZED))
        document.add(
            Field("department", department, Field.Store.YES,
                  Field.Index.ANALYZED))
        document.add(Field("text", doc, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)

        index_writer.optimize()
        index_writer.close()
Example no. 33
 def __init__(self, STOP_WORDS=StopAnalyzer.ENGLISH_STOP_WORDS):
     self.pe = pextractor.PhraseExtractor()
     self.STOP_WORDS = STOP_WORDS
     self.analyzer = StandardAnalyzer(STOP_WORDS)
     self.entries = []
Example no. 34
class MatrixMapper(object):
    def __init__(self, STOP_WORDS=StopAnalyzer.ENGLISH_STOP_WORDS):
        self.pe = pextractor.PhraseExtractor()
        self.STOP_WORDS = STOP_WORDS
        self.analyzer = StandardAnalyzer(STOP_WORDS)
        self.entries = []
    def get_cs_by_lucene_doc(self, docs, context):
        doc_size = len(docs)
        lucene_ids = []
        categories = []
        for id in range(doc_size):
            link = docs[id].get("link")
            lucene_ids.append(int(docs[id].get("id")))
            entry = dao.get_by_link(link, Entry)
            self.entries.append(entry)
            # TODO boost title field
            summary = entry.summary[:200]
            #if entry.category != '其他':
                #categories.append(entry.category)
            stream = self.analyzer.tokenStream("summary", StringReader(summary))
            for s in stream:
                context.tokens.append(s.term())
                context.token_types.append(s.type())
            stream = self.analyzer.tokenStream("title", StringReader(entry.title))
            for s in stream:
                context.title_field.append(len(context.tokens))
                context.tokens.append(s.term())
                context.token_types.append(s.type())
            context.term_doc_range.append(len(context.tokens))
        #print 'tokens:',len(context.tokens)
        return self.pe.extract(context), lucene_ids, categories
    
    def add2matrix(self, tpv, all, term_row, lucene_ids, i,term_doc_freq):
        for (t, f) in zip(tpv.getTerms(), tpv.getTermFrequencies()):
            term = [0 for j in range(len(lucene_ids))]
            new = False
            if not term_row.has_key(t):
                term_row[t] = len(term_row)
                new = True
            row = term_row[t]
            if new:
                term[i] = f#self.tfidf(len(tpv.getTerms()), f, total_terms, dtf)
                all.append(term)
            else:
                all[row][i] = f
            term_doc_freq[t] = term_doc_freq.get(t,0) + 1
    """
        效率很低,弃用           
    """ 
    def product(self, termmatrix, labelmatrix):
        row = []            
        for i in range(len(termmatrix)):
            col = []
            for j in range(len(labelmatrix[0])):
                # k is the column index of a and the row index of b
                e = []
                for k in range(len(termmatrix[0])):
                    c = termmatrix[i][k]                    
                    d = labelmatrix[k][j]
                    e.append(c * d)
                col.append(sum(e))
            row.append(col)   
        return row
    """
         取代label_assign方法
    """
    def assign(self, docs, labels, lucene_ids):
        term_row = {}
        all = []
        ireader = IndexReader.open(STORE_DIR)
        total_terms = 0
        term_doc_freq = {}
        for i in range(len(lucene_ids)):
            tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'summary'))
            self.add2matrix(tpv, all, term_row, lucene_ids, i,term_doc_freq)
            """
                TODO:给属于标题的term加权
            """
            tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'title'))
            self.add2matrix(tpv, all, term_row, lucene_ids, i,term_doc_freq)
        #for k,v in term_doc_freq.items():
         #   if v> 3:
          #      print k,v
        # tokenize the labels
        analyzer = CJKAnalyzer()
        labelmatrix = zeros((len(all), len(labels)))
        label_term = []
        # doc -> label: the labels assigned to each doc
        all_weight_table = {}
        # label -> doc: the docs assigned to each label
        label_doc = []
        label_doc_map = {}
        for i in range(len(labels)):
            nonzero_table = []
            # weight products of this label against every doc
            weight_table = []
            
            stream = analyzer.tokenStream('', StringReader(labels[i].text))
            terms = []            
            c = 0
            weight_row = {}
            nonzero_index = []  
            is_incomplete = False
            for token in stream:
                term = token.term()#token.decode('utf-8')#
                #print term
                if term_row.has_key(term):
                    row = term_row[term]
                    terms.append(term)
                    docs_with_current_term = all[row]
                    for j in range(len(docs_with_current_term)):
                        if docs_with_current_term[j] != 0:                                            
                            if c == 0:
                                nonzero_index.append(j)
                            if c == 0 or j in nonzero_index:
                                weight_row[j] = weight_row.get(j, 0) + docs_with_current_term[j] * term_doc_freq[term] * labels[i].label_weight 
                            else:
                                # add 1 so the weight product cannot be 0
                                # for docs that were in nonzero_index at first but dropped out later: multiply by -100 to minimize the weight product, meaning this label does not fit this doc
                                weight_row[j] = (1 + docs_with_current_term[j] * term_doc_freq[term] * labels[i].label_weight) * (- 100)
                        # for docs that were not in nonzero_index at first but appear later on
                        elif docs_with_current_term[j] == 0 and j in nonzero_index:
                            # add 1 so the weight product cannot be 0
                            weight_row[j] = (1 + docs_with_current_term[j] * labels[i].label_weight) * (- 100)
                    c += 1
                else:
                    is_incomplete = True
            label_term.append(terms)
            # bugfix: if, after tokenization, not every term of the current label appears among the docs' terms, drop this label.
            if is_incomplete:
                weight_row = {}
                    
                    
            for doc, weight in weight_row.items():
                last = all_weight_table.get(doc)                
                if weight > 0:
                    if not label_doc_map.has_key(labels[i].text):    
                        kc = dao.get_keyword_category_by_category(self.query, labels[i].text)
                        #label_doc.append([ 0,labels[i].text,[]])
                        label_doc.append([ 0,labels[i].text,0])
                        label_doc_map[labels[i].text] = len(label_doc) - 1
                    new_label = pextractor.Substring()
                    new_label.text = labels[i].text
                    new_label.id = weight
                    if last:
                        all_weight_table[doc].append(new_label)
                    else:
                        all_weight_table[doc] = [new_label]
                    #label_doc[label_doc_map[labels[i].text]][2].append(doc)
                    label_doc[label_doc_map[labels[i].text]][2] += 1
                    label_doc[label_doc_map[labels[i].text]][0] += weight
                    
                    #try:
                     #   category = dao.save_category(labels[i].text, weight, 'd')
                      #  entry = self.entries[doc]
                       # ec = dao.save_entry_cat(entry, category, weight)
                    #except Exception,e:
                     #   print e
                    
                    #if last:
                     #   all_weight_table[doc].append(ec)
                    #else:
                     #   all_weight_table[doc] = [ec]
                # if the doc already has a label, compare the existing doc-label weight with the current one and replace it if the current is larger, i.e. pick the maximum-weight label
                #if last:
                #    if last.id < weight and weight > 0:
                 #       labels[i].id = weight
                  #      all_weight_table[doc] = labels[i]
                #else:
                 #   labels[i].id = weight
                  #  all_weight_table[doc] = labels[i]
        label_doc.sort(reverse=True)
        for k, v in all_weight_table.items():
            v.sort(reverse=True)
                
        # because the map keys are consecutive integers the hash lays them out in ascending order, so the returned values are already sorted
        thread = SaveLabelsThread(all_weight_table,label_doc,self.entries,self.query)
        thread.start()
        return all_weight_table,label_doc
            
    """
        废弃
    """
    def label_assign(self, docs, labels, lucene_ids):
        term_row = {}
        all = []
        ireader = IndexReader.open(STORE_DIR)
        total_terms = 0
        for i in range(len(lucene_ids)):
            tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'summary'))
            self.add2matrix(tpv, all, term_row, lucene_ids, i)
            tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'title'))
            self.add2matrix(tpv, all, term_row, lucene_ids, i)
        
        # tokenize the labels
        analyzer = CJKAnalyzer()
        labelmatrix = zeros((len(all), len(labels)))
        label_term = []
        for i in range(len(labels)):
            if not labels[i].is_candicate_label and len(labels[i].text) >= 3:
                label_term.append([])
                continue
            #print labels[i].text,labels[i].id
            stream = analyzer.tokenStream('', StringReader(labels[i].text))
            terms = []
            for token in stream:
                if term_row.has_key(token.term()):
                    # weighting
                    termdocs = ireader.termDocs(Term('summary', token.term()))
                    count = 0
                    span = 0
                    terms.append(token.term())
                    while termdocs.next():
                        count += termdocs.freq()
                        span += 1
                    weight = labels[i].label_weight
                    #if float(span)/ireader.numDocs() >= 0.18 and not re.search('a-zA-z', token.term()):
                        #weight = 0
                    labelmatrix[term_row[token.term()]][i] = weight
            label_term.append(terms)
        termmatrix = array(all)
        termmatrix = transpose(termmatrix)
        #for i in range(len(labelmatrix[0])):
            #for j in range(len(termmatrix[0])):
        
        # rows are docs, columns are labels
        #p = self.product(termmatrix,labelmatrix)
        d = dot(termmatrix, labelmatrix)
        result = d / (norm(labelmatrix) * norm(termmatrix))
        doc_label = []
        for i in range(len(result)):
            m = - 1
            index = - 1
            group = []
            for j in range(len(result[i])):
                if result[i][j] > 0:
                    labels[j].id = result[i][j]
                    group.append(labels[j])
            # Substring objects sort by id, which is exactly what we rely on here
            group.sort()
            group.reverse()
            max_label = group[0]
            # i:doc number(just occur position in the docs)
            # label id
            # label score
            # if the label itself does not appear in the current doc
            if not max_label.doc_freq.has_key(i):
                #print 'oringial:',labels[index].text
                count = 0
                overlap = ''
                for k in label_term[index]:
                    if term_row.has_key(k) and termmatrix[i][term_row[k]] != 0:
                        overlap = k
                        print k
                        count += 1
                # at least one overlapping term, with length >= 2
                if count == 1 and len(overlap) >= 2 :
                    new_label = pextractor.Substring()
                    new_label.text = overlap
                    new_label.id = m
                    doc_label.append(group[0])
                    continue
                        
            #labels[index].id = m
            doc_label.append(group[0])
        return doc_label
    """
        计算tf-idf值
    """   
    def tfidf(self, terms_length, tf, docs_length, dtf):
        # results[i].freq - v may be 0, so subtract v/2 instead
        return (tf / float(terms_length)) * abs(math.log((dtf - tf / 2.0) / (docs_length*10)))
    def build(self, docs,query):
        self.entries = []
        self.query = query
        context = Context()
        results, lucene_ids , categories = self.get_cs_by_lucene_doc(docs, context)
        results_size = len(results)
        doc_size = len(docs)
        stroplabels = {}
        for i in range(results_size):
            # strip particles such as the '在' in '在阿里巴巴'; similarly '...上', '...里', '和...'
            stream2 = tokenizer.tokenize(results[i].text.encode('utf-8'))
            if len(stream2) == 2 and len(results[i].text)>=3:     
                if (len(stream2[0].decode('utf-8')) == 1 or len(stream2[1].decode('utf-8')) == 1):
                    results[i].text = len(stream2[0].decode('utf-8')) == 1 and stream2[1].decode('utf-8') or stream2[0].decode('utf-8')
                elif (len(stream2[1].decode('utf-8')) - len(stream2[0].decode('utf-8'))) == 1 \
                        and len(stream2[1].decode('utf-8')) == len(results[i].text):
                    results[i].text = stream2[0].decode('utf-8')
            if stroplabels.has_key(results[i].text):
                same_label_name = stroplabels[results[i].text]
                results[i].freq += same_label_name.freq
                for k,v in same_label_name.doc_freq.items():                 
                    if results[i].doc_freq.has_key(k):
                        results[i].doc_freq[k] += v
                    else:
                        results[i].doc_freq[k] = v
            stroplabels[results[i].text] = results[i]
            
        results = stroplabels.values()
        results_size = len(results)
        # phrase doc matrix
        pdmatrix = zeros((results_size, doc_size))
        #for i in results:
            #print i.text,i.freq
        d = len(context.tokens)
        if results_size == 0:
            print 'no frequent phrase!'
            return {},[]

        for i in range(results_size):
            doc_freq = results[i].doc_freq
            
            index = context.suffix[results[i].id]           
            # if performance becomes an issue, title_field could be changed to a map
            is_title = False
            if index in context.title_field:
                is_title = True
            for k, v in doc_freq.items():
                # k is doc_id, v is doc freq
                # doc_length is the number of terms in the containing doc
                doc_length = context.term_doc_range[k] - (k and context.term_doc_range[k - 1])                
                pdmatrix[i][k] = self.tfidf(doc_length, v, d, results[i].freq) * (is_title and TITLE_FIELD_BOOST or 1)
            results[i].label_weight = norm(pdmatrix[i])
            #if stroplabels.has_key(results[i].text):
               # stroplabels[results[i].text].label_weight+=results[i].label_weight
                
        if 1:
            # SVD (singular value decomposition)
            u, s, v = linalg.svd(pdmatrix)
            
            # probe for the rank of the matrix
            rank = 1
            for k in range(1, len(s)):
                if norm(s[:k]) / norm(s) >= CLT:
                    rank = k
                    break
            
            # take the first rank columns of u, pick the largest component, and record the corresponding label index
            maxes = {}
            lo = {}
            for j in range(rank):
                m = - 100
                index = - 1
                for i in range(len(u)): 
                    if u[i][j] > 0:
                        lo[i]=u[i][j]
                    if u[i][j] > m:
                        m = u[i][j]
                        # index is the corresponding label index
                        index = i
                # if this label already has a value, average them
                if maxes.has_key(index):
                    maxes[index] = (maxes[index] + m) / 2.0
                else:
                    # store the maximum
                    maxes[index] = m
    
            labels = []
            
            # k is the label index, v is the maximum value
            for k, v in maxes.items():
                    #print results[k].text,k,v
                    #sub = pextractor.Substring()
                    #sub.text = results[k].text
                    temp = re.sub('[0-9 ]', '', results[k].text)
                    if temp == '':
                        results[k].text = temp
                    results[k].is_candicate_label = True
                    results[k].label_weight = v
                    labels.append(results[k])
                    
        return self.assign(docs, results, lucene_ids)
Example no. 35
File: cnanalyzer.py Project: fay/wt
def a():
    import os
    #loader = BSDDictLoader()
    #dic = loader.load()
    words_dict = {}
    from dot.searcher import Searcher, STORE_DIR
    from apps.wantown import dao
    from apps.wantown.models import Entry
    searcher = Searcher()
    hits = searcher.search("java")
    docs = []
    for hit in hits:
            doc = Hit.cast_(hit).getDocument()
            docs.append(doc)
    entries = []
    all = ''
    

    from dot.context import Context, Token
    context = Context()
    import re
    #all = re.sub('[0-9:;;/\(\)\t\[\]]()\**#&','',all)
    #all = re.sub('[ +=-]',' ',all)

    analyzer = StandardAnalyzer()
    # doc id
    id = 0
    allToken = []
    allText = []
    pureText = ''
    c = 0
    docRange = {}
    for doc in docs[0:100]:
        link = doc.get("link")
        entry = dao.get_by_link(link, Entry)
        entries.append(entry.summary)
        all = entry.summary[:200] + entry.title
        pureText += all
        tokenType = []
        last_type = ''
        #all = """提起电吉他演奏,就必须提到布鲁斯音乐;提起最伟大的吉他演奏大师,人们首先会想到的是 Jimi Hendrix,但是说起依然在世的最伟大的吉他演奏家,名字只有一个——Eric Clapton爵士。自从上个世纪60年代布鲁斯摇滚乐以及布鲁斯吉他演奏成为了主流摇滚风格之后,在这种来源于黑人音乐的吉他演奏中,在所有除黑色外其他肤色的布鲁斯吉他演奏家之中,传奇人物Eric Clapton毫无疑问是其中最杰出的一位。在与Eric Clapton同时代的所有艺术家纷纷的离开人世,或者失去了原有的歌迷号召力之后,Eric Clapton是所有当年这些艺术家中为数不多的既然保持着自己高超的演奏技术以及强大的市场号召力的艺术家。
#  
#Eric Clapton为人谦逊,在与其他出色的吉他演奏者比如Jimi Hendrix,B.B. King,Duane Allman,甚至后辈Stevie Ray Vaughan相比较的时候他总是非常谦恭,在与B.B. King以及Bob Dylan等人同台的时候他总是举止非常礼让,他是最有绅士风度的流行音乐家之一。同时,作为世界上最著名的吉他大师,Eric Clapton还经常热心的帮助包括英国著名流行音乐家Sting,Bon Jovi乐队主音吉他手Richie Sambora在内的其他一些音乐家去录制专辑或者拍摄音乐录影带,并且经常为一些音乐家担任吉他手作伴奏。Eric Clapton曾经协助过Bob Dylan,Aretha Franklin,Joe Cocker,Ringo Starr,Freddie King,Roger Waters等等近百位艺术家的专辑录制。   
#"""
        stream = analyzer.tokenStream("fieldname", StringReader(all))    
        for s in stream:
            
            #if (last_type == '<ALPHANUM>' or last_type == '<HOST>') and (s.type() == '<ALPHANUM>' or s.type() == '<HOST>'):
                #all.append(' ')
                #pass
            #last_type = s.type()
            token = Token()
            token.text = s.termText()
            token.offset = s.termLength()
            token.doc = id
            allToken.append(token)
            allText.append(s.term())
            print dir(s)
            c += 1
        docRange[len(allText)] = id
        #all = sorted(all,cmp=lambda x,y:cmp(x.termText(),y.termText()))
        id += 1
    context.tokens = allText
    
    #context.tokens.sort()
    #for i in context.tokens:
        #print i
    
    #print s
    
    context.text = ''
    context.token_types = tokenType
    context.docs = entries
    context.term_doc_range = docRange
    print len(all) 
    from dot.lingo import pextractor
    import time
    start = time.time()
    #pe = pextractor.PhraseExtractor()
    #results = pe.extract(context)
    count = 0
    r = docRange.keys()
    r.sort()
    if 0:
        for i in results:
            if len(i.text) > 1 and i.freq > 2 and len(i.text) < 20:
                id = i.id - 1
                lcp = context.lcp[id + 1]
                for f in range(i.freq):
                    begin = context.suffix[id]  
                    end = context.suffix[id] + lcp
    
                    for j in range(len(r)):
                        if begin < r[j]:
                            break
                    doc = docRange[r[j]]
                    #print context.tokens[begin:end],i.freq,begin,doc
                    if end > r[j]:
                        print 'not in the same doc'
                    id += 1
                #print  i.text.strip(), i.freq,i.doc_freq
    #print (time.time() - start)
    from dot.matrixmapper import MatrixMapper
    mapper = MatrixMapper()
    mapper.build(docs[:100])
   
    #print pureText
    import sys
    from dot.lingo import suffixsorter as ss
    #for i in range(len(context.suffix)):
     #   s = pe.list2str(context.tokens)
      #  sys.stdout.write('%d\t%d\t%s\n' % (context.suffix[i], context.lcp[i], context.tokens[context.suffix[i]:context.suffix[i] + 10]))
    #dm = getDictManager()
    #words_dict= featurex.tf_idf(entries, dm.seg_dict)
    #doc1 = featurex.Document(entries.encode('utf-8'),dm)
    #doc2 = featurex.Document(entries[0].encode('utf-8'), dm)
    #for i in words_dict.values():
        #print i.word,i.frequency,i.feature_value,i.tfidf
    #print similitude_doc_cos(doc1, doc2)
    """
ibm jdk 3 {3: 3}
不同 3 {4: 2, 7: 1}
使用 3 {8: 2, 7: 1}
可以 10 {8: 3, 3: 2, 4: 2, 7: 3}
处理 3 {8: 3}
好的 3 {8: 1, 7: 2}
字体 5 {8: 2, 4: 3}
已经 4 {9: 1, 3: 1, 4: 1, 7: 1}
平滑 4 {8: 1, 4: 3}
应用 3 {8: 1, 4: 2}
手机上 3 {7: 3}
文本 3 {8: 3}
游戏 4 {7: 4}
环境 3 {1: 1, 3: 2}
的java 6 {1: 1, 2: 1, 5: 1, 7: 3}
的文 3 {8: 3}
设置 5 {4: 5}
软件 3 {1: 1, 7: 2}
运行 3 {1: 1, 7: 2}

"""