def main1():
    print "retrieve and display files......"
    direc = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR))
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    searcher = lucene.IndexSearcher(direc)
    search(searcher, analyzer)
    search2(searcher, analyzer)
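#search() and search2() are not defined in this snippet. A minimal sketch of
#what such a helper might look like against this flat PyLucene 3.x API (the
#"data" field matches the companion indexing example below; the query string
#is an assumption):
def search(searcher, analyzer):
    qparser = lucene.QueryParser(lucene.Version.LUCENE_CURRENT, "data", analyzer)
    query = qparser.parse("sample")
    hits = searcher.search(query, 10)
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        print doc.get("data")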
Example No. 2
    def getRecentConversations(self, username):
        #Determine index and data paths
        index_dir = self.indexdir + username
        data_dir = self.datadir + username

        #Load the index
        if os.path.isdir(index_dir):
            luc_index = lucene.FSDirectory.getDirectory(index_dir)

            #Get the current time in UTC seconds
            curtime = int(time.time())

            #Convert to a search range
            searchstart = self.__padTimestamp(curtime - SECONDS_IN_20_MINUTES)
            searchend = self.__padTimestamp(MAX_TIMESTAMP)

            #Build and perform the query
            qtext = "timestamp:[" + searchstart + " TO " + searchend + "]"
            searcher = lucene.IndexSearcher(luc_index)
            qparser = lucene.QueryParser("text", lucene.StandardAnalyzer())
            query = qparser.parse(qtext)
            sortmethod = lucene.Sort(["protocol", "friend_chat", "timestamp"])
            qresults = searcher.search(query, sortmethod)

            #Fetch the results
            conversationlist = []
            for i in range(qresults.length()):
                mprotocol = qresults.doc(i).get("protocol")
                mfriend_chat = qresults.doc(i).get("friend_chat")
                mtimestamp = int(qresults.doc(i).get("timestamp"))
                mwho_sent = qresults.doc(i).get("who_sent")
                mfileoffset = int(qresults.doc(i).get("file_offset"))
                mrank = qresults.score(i)

                #This is a really slow linear scan that should be
                #optimized at a later date: simply search through all
                #previously retrieved conversations and check for a
                #match. If a match is found, add the message to it;
                #otherwise create a new conversation.
                messagetext = self.__getMessageFromFile(
                    username, mfriend_chat, mprotocol, mfileoffset)
                message = LogMessage(messagetext, mtimestamp, mwho_sent)
                message.setRank(mrank)

                found = False
                for j in range(len(conversationlist)):
                    if conversationlist[j].getProtocol() == mprotocol and \
                       conversationlist[j].getFriendChat() == mfriend_chat:
                        found = True
                        conversationlist[j].addMessage(message)
                        break
                if not found:
                    conversation = LogConversation(mprotocol, mfriend_chat)
                    conversation.addMessage(message)
                    conversationlist.append(conversation)

            return conversationlist
        else:
            #Index does not exist
            return False
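    #A hedged sketch of the optimization the comment above asks for: key the
    #conversations by (protocol, friend_chat) in a dict so each lookup is O(1)
    #instead of a linear scan. The method below is hypothetical; the caller
    #would collect conversations.values() at the end.
    def __groupMessage(self, conversations, mprotocol, mfriend_chat, message):
        key = (mprotocol, mfriend_chat)
        if key not in conversations:
            conversations[key] = LogConversation(mprotocol, mfriend_chat)
        conversations[key].addMessage(message)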
Example No. 3
    def search(self, restrictions, destination):
        """ 
        @see: L{NullPrincipalSearcher<datafinder.persistence.search.searcher.NullSearcher>} 
        
        E1101: Pylint cannot detect the internals of the modules solr and lucene. 
        """
        # pylint: disable=E1101

        results = list()
        queryString = search_restriction_mapping.mapSearchRestriction(
            restrictions)
        if self._configuration.luceneIndexUri.startswith("file:///"):
            try:
                self._configuration.env.attachCurrentThread()
                indexDir = lucene.SimpleFSDirectory(
                    lucene.File(
                        self._configuration.luceneIndexUri.replace(
                            "file:///", "")))
                analyzer = lucene.StandardAnalyzer(
                    lucene.Version.LUCENE_CURRENT)
                searcher = lucene.IndexSearcher(indexDir)
                query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT,
                                           "content",
                                           analyzer).parse(queryString)
                hits = searcher.search(query, constants.MAX_RESULTS)
                for hit in hits.scoreDocs:
                    doc = searcher.doc(hit.doc)
                    results.append("/%s" % urllib.unquote(
                        doc.get(constants.FILEPATH_FIELD).encode("utf-8")))
                searcher.close()
            except Exception, error:
                errorMessage = "Cannot search items. Reason: '%s'" % error
                raise PersistenceError(errorMessage)
Example No. 4
	def __init__(self, blogDir, indexDir):
		"초기화 작업"
		lucene.initVM(lucene.CLASSPATH)		# JVM을 초기화합니다.
    
		self.blogDir = blogDir
		self.indexDir = indexDir
		self.analyzer = lucene.StandardAnalyzer()
		self.store = lucene.FSDirectory.getDirectory(self.indexDir)
Example No. 5
 def __init__(self, rows=None):
     #lucene.initVM()
     # lucene.initVM() is configured in Django's settings.py and reused here
     vm_env = lucene.getVMEnv()
     if vm_env is None:
         lucene.initVM()
     else:
         vm_env.attachCurrentThread()
     self.analyzer = lucene.StandardAnalyzer(Version.LUCENE_30)
     self.indexDir = SimpleFSDirectory(File(INDEX_DIRECTORY))
     self.rows = rows
Example No. 6
    def __init__(self, session, config, parent):
        IndexStore.__init__(self, session, config, parent)
        path = self.get_path(session, 'defaultPath')
        self.analyzer = NullC3Analyzer()
        self.dir = lucene.FSDirectory.getDirectory(path, False)
        self.parser = lucene.QueryParser("", lucene.StandardAnalyzer())
        self.searcher = lucene.IndexSearcher(self.dir)

        self.writer = None
        self.currDoc = None
        self.currRec = None
Example No. 7
def index_files(board, time_delta):
    store = lucene.SimpleFSDirectory(
        lucene.File(BOARDSPATH + board + '/' + RECENT_INDEX))
    writer = lucene.IndexWriter(
        store, lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT), True,
        lucene.IndexWriter.MaxFieldLength.UNLIMITED)
    #  writer.setMaxFieldLength(1048576) # 1MB

    flist = get_all_files(board, time_delta)
    for filename, owner, title in flist:
        path = BOARDSPATH + board + '/' + filename
        if not os.path.exists(path):
            continue

        f = open(path, 'r')
        contents = filter_file(f)
        debug(contents)
        try:
            title = title.decode('gbk')
            owner = owner.decode('gbk')
            contents = unicode(contents, 'gbk')
        except UnicodeDecodeError:
            f.close()
            debug(filename)
            continue
        f.close()

        if len(contents) > 0:
            doc = lucene.Document()
            doc.add(
                lucene.Field("name", filename, lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
            doc.add(
                lucene.Field("owner", owner, lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
            doc.add(
                lucene.Field("title", title, lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
            doc.add(
                lucene.Field("contents", contents, lucene.Field.Store.NO,
                             lucene.Field.Index.ANALYZED))
            writer.addDocument(doc)
            debug('adding ' + filename)
    writer.optimize()
    writer.close()
Example No. 8
def SearchKeyword(indexDir, keyword):
	directory = lucene.FSDirectory.getDirectory(indexDir)
	searcher = lucene.IndexSearcher(directory)		# index searcher object
	analyzer = lucene.StandardAnalyzer()

	print ("Searching for %s" % keyword)
	keyword = keyword.decode('cp949').encode('utf-8')
	queryParser = lucene.QueryParser('content', analyzer)				# build the query
	query = queryParser.parse(keyword)
	
	hits = searcher.search(query)					# run the search
	print ("%s matching documents" % hits.length())	# number of matching documents

	for h in hits:									# print the results
		doc = lucene.Hit.cast_(h).getDocument()
		print("Path: %s, name: %s" % (doc.get("path"), doc.get("name")))

	searcher.close()
Example No. 9
def main1():
    print "started indexing sample files......"
    direc = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR))
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    config = lucene.IndexWriterConfig(lucene.Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(lucene.IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = lucene.IndexWriter(direc, config)

    #fix this later.....FieldType not defined
    #field_type=lucene.FieldType()
    #field_type.setIndexed(True)
    #field_type.setStored(False)
    #field_type.setTokenized(False)

    file1 = open("nitin.json")
    data = file1.read()
    contents = json.loads(data)
    doc = lucene.Document()
    field = lucene.Field("name", contents['name'], lucene.Field.Store.NO,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    field = lucene.Field("data", data, lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    writer.addDocument(doc)
    file1.close()

    file1 = open("nitin2.json")
    data = file1.read()
    contents = json.loads(data)
    doc = lucene.Document()
    field = lucene.Field("name", contents['name'], lucene.Field.Store.NO,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    field = lucene.Field("data", data, lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    writer.addDocument(doc)
    file1.close()

    writer.optimize()
    print "Indexed and optimized %d documents" % writer.numDocs()
    writer.close()
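#The two file blocks above repeat the same steps verbatim. A hedged sketch of
#a small helper that removes the duplication (same fields and flags as above;
#the helper name is hypothetical):
def add_json_doc(writer, path):
    f = open(path)
    data = f.read()
    f.close()
    contents = json.loads(data)
    doc = lucene.Document()
    doc.add(lucene.Field("name", contents['name'], lucene.Field.Store.NO,
                         lucene.Field.Index.ANALYZED))
    doc.add(lucene.Field("data", data, lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED))
    writer.addDocument(doc)

#Usage: add_json_doc(writer, "nitin.json"); add_json_doc(writer, "nitin2.json")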
Example No. 10
def IndexCreate(fileDir, indexDir):
	analyzer = lucene.StandardAnalyzer()	# create the objects Lucene uses
	store = lucene.FSDirectory.getDirectory(indexDir)
	writer = lucene.IndexWriter(store, analyzer)

	for root, dirnames, filenames in os.walk(fileDir):	# index only the .txt files under the given folder
		for filename in filenames:
			if not filename.endswith('.txt'):
				continue
			
			print("Adding: %s" % filename)
			try:
				path = os.path.join(root, filename)
				f = open(path)
				content = f.read()
				f.close()

				content = content.decode('cp949').encode('utf-8')	# convert the encoding to 'utf-8'

				doc = lucene.Document()				# create a Document object
				doc.add(lucene.Field(	"name", 	# file name
										filename,
										lucene.Field.Store.YES,
										lucene.Field.Index.NO))
				doc.add(lucene.Field(	"path", 	# file path
										path,
										lucene.Field.Store.YES,
										lucene.Field.Index.NO))
				if len(content) > 0:
					doc.add(lucene.Field(	"content", 		# file contents
											content,
											lucene.Field.Store.NO,
											lucene.Field.Index.TOKENIZED))
				else:
					print("Warning: No contents in %s" % filename)
				writer.addDocument(doc)				# add the Document to the index
			except Exception, e:
				print("Failed in adding index: %s" % e)
Example No. 11
                                         lucene.Field.Store.YES,
                                         lucene.Field.Index.NOT_ANALYZED))
                    if len(contents) > 0:
                        doc.add(lucene.Field("contents", contents,
                                             lucene.Field.Store.NO,
                                             lucene.Field.Index.ANALYZED))
                        #Add the different Fields to the document. A document
                        #carries several kinds of information, such as title,
                        #author, modification time, and contents. Each kind is
                        #represented by its own Field; in this example three
                        #kinds are indexed: the file path, the file name, and
                        #the file contents.

                    else:
                        print "warning: no content in %s" % filename
                    writer.addDocument(doc) #IndexWriter.addDocument writes the entry into the index directory
                except Exception, e:
                    print "Failed in indexDocs:", e

if __name__ == '__main__':
##    if len(sys.argv) < 2:
##        print IndexFiles.__doc__
##        sys.exit(1)
    lucene.initVM() #initialize the Java virtual machine
    print 'lucene', lucene.VERSION
    start = datetime.now()
    try:
##        IndexFiles(sys.argv[1], "index", lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT))
        IndexFiles('testfolder', "index", lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT))
        end = datetime.now()
        print end - start
    except Exception, e:
        print "Failed: ", e
Example No. 12
    def addMessage(self, username, xprotocol, xfriend_chat, who_sent,
                   timestamp, text):
        #Clean up protocol and friend_chat fields
        """ For some unknown reason, PyLucene (and probably Lucene as well)
            seems to have problems searching for things like SoAndSo but
            has no problems searching for soandso. To prevent headaches in
            the future we simply set it all to lowercase since the case
            does not matter for these fields."""
        protocol = xprotocol.lower()
        friend_chat = xfriend_chat.lower()

        #Determine index and data paths
        index_dir = self.indexdir + username
        data_dir = self.datadir + username + PATH_SEP + protocol + PATH_SEP
        data_file = data_dir + friend_chat

        #If the index doesn't exist, we use a special constructor to create it
        if not os.path.isdir(index_dir):
            os.makedirs(index_dir)
            luc_index = lucene.FSDirectory.getDirectory(index_dir, True)
            luc_writer = lucene.IndexWriter(luc_index,
                                            lucene.StandardAnalyzer(), True)
        else:
            luc_index = lucene.FSDirectory.getDirectory(index_dir)
            luc_writer = lucene.IndexWriter(luc_index,
                                            lucene.StandardAnalyzer())
        #Opening the index before writing to the file gives us a lock
        #on the index. As long as writing to data files occurs only
        #through this function, this is guaranteed to be an atomic
        #operation. Closing the writer releases the lock.

        if not os.path.isdir(data_dir):
            os.makedirs(data_dir)
        #filesize is used to determine the file offset
        if not os.path.isfile(data_file):
            filesize = 0
        else:
            filesize = os.path.getsize(data_file)

        datahandle = open(data_file, 'a')
        datahandle.write(str(who_sent))
        datahandle.write("\n")
        datahandle.write(str(timestamp))
        datahandle.write("\n")
        datahandle.write(str(len(str(text))))  #what a mess
        datahandle.write("\n")
        datahandle.write(str(text))
        datahandle.write("\n")

        doc = lucene.Document()
        doc.add(self.__makeKeywordField('protocol', str(protocol)))
        doc.add(self.__makeKeywordField('friend_chat', str(friend_chat)))
        clean_timestamp = self.__padTimestamp(timestamp)
        doc.add(self.__makeKeywordField('timestamp', clean_timestamp))
        doc.add(self.__makeKeywordField('who_sent', str(who_sent)))
        doc.add(self.__makeUnIndexedField('file_offset', str(filesize)))
        clean_text = re.sub("<[^>]*>", " ", str(text))
        doc.add(self.__makeUnStoredField('text', clean_text))

        luc_writer.addDocument(doc)
        luc_writer.close()
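    #The __make*Field helpers above are not shown in this snippet. A plausible
    #sketch mirroring the classic Lucene field categories; the exact
    #Store/Index flag combinations are assumptions, not the original code:
    def __makeKeywordField(self, name, value):
        #keyword: stored and searchable verbatim (not analyzed)
        return lucene.Field(name, value, lucene.Field.Store.YES,
                            lucene.Field.Index.NOT_ANALYZED)

    def __makeUnIndexedField(self, name, value):
        #unindexed: stored for retrieval only, never searched
        return lucene.Field(name, value, lucene.Field.Store.YES,
                            lucene.Field.Index.NO)

    def __makeUnStoredField(self, name, value):
        #unstored: analyzed and searchable, but not stored
        return lucene.Field(name, value, lucene.Field.Store.NO,
                            lucene.Field.Index.ANALYZED)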
Example No. 13
    index_file = siteconfig.get("search_index_file")
    if lucene_is_2x:
        store = lucene.FSDirectory.getDirectory(index_file, False)
    elif lucene_is_3x:
        store = lucene.FSDirectory.open(lucene.File(index_file))
    else:
        assert False

    try:
        searcher = lucene.IndexSearcher(store)
    except lucene.JavaError, e:
        # FIXME: show a useful error
        raise e

    if lucene_is_2x:
        parser = lucene.QueryParser('text', lucene.StandardAnalyzer())
        result_ids = [int(lucene.Hit.cast_(hit).getDocument().get('id')) \
                      for hit in searcher.search(parser.parse(query))]
    elif lucene_is_3x:
        parser = lucene.QueryParser(
            lucene.Version.LUCENE_CURRENT, 'text',
            lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT))
        result_ids = [searcher.doc(hit.doc).get('id') \
                      for hit in searcher.search(parser.parse(query), 100).scoreDocs]

    searcher.close()

    results = ReviewRequest.objects.filter(id__in=result_ids,
                                           local_site__name=local_site_name)

    return object_list(request=request,
Example No. 14
    def handle_noargs(self, **options):
        siteconfig = SiteConfiguration.objects.get_current()

        # Refuse to do anything if they haven't turned on search.
        if not siteconfig.get("search_enable"):
            sys.stderr.write('Search is currently disabled. It must be '
                             'enabled in the Review Board administration '
                             'settings to run this command.\n')
            sys.exit(1)

        if not have_lucene:
            sys.stderr.write('PyLucene is required to build the search index.\n')
            sys.exit(1)

        incremental = options.get('incremental', True)

        store_dir = siteconfig.get("search_index_file")
        if not os.path.exists(store_dir):
            os.mkdir(store_dir)
        timestamp_file = os.path.join(store_dir, 'timestamp')

        timestamp = 0
        if incremental:
            try:
                f = open(timestamp_file, 'r')
                timestamp = datetime.utcfromtimestamp(int(f.read()))
                f.close()
            except IOError:
                incremental = False

        f = open(timestamp_file, 'w')
        f.write('%d' % time.time())
        f.close()

        if lucene_is_2x:
            store = lucene.FSDirectory.getDirectory(store_dir, False)
            writer = lucene.IndexWriter(store, False,
                                        lucene.StandardAnalyzer(),
                                        not incremental)
        elif lucene_is_3x:
            store = lucene.FSDirectory.open(lucene.File(store_dir))
            writer = lucene.IndexWriter(store,
                lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT),
                not incremental,
                lucene.IndexWriter.MaxFieldLength.LIMITED)
        else:
            assert False

        status = Q(status='P') | Q(status='S')
        objects = ReviewRequest.objects.filter(status)
        if incremental:
            query = Q(last_updated__gt=timestamp)
            # FIXME: re-index based on reviews once reviews are indexed.  I
            # tried ORing this in, but it doesn't seem to work.
            #        Q(review__timestamp__gt=timestamp)
            objects = objects.filter(query)
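            # A hedged sketch of the OR the FIXME describes; OR-filtering
            # across a reverse relation typically also needs .distinct() to
            # avoid duplicate rows, which may be why it "didn't seem to work":
            #   query = Q(last_updated__gt=timestamp) | \
            #           Q(review__timestamp__gt=timestamp)
            #   objects = objects.filter(query).distinct()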

        if sys.stdout.isatty():
            print 'Creating Review Request Index'
        totalobjs = objects.count()
        i = 0
        prev_pct = -1

        for request in objects:
            try:
                # Remove the old documents from the index
                if incremental:
                    writer.deleteDocuments(lucene.Term('id', str(request.id)))

                self.index_review_request(writer, request)

                if sys.stdout.isatty():
                    i += 1
                    pct = (i * 100 / totalobjs)
                    if pct != prev_pct:
                        sys.stdout.write("  [%s%%]\r" % pct)
                        sys.stdout.flush()
                        prev_pct = pct

            except Exception, e:
                sys.stderr.write('Error indexing ReviewRequest #%d: %s\n' % \
                                 (request.id, e))
Example No. 15
 def __init__(self, dir_file_path):
     lucene.initVM()
     self.directory = lucene.SimpleFSDirectory(lucene.File(dir_file_path))
     self.analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_30)
     self.search = lucene.IndexSearcher(self.directory)
Example No. 16
def index_ontology_files(oboFile, outDir, xref_map):
    """
    Iterates over our list of ontology files and creates an index for each file.
    """
    lucene.initVM()
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

    # Handle a little bit of lucene setup
    filename, _ext = os.path.splitext(os.path.basename(oboFile))

    indexDir = os.path.join(outDir, filename)
    if os.path.exists(indexDir):
        raise ExistingIndexDirectoryException(
            'Error, attempted to index same file twice or index two files named the same'
        )

    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = lucene.IndexWriter(dir, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength(512))

    for term in oboparser.parse(oboFile, ['is_a']):
        if term.obsolete:
            continue

        doc = lucene.Document()
        add_field_to_document(doc, "term id", term.id, lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED)
        add_field_to_document(doc, "name", term.name, lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 4.0)

        # The definition text frequently contains URLs or other hyperlinks
        # that could produce query hits we do not want, errantly increasing
        # the score of the field. We strip out these hyperlinks and index
        # just the text.
        add_field_to_document(doc, "definition",
                              strip_urls_from_text(term.definition),
                              lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 0.4)

        # Synonyms, relationships, xrefs, subsets, and alternate ID's are all represented as lists
        # in our Ontology object and need to be entered in one at a time
        add_fields_to_document(doc, "synonym",
                               [x[0] for x in term.synonyms if x],
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED, 0.7)

        add_fields_to_document(doc, "alt_id", term.alternateIds,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        add_fields_to_document(
            doc, "xref",
            [replace_xref_identifier(x, xref_map) for x in term.xrefs],
            lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)
        add_fields_to_document(
            doc, "relationship",
            [" ".join(list(x)) for x in list(term.relationships)],
            lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED)
        add_fields_to_document(doc, "subset", term.subsets,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
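#strip_urls_from_text and the add_field(s)_to_document helpers are referenced
#above but not defined in this snippet. Hedged sketches (the URL regex and the
#default-boost handling are assumptions):
import re

def strip_urls_from_text(text):
    #Drop anything that looks like a hyperlink, keep the surrounding text
    if text is None:
        return ""
    return re.sub(r"https?://\S+|www\.\S+", " ", text)

def add_field_to_document(doc, name, value, store, index, boost=None):
    field = lucene.Field(name, value or "", store, index)
    if boost is not None:
        field.setBoost(boost)  #per-field boost, available in Lucene 3.x
    doc.add(field)

def add_fields_to_document(doc, name, values, store, index, boost=None):
    #List-valued attributes become repeated fields with the same name
    for value in values:
        add_field_to_document(doc, name, value, store, index, boost)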
Example No. 17
import sys
sys.path.append("..")
import util
from util.rake import Rake

print("load vm")
index_dir = '../../data/index/'

# directory holding the collocation index
location_dir = '../../data/location/'

lucene.initVM()

directory = lucene.SimpleFSDirectory(lucene.File(index_dir))
analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

directory1 = lucene.SimpleFSDirectory(lucene.File(location_dir))
analyzer1 = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

rake = Rake("../../data/SmartStoplist.txt")


def search(word):
    print("searching ")

    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()

    searcher = lucene.IndexSearcher(directory, True)
    query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT, 'eng',
Example No. 18
    def __getSurroundingMessages(self, when, searcher, username, protocol,
                                 friend_chat, timestamp, docid):
        #Determine the query text
        if when == "before":
            searchstart = self.__padTimestamp(timestamp - SECONDS_IN_5_MINUTES)
            searchend = self.__padTimestamp(timestamp)
        else:
            searchstart = self.__padTimestamp(timestamp)
            searchend = self.__padTimestamp(timestamp + SECONDS_IN_5_MINUTES)
        querytext = "timestamp:[" + searchstart + " TO " + searchend + "]"
        querytext += ' AND protocol:"' + protocol + '"'
        querytext += ' AND friend_chat:"' + friend_chat + '"'

        #Build and perform the query
        qparser = lucene.QueryParser("text", lucene.StandardAnalyzer())
        query = qparser.parse(querytext)
        sortmethod = lucene.Sort("timestamp")
        qresults = searcher.search(query, sortmethod)

        #Determine which results to include
        if when == "before":
            rangestart = 0
        else:
            if qresults.length() > 5:
                rangestart = qresults.length() - 5
            else:
                rangestart = 0
        #We can't assume the results will exclude messages outside the
        #range we are looking for when many messages in the conversation
        #share an identical timestamp. We just deal with it in the
        #for loop below.
        rangeend = qresults.length()

        #Fetch the results
        messagelist = []
        ignore = False
        for i in range(rangestart, rangeend):
            mid = int(qresults.id(i))
            if mid == docid:
                if when == "before":
                    #We ran into the reference point message, this means we
                    #don't need to capture any more and we can return
                    break
                else:
                    #We ran into the reference point message.
                    #The easiest thing to do is discard all messages we
                    #have found so far and reset the list to be returned.
                    #Also, stop ignoring if we collected 5 messages before now.
                    messagelist = []
                    ignore = False
            elif not ignore:
                mtimestamp = int(qresults.doc(i).get("timestamp"))
                mwho_sent = qresults.doc(i).get("who_sent")
                mfileoffset = int(qresults.doc(i).get("file_offset"))
                messagetext = self.__getMessageFromFile(
                    username, friend_chat, protocol, mfileoffset)
                message = LogMessage(messagetext, mtimestamp, mwho_sent)
                message.setID(mid)
                messagelist.append(message)

                #Only allow up to 5 messages
                if len(messagelist) == 5:
                    #Setting an ignore flag allows us to deal with cases
                    #like when our reference point message is the 7th
                    #message in a string that all have the same timestamp
                    #and we are trying to get 5 messages after it
                    ignore = True
        return messagelist
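    #__padTimestamp is not shown in these snippets. Because the timestamp
    #range query above relies on lexicographic term ordering, a plausible
    #sketch is zero-padding to a fixed width (the 10-digit width here is
    #an assumption):
    def __padTimestamp(self, timestamp):
        return str(int(timestamp)).zfill(10)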
Example No. 19
    def searchMessages(self, username, querytext):
        #Determine index and data paths
        index_dir = self.indexdir + username
        data_dir = self.datadir + username

        #Load the index
        if os.path.isdir(index_dir):
            luc_index = lucene.FSDirectory.getDirectory(index_dir)

            #Build and perform the query
            searcher = lucene.IndexSearcher(luc_index)
            qparser = lucene.QueryParser("text", lucene.StandardAnalyzer())
            query = qparser.parse(querytext)
            qresults = searcher.search(query)

            #Fetch the results
            conversationlist = []
            for i in range(qresults.length()):
                mid = int(qresults.id(i))
                mprotocol = qresults.doc(i).get("protocol")
                mfriend_chat = qresults.doc(i).get("friend_chat")
                mtimestamp = int(qresults.doc(i).get("timestamp"))
                mwho_sent = qresults.doc(i).get("who_sent")
                mfileoffset = int(qresults.doc(i).get("file_offset"))
                mrank = qresults.score(i)

                #First check if it exists in one of the previously matched
                #conversations
                found = False
                for j in range(len(conversationlist)):
                    for k in range(len(conversationlist[j].messages)):
                        if conversationlist[j].messages[k].getID() == mid:
                            #Match found, so just update the messages rank
                            conversationlist[j].messages[k].setRank(mrank)
                            found = True

                #If no match was found, create a new conversation
                if not found:
                    #Create a conversation for each result
                    conversation = LogConversation(mprotocol, mfriend_chat)

                    messagetext = self.__getMessageFromFile(
                        username, mfriend_chat, mprotocol, mfileoffset)
                    before_msgs = self.__getSurroundingMessages(
                        "before", searcher, username, mprotocol, mfriend_chat,
                        mtimestamp, mid)
                    for j in range(len(before_msgs)):
                        conversation.addMessage(before_msgs[j])
                    message = LogMessage(messagetext, mtimestamp, mwho_sent)
                    message.setRank(mrank)
                    message.setID(mid)
                    conversation.addMessage(message)
                    after_msgs = self.__getSurroundingMessages(
                        "after", searcher, username, mprotocol, mfriend_chat,
                        mtimestamp, mid)
                    for j in range(len(after_msgs)):
                        conversation.addMessage(after_msgs[j])

                    conversationlist.append(conversation)
            #End of fetching each result

            return conversationlist
        else:
            #Index not found
            return False
Example No. 20
						lucene.Field.Store.YES,
						lucene.Field.Index.NOT_ANALYZED))
					doc.add(lucene.Field("path", path,
						lucene.Field.Store.YES,
						lucene.Field.Index.NOT_ANALYZED))
					if len(contents) > 0:
						doc.add(lucene.Field("contents", contents,
							lucene.Field.Store.NO,
							lucene.Field.Index.ANALYZED))
					else:
						print("no content")
						print("warning: no content in %s" % filename)
					writer.addDocument(doc)
				except Exception, e:
					print("Failed in indexDocs:" + e)

if __name__ == '__main__':
	if len(sys.argv) < 2:
		print("here")
		print IndexFiles.__doc__
		sys.exit(1)
	lucene.initVM()
	print("lucene", lucene.VERSION)
	start = datetime.now()
	try:
		IndexFiles(sys.argv[1], "index", lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT))
		end = datetime.now()
		print end - start
		print("finished")
	except Exception, e:
		print "Failed: ", e
Example No. 21
 def getAnalyzer(self):
     return lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)